import {AssemblyInstructionInfo} from '../base.js';

export function getAsmOpcode(opcode: string | undefined): AssemblyInstructionInfo | undefined {
    if (!opcode) return;
    switch (opcode) {
        case "abs": return { "html": "For more information, visit abs(fp), abs(fp16), abs(int).

Floating Point Instructions: abs

\n\n\n

Absolute value.

\n

Syntax

\n
abs{.ftz}.f32  d, a;\nabs.f64        d, a;\n
\n
\n

Description

\n

Take the absolute value of a and store the result in d.

\n

Semantics

\n
d = |a|;\n
\n
\n

Notes

\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

abs.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
sm_1x

abs.f64 supports subnormal numbers.

\n

abs.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
\n

For abs.f32, NaN input yields unspecified NaN. For abs.f64, NaN input is passed\nthrough unchanged. Future implementations may comply with the IEEE 754 standard by preserving\npayload and modifying only the sign bit.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

abs.f32 supported on all target architectures.

\n

abs.f64 requires sm_13 or higher.

\n

Examples

\n
abs.ftz.f32  x,f0;\n
\n
\n
\n
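As a usage sketch (not part of the PTX manual): in CUDA C++, fabsf() normally compiles to abs.f32, and the .ftz variant can be requested explicitly through inline PTX. The helper name below is hypothetical.

// Hypothetical CUDA C++ helper: request abs.ftz.f32 explicitly via inline PTX.
__device__ float abs_ftz_f32(float a) {
    float d;
    asm("abs.ftz.f32 %0, %1;" : "=f"(d) : "f"(a));  // d = |a|, subnormals flushed to sign-preserving zero
    return d;
}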

Half Precision Floating Point Instructions: abs

\n\n\n

Absolute value

\n

Syntax

\n
abs{.ftz}.f16    d, a;\nabs{.ftz}.f16x2  d, a;\nabs.bf16         d, a;\nabs.bf16x2       d, a;\n
\n
\n

Description

\n

Take absolute value of a and store the result in d.

\n

For the .f16x2 and .bf16x2 instruction types, the input vector is formed by extracting half-word values\nfrom the source operand. The absolute values of the half-word operands are then computed in parallel to\nproduce the .f16x2 or .bf16x2 result in the destination.

\n

For .f16 instruction type, operands d and a have .f16 or .b16 type. For\n.f16x2 instruction type, operands d and a have .f16x2 or .b32 type. For\n.bf16 instruction type, operands d and a have .b16 type. For .bf16x2 instruction\ntype, operands d and a have .b32 type.

\n

Semantics

\n
if (type == f16 || type == bf16) {\n    d = |a|;\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    for (i = 0; i < 2; i++) {\n         d[i] = |fA[i]|;\n    }\n}\n
\n
\n

Notes

\n
\n
Subnormal numbers:

By default, subnormal numbers are supported.\nabs.ftz.{f16, f16x2} flushes subnormal inputs and results to sign-preserving zero.

\n
\n
\n

NaN inputs yield an unspecified NaN. Future implementations may comply with the IEEE 754\nstandard by preserving payload and modifying only the sign bit.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 6.5.

\n

abs.bf16 and abs.bf16x2 introduced in PTX ISA 7.0.

\n

Target ISA Notes

\n

Requires sm_53 or higher.

\n

abs.bf16 and abs.bf16x2 require sm_80 or higher.

\n

Examples

\n
abs.ftz.f16  x,f0;\nabs.bf16     x,b0;\nabs.bf16x2   x1,b1;\n
\n
\n
\n
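As a usage sketch (hypothetical helper name; the packed form requires sm_53 or higher), two half-precision values packed into one 32-bit register can be processed with a single abs.f16x2 from CUDA C++:

// Hypothetical helper: absolute value of two packed .f16 values held in one .b32 register.
__device__ unsigned abs_f16x2(unsigned packed) {
    unsigned d;
    asm("abs.f16x2 %0, %1;" : "=r"(d) : "r"(packed));  // both half-words computed in parallel
    return d;
}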

Integer Arithmetic Instructions: abs

\n\n\n

Absolute value.

\n

Syntax

\n
abs.type  d, a;\n\n.type = { .s16, .s32, .s64 };\n
\n
\n

Description

\n

Take the absolute value of a and store it in d.

\n

Semantics

\n
d = |a|;\n
\n
\n

Notes

\n

Only for signed integers.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
abs.s32  r0,a;\n
\n
\n
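A minimal CUDA C++ sketch (hypothetical helper name): abs() on a signed 32-bit integer normally lowers to abs.s32, and the instruction can also be requested directly via inline PTX:

// Hypothetical helper: signed 32-bit absolute value via abs.s32.
__device__ int abs_s32(int a) {
    int d;
    asm("abs.s32 %0, %1;" : "=r"(d) : "r"(a));  // d = |a|
    return d;
}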
", "tooltip": "=====Floating Point Instructions: abs\n\n\n\nAbsolute value.\n\nSyntax\n\nabs{.ftz}.f32 d, a;\n\nabs.f64 d, a;\n\nDescription\n\nTake the absolute value of a and store the result in d.\n\nSemantics\n\nd = |a|;\n\nNotes\n\nSubnormal numbers:\n\nsm_20+By default, subnormal numbers are supported.\n\nabs.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nsm_1xabs.f64 supports subnormal numbers.\n\nabs.f32 flushes subnormal inputs and results to sign-pres...\n\n=====Half Precision Floating Point Instructions: abs\n\n\n\nAbsolute value\n\nSyntax\n\nabs{.ftz}.f16 d, a;\n\nabs{.ftz}.f16x2 d, a;\n\nabs.bf16 d, a;\n\nabs.bf16x2 d, a;\n\nDescription\n\nTake absolute value of a and store the result in d.\n\nFor .f16x2 and .bf16x2 instruction type, forms input vector by extracting half word values\n\nfrom the source operand. Absolute values of half-word operands are then computed in parallel to\n\nproduce .f16x2 or .bf16x2 result in...\n\n=====Integer Arithmetic Instructions: abs\n\n\n\nAbsolute value.\n\nSyntax\n\nabs.type d, a;\n\n.type = { .s16, .s32, .s64 };\n\nDescription\n\nTake the absolute value of a and store it in d.\n\nSemantics\n\nd = |a|;\n\nNotes\n\nOnly for signed integers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nabs.s32 r0,a;\n\n... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-abs" }; case "activemask": return { "html": "For more information, visit activemask .

Parallel Synchronization and Communication Instructions: activemask

\n\n\n

Queries the active threads within a warp.

\n

Syntax

\n
activemask.b32 d;\n
\n
\n

Description

\n

activemask queries predicated-on active threads from the executing warp and sets the destination\nd to a 32-bit integer mask in which each bit position corresponds to the thread\u2019s\nlaneid.

\n

Destination d is a 32-bit destination register.

\n

An active thread will contribute 1 for its entry in the result, and an exited, inactive, or\npredicated-off thread will contribute 0 for its entry in the result.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 6.2.

\n

Target ISA Notes

\n

Requires sm_30 or higher.

\n

Examples

\n
activemask.b32  %r1;\n
\n
\n
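For reference, CUDA C++ exposes this instruction as the __activemask() intrinsic; the inline-PTX sketch below (hypothetical helper name) shows the mapping:

// __activemask() in CUDA C++ corresponds to activemask.b32.
__device__ unsigned my_activemask() {
    unsigned mask;
    asm volatile("activemask.b32 %0;" : "=r"(mask));  // bit i == 1 iff lane i is active and predicated-on
    return mask;
}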
", "tooltip": "Queries the active threads within a warp.\n\nSyntax\n\nactivemask.b32 d;\n\nDescription\n\nactivemask queries predicated-on active threads from the executing warp and sets the destination\n\nd with 32-bit integer mask where bit position in the mask corresponds to the thread\u2019s\n\nlaneid.\n\nDestination d is a 32-bit destination register.\n\nAn active thread will contribute 1 for its entry in the result and exited or inactive or\n\npredicated-off thread will contribute 0 for its entry in the result.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.2.\n\nTarget ISA Notes\n\nRequires sm_30 or higher.\n\nExamples\n\nactivemask.b32 %r1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask" }; case "add": return { "html": "For more information, visit add(fp) , add(fp16) , add(int) , add.cc .

Floating Point Instructions: add

\n\n\n

Add two values.

\n

Syntax

\n
add{.rnd}{.ftz}{.sat}.f32  d, a, b;\nadd{.rnd}.f64              d, a, b;\n\n.rnd = { .rn, .rz, .rm, .rp };\n
\n
\n

Description

\n

Performs addition and writes the resulting value into a destination register.

\n

Semantics

\n
d = a + b;\n
\n
\n

Notes

\n

Rounding modifiers:

\n
\n
.rn

mantissa LSB rounds to nearest even

\n
\n
.rz

mantissa LSB rounds towards zero

\n
\n
.rm

mantissa LSB rounds towards negative infinity

\n
\n
.rp

mantissa LSB rounds towards positive infinity

\n
\n
\n

The default value of rounding modifier is .rn. Note that an add instruction with an explicit\nrounding modifier is treated conservatively by the code optimizer. An add instruction with no\nrounding modifier defaults to round-to-nearest-even and may be optimized aggressively by the code\noptimizer. In particular, mul/add sequences with no rounding modifiers may be optimized to\nuse fused-multiply-add instructions on the target device.

\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

add.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
sm_1x

add.f64 supports subnormal numbers.

\n

add.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
\n

Saturation modifier:

\n

add.sat.f32 clamps the result to [0.0, 1.0]. NaN results are flushed to +0.0f.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

add.f32 supported on all target architectures.

\n

add.f64 requires sm_13 or higher.

\n

Rounding modifiers have the following target requirements:

\n
\n
.rn, .rz

available for all targets

\n
\n
.rm, .rp

for add.f64, requires sm_13 or higher.

\n

for add.f32, requires sm_20 or higher.

\n
\n
\n

Examples

\n
@p  add.rz.ftz.f32  f1,f2,f3;\n
\n
\n
\n
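As a sketch of the optimization note above (hypothetical helper name): an explicit add.rn.f32 in inline PTX, like the CUDA C++ intrinsic __fadd_rn(a, b), yields an add that the compiler will not contract into a fused multiply-add:

// Hypothetical helper: explicitly rounded add that will not be fused into FMA.
__device__ float add_rn_f32(float a, float b) {
    float d;
    asm("add.rn.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
    return d;  // same effect as the CUDA intrinsic __fadd_rn(a, b)
}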

Half Precision Floating Point Instructions: add

\n\n\n

Add two values.

\n

Syntax

\n
add{.rnd}{.ftz}{.sat}.f16   d, a, b;\nadd{.rnd}{.ftz}{.sat}.f16x2 d, a, b;\n\nadd{.rnd}.bf16   d, a, b;\nadd{.rnd}.bf16x2 d, a, b;\n\n.rnd = { .rn };\n
\n
\n

Description

\n

Performs addition and writes the resulting value into a destination register.

\n

For the .f16x2 and .bf16x2 instruction types, input vectors are formed from the half-word values of the source\noperands. The half-word operands are then added in parallel to produce the .f16x2 or .bf16x2 result\nin the destination.

\n

For .f16 instruction type, operands d, a and b have .f16 or .b16 type. For\n.f16x2 instruction type, operands d, a and b have .b32 type. For .bf16\ninstruction type, operands d, a, b have .b16 type. For .bf16x2 instruction type,\noperands d, a, b have .b32 type.

\n

Semantics

\n
if (type == f16 || type == bf16) {\n    d = a + b;\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    for (i = 0; i < 2; i++) {\n         d[i] = fA[i] + fB[i];\n    }\n}\n
\n
\n

Notes

\n

Rounding modifiers:

\n
\n
.rn

mantissa LSB rounds to nearest even

\n
\n
\n

The default value of rounding modifier is .rn. Note that an add instruction with an explicit\nrounding modifier is treated conservatively by the code optimizer. An add instruction with no\nrounding modifier defaults to round-to-nearest-even and may be optimized aggressively by the code\noptimizer. In particular, mul/add sequences with no rounding modifiers may be optimized to\nuse fused-multiply-add instructions on the target device.

\n
\n
Subnormal numbers:

By default, subnormal numbers are supported.\nadd.ftz.{f16, f16x2} flushes subnormal inputs and results to sign-preserving zero.

\n
\n
Saturation modifier:

add.sat.{f16, f16x2} clamps the result to [0.0, 1.0]. NaN results are flushed to +0.0f.

\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 4.2.

\n

add{.rnd}.bf16 and add{.rnd}.bf16x2 introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_53 or higher.

\n

add{.rnd}.bf16 and add{.rnd}.bf16x2 require sm_90 or higher.

\n

Examples

\n
// scalar f16 additions\nadd.f16        d0, a0, b0;\nadd.rn.f16     d1, a1, b1;\nadd.bf16       bd0, ba0, bb0;\nadd.rn.bf16    bd1, ba1, bb1;\n\n// SIMD f16 addition\ncvt.rn.f16.f32 h0, f0;\ncvt.rn.f16.f32 h1, f1;\ncvt.rn.f16.f32 h2, f2;\ncvt.rn.f16.f32 h3, f3;\nmov.b32  p1, {h0, h1};   // pack two f16 to 32bit f16x2\nmov.b32  p2, {h2, h3};   // pack two f16 to 32bit f16x2\nadd.f16x2  p3, p1, p2;   // SIMD f16x2 addition\n\n// SIMD bf16 addition\ncvt.rn.bf16x2.f32 p4, f4, f5; // Convert two f32 into packed bf16x2\ncvt.rn.bf16x2.f32 p5, f6, f7; // Convert two f32 into packed bf16x2\nadd.bf16x2  p6, p4, p5;       // SIMD bf16x2 addition\n\n// SIMD fp16 addition\nld.global.b32   f0, [addr];     // load 32 bit which hold packed f16x2\nld.global.b32   f1, [addr + 4]; // load 32 bit which hold packed f16x2\nadd.f16x2       f2, f0, f1;     // SIMD f16x2 addition\n\nld.global.b32   f3, [addr + 8];  // load 32 bit which hold packed bf16x2\nld.global.b32   f4, [addr + 12]; // load 32 bit which hold packed bf16x2\nadd.bf16x2      f5, f3, f4;      // SIMD bf16x2 addition\n
\n
\n
\n
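As a usage sketch: in CUDA C++ (sm_53 or higher), the packed form is most easily reached through the __half2 type from cuda_fp16.h, whose __hadd2 intrinsic typically lowers to add.f16x2:

#include <cuda_fp16.h>

// __hadd2 on __half2 typically lowers to a single add.f16x2 (SIMD add of two half-words).
__device__ __half2 add_pair(__half2 a, __half2 b) {
    return __hadd2(a, b);
}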

Integer Arithmetic Instructions: add

\n\n\n

Add two values.

\n

Syntax

\n
add.type       d, a, b;\nadd{.sat}.s32  d, a, b;     // .sat applies only to .s32\n\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64,\n          .u16x2, .s16x2 };\n
\n
\n

Description

\n

Performs addition and writes the resulting value into a destination register.

\n

For the .u16x2 and .s16x2 instruction types, input vectors are formed from the half-word values of the source\noperands. The half-word operands are then added in parallel to produce the .u16x2 or .s16x2 result in\nthe destination.

\n

Operands d, a and b have type .type. For instruction types .u16x2, .s16x2,\noperands d, a and b have type .b32.

\n

Semantics

\n
if (type == u16x2 || type == s16x2) {\n    iA[0] = a[0:15];\n    iA[1] = a[16:31];\n    iB[0] = b[0:15];\n    iB[1] = b[16:31];\n    for (i = 0; i < 2; i++) {\n         d[i] = iA[i] + iB[i];\n    }\n} else {\n    d = a + b;\n}\n
\n
\n

Notes

\n

Saturation modifier:

\n
\n
.sat

limits result to MININT..MAXINT (no overflow) for the size of the operation. Applies only to\n.s32 type.

\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

add.u16x2 and add.s16x2 introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

add.u16x2 and add.s16x2 require sm_90 or higher.

\n

Examples

\n
@p  add.u32     x,y,z;\n    add.sat.s32 c,c,1;\n    add.u16x2   u,v,w;\n
\n
\n
\n
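A minimal sketch of the .sat modifier (hypothetical helper name): a saturating signed add clamps at INT_MIN/INT_MAX instead of wrapping, which has no direct C++ operator and is most directly reached via inline PTX:

// Hypothetical helper: saturating signed 32-bit add (clamps instead of wrapping).
__device__ int add_sat_s32(int a, int b) {
    int d;
    asm("add.sat.s32 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
    return d;
}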

Extended-Precision Arithmetic Instructions: add.cc

\n\n\n

Add two values with carry-out.

\n

Syntax

\n
add.cc.type  d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n
\n
\n

Description

\n

Performs integer addition and writes the carry-out value into the condition code register.

\n

Semantics

\n
d = a + b;\n
\n
\n

carry-out written to CC.CF

\n

Notes

\n

No integer rounding modifiers.

\n

No saturation.

\n

Behavior is the same for unsigned and signed integers.

\n

PTX ISA Notes

\n

32-bit add.cc introduced in PTX ISA version 1.2.

\n

64-bit add.cc introduced in PTX ISA version 4.3.

\n

Target ISA Notes

\n

32-bit add.cc is supported on all target architectures.

\n

64-bit add.cc requires sm_20 or higher.

\n

Examples

\n
@p  add.cc.u32   x1,y1,z1;   // extended-precision addition of\n@p  addc.cc.u32  x2,y2,z2;   // two 128-bit values\n@p  addc.cc.u32  x3,y3,z3;\n@p  addc.u32     x4,y4,z4;\n
\n
\n
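As a sketch of carry chaining (hypothetical helper name): a 64-bit add can be built from two 32-bit limbs, where add.cc produces the carry that a following addc consumes. Both instructions must sit in the same asm statement so the condition code is not clobbered between them:

// Hypothetical helper: 64-bit add from 32-bit halves using add.cc + addc.
__device__ unsigned long long add_u64_via_u32(unsigned alo, unsigned ahi,
                                              unsigned blo, unsigned bhi) {
    unsigned lo, hi;
    asm("add.cc.u32 %0, %2, %4;\n\t"   // lo = alo + blo, carry-out written to CC.CF
        "addc.u32   %1, %3, %5;"       // hi = ahi + bhi + CC.CF
        : "=r"(lo), "=r"(hi)
        : "r"(alo), "r"(ahi), "r"(blo), "r"(bhi));
    return ((unsigned long long)hi << 32) | lo;
}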
", "tooltip": "=====Floating Point Instructions: add\n\n\n\nAdd two values.\n\nSyntax\n\nadd{.rnd}{.ftz}{.sat}.f32 d, a, b;\n\nadd{.rnd}.f64 d, a, b;\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nPerforms addition and writes the resulting value into a destination register.\n\nSemantics\n\nd = a + b;\n\nNotes\n\nRounding modifiers:\n\n.rnmantissa LSB rounds to nearest even\n\n.rzmantissa LSB rounds towards zero\n\n.rmmantissa LSB rounds towards negative infinity\n\n.rpmantissa L...\n\n=====Half Precision Floating Point Instructions: add\n\n\n\nAdd two values.\n\nSyntax\n\nadd{.rnd}{.ftz}{.sat}.f16 d, a, b;\n\nadd{.rnd}{.ftz}{.sat}.f16x2 d, a, b;\n\nadd{.rnd}.bf16 d, a, b;\n\nadd{.rnd}.bf16x2 d, a, b;\n\n.rnd = { .rn };\n\nDescription\n\nPerforms addition and writes the resulting value into a destination register.\n\nFor .f16x2 and .bf16x2 instruction type, forms input vectors by half word values from source\n\noperands. Half-word operands are then added in paral...\n\n=====Integer Arithmetic Instructions: add\n\n\n\nAdd two values.\n\nSyntax\n\nadd.type d, a, b;\n\nadd{.sat}.s32 d, a, b; // .sat applies only to .s32\n\n.type = { .u16, .u32, .u64,\n\n .s16, .s32, .s64,\n\n .u16x2, .s16x2 };\n\nDescription\n\nPerforms addition and writes the resulting value into a destination register.\n\nFor .u16x2, .s16x2 instruction types, forms input vectors by half word values from source\n\noperands. Half-word operands are...\n\n=====Extended-Precision Arithmetic Instructions: add.cc\n\n\n\nAdd two values with carry-out.\n\nSyntax\n\nadd.cc.type d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n\nDescription\n\nPerforms integer addition and writes the carry-out value into the condition code register.\n\nSemantics\n\nd = a + b;\n\ncarry-out written to CC.CF\n\nNotes\n\nNo integer rounding modifiers.\n\nNo saturation.\n\nBehavior is the same for unsigned and signed integers.\n\nPTX ISA Notes\n\n32-bit add.cc introduced in PTX ... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add" }; case "addc": return { "html": "For more information, visit addc .

Extended-Precision Arithmetic Instructions: addc

\n\n\n

Add two values with carry-in and optional carry-out.

\n

Syntax

\n
addc{.cc}.type  d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n
\n
\n

Description

\n

Performs integer addition with carry-in and optionally writes the carry-out value into the condition\ncode register.

\n

Semantics

\n
d = a + b + CC.CF;\n
\n
\n

if .cc specified, carry-out written to CC.CF

\n

Notes

\n

No integer rounding modifiers.

\n

No saturation.

\n

Behavior is the same for unsigned and signed integers.

\n

PTX ISA Notes

\n

32-bit addc introduced in PTX ISA version 1.2.

\n

64-bit addc introduced in PTX ISA version 4.3.

\n

Target ISA Notes

\n

32-bit addc is supported on all target architectures.

\n

64-bit addc requires sm_20 or higher.

\n

Examples

\n
@p  add.cc.u32   x1,y1,z1;   // extended-precision addition of\n@p  addc.cc.u32  x2,y2,z2;   // two 128-bit values\n@p  addc.cc.u32  x3,y3,z3;\n@p  addc.u32     x4,y4,z4;\n
\n
\n
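The 128-bit chain from the example above can be driven from CUDA C++ as well; this sketch (hypothetical helper name) keeps the whole add.cc/addc.cc/addc sequence in one asm statement so the carry flag survives between limbs:

// Hypothetical helper: 128-bit add over four 32-bit limbs (least-significant first).
__device__ void add_u128(const unsigned a[4], const unsigned b[4], unsigned d[4]) {
    asm("add.cc.u32  %0, %4, %8;\n\t"
        "addc.cc.u32 %1, %5, %9;\n\t"
        "addc.cc.u32 %2, %6, %10;\n\t"
        "addc.u32    %3, %7, %11;"
        : "=r"(d[0]), "=r"(d[1]), "=r"(d[2]), "=r"(d[3])
        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]),
          "r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]));
}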
", "tooltip": "Add two values with carry-in and optional carry-out.\n\nSyntax\n\naddc{.cc}.type d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n\nDescription\n\nPerforms integer addition with carry-in and optionally writes the carry-out value into the condition\n\ncode register.\n\nSemantics\n\nd = a + b + CC.CF;\n\nif .cc specified, carry-out written to CC.CF\n\nNotes\n\nNo integer rounding modifiers.\n\nNo saturation.\n\nBehavior is the same for unsigned and signed integers.\n\nPTX ISA Notes\n\n32-bit addc introduced in PTX ISA version 1.2.\n\n64-bit addc introduced in PTX ISA version 4.3.\n\nTarget ISA Notes\n\n32-bit addc is supported on all target architectures.\n\n64-bit addc requires sm_20 or higher.\n\nExamples\n\n@p add.cc.u32 x1,y1,z1; // extended-precision addition of\n\n@p addc.cc.u32 x2,y2,z2; // two 128-bit values\n\n@p addc.cc.u32 x3,y3,z3;\n\n@p addc.u32 x4,y4,z4;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-addc" }; case "address_size": return { "html": "For more information, visit address_size .

PTX Module Directives: .address_size

\n\n\n

Address size used throughout PTX module.

\n

Syntax

\n
.address_size  address-size\naddress-size = { 32, 64 };\n
\n
\n

Description

\n

Specifies the address size assumed throughout the module by the PTX code and the binary DWARF\ninformation in PTX.

\n

Redefinition of this directive within a module is not allowed. In the presence of separate\ncompilation all modules must specify (or default to) the same address size.

\n

The .address_size directive is optional, but it must immediately follow the .target directive if present within a module.

\n

Semantics

\n

If the .address_size directive is omitted, the address size defaults to 32.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.3.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
// example directives\n   .address_size 32       // addresses are 32 bit\n   .address_size 64       // addresses are 64 bit\n\n// example of directive placement within a module\n   .version 2.3\n   .target sm_20\n   .address_size 64\n...\n.entry foo () {\n...\n}\n
\n
\n
", "tooltip": "Address size used throughout PTX module.\n\nSyntax\n\n.address_size address-size\n\naddress-size = { 32, 64 };\n\nDescription\n\nSpecifies the address size assumed throughout the module by the PTX code and the binary DWARF\n\ninformation in PTX.\n\nRedefinition of this directive within a module is not allowed. In the presence of separate\n\ncompilation all modules must specify (or default to) the same address size.\n\nThe .address_size directive is optional, but it must immediately follow the .targetdirective if present within a module.\n\nSemantics\n\nIf the .address_size directive is omitted, the address size defaults to 32.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n// example directives\n\n .address_size 32 // addresses are 32 bit\n\n .address_size 64 // addresses are 64 bit\n\n// example of directive placement within a module\n\n .version 2.3\n\n .target sm_20\n\n .address_size 64\n\n...\n\n.entry foo () {\n\n...\n\n}\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#ptx-module-directives-address-size" }; case "aggr_smem_size": return { "html": "For more information, visit aggr_smem_size .

Special Registers: %aggr_smem_size

\n\n\n

Total size of shared memory used by a CTA of a kernel.

\n

Syntax (predefined)

\n
.sreg .u32 %aggr_smem_size;\n
\n
\n

Description

\n

A predefined, read-only special register initialized with the total aggregated size of shared memory,\nconsisting of the size of user shared memory allocated (statically and dynamically) at launch time\nand the size of the shared memory region reserved for use by NVIDIA system software.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 8.1.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
mov.u32  %r, %aggr_smem_size;\n
\n
\n
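A minimal read sketch (hypothetical helper name; requires sm_90 and PTX ISA 8.1): special registers are read with mov, and in inline PTX the % of the register name must be escaped as %%:

// Hypothetical helper: read the aggregated shared memory size for this CTA.
__device__ unsigned aggr_smem_size() {
    unsigned r;
    asm("mov.u32 %0, %%aggr_smem_size;" : "=r"(r));
    return r;
}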
", "tooltip": "Total size of shared memory used by a CTA of a kernel.\n\nSyntax (predefined)\n\n.sreg .u32 %aggr_smem_size;\n\nDescription\n\nA predefined, read-only special register initialized with total aggregated size of shared memory\n\nconsisting of the size of user shared memory allocated (statically and dynamically) at launch time\n\nand the size of shared memory region which is reserved for the NVIDIA system software use.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 8.1.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\nmov.u32 %r, %aggr_smem_size;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-aggr-smem-size" }; case "alias": return { "html": "For more information, visit alias .

Kernel and Function Directives: .alias

\n\n\n

Define an alias to existing function symbol.

\n

Syntax

\n
.alias fAlias, fAliasee;\n
\n
\n

Description

\n

.alias is a module scope directive that defines identifier fAlias to be an alias to function\nspecified by fAliasee.

\n

Both fAlias and fAliasee are non-entry function symbols.

\n

Identifier fAlias is a function declaration without body.

\n

Identifier fAliasee is a function symbol which must be defined in the same module as .alias\ndeclaration. Function fAliasee cannot have .weak linkage.

\n

Prototype of fAlias and fAliasee must match.

\n

A program can use either the fAlias or the fAliasee identifier to reference the function defined as\nfAliasee.

\n

PTX ISA Notes

\n

.alias directive introduced in PTX ISA 6.3.

\n

Target ISA Notes

\n

.alias directive requires sm_30 or higher.

\n

Examples

\n
.visible .func foo(.param .u32 p) {\n   ...\n}\n.visible .func bar(.param .u32 p);\n.alias bar, foo;\n.entry test()\n{\n      .param .u32 p;\n      ...\n      call foo, (p);       // call foo directly\n       ...\n       .param .u32 p;\n       call bar, (p);        // call foo through alias\n}\n.entry filter ( .param .b32 x, .param .b32 y, .param .b32 z )\n{\n    .reg .b32 %r1, %r2, %r3;\n    ld.param.b32  %r1, [x];\n    ld.param.b32  %r2, [y];\n    ld.param.b32  %r3, [z];\n    ...\n}\n
\n
\n
", "tooltip": "Define an alias to existing function symbol.\n\nSyntax\n\n.alias fAlias, fAliasee;\n\nDescription\n\n.alias is a module scope directive that defines identifier fAlias to be an alias to function\n\nspecified by fAliasee.\n\nBoth fAlias and fAliasee are non-entry function symbols.\n\nIdentifier fAlias is a function declaration without body.\n\nIdentifier fAliasee is a function symbol which must be defined in the same module as .alias\n\ndeclaration. Function fAliasee cannot have .weak linkage.\n\nPrototype of fAlias and fAliasee must match.\n\nProgram can use either fAlias or fAlisee identifiers to reference function defined with\n\nfAliasee.\n\nPTX ISA Notes\n\n.alias directive introduced in PTX ISA 6.3.\n\nTarget ISA Notes\n\n.alias directive requires sm_30 or higher.\n\nExamples\n\n.visible .func foo(.param .u32 p) {\n\n ...\n\n}\n\n.visible .func bar(.param .u32 p);\n\n.alias bar, foo;\n\n.entry test()\n\n{\n\n .param .u32 p;\n\n ...\n\n call foo, (p); // call foo directly\n\n ...\n\n .param .u32 p;\n\n call bar, (p); // call foo through alias\n\n}\n\n.entry filter ( .param .b32 x, .param .b32 y, .param .b32 z )\n\n{\n\n .reg .b32 %r1, %r2, %r3;\n\n ld.param.b32 %r1, [x];\n\n ld.param.b32 %r2, [y];\n\n ld.param.b32 %r3, [z];\n\n ...\n\n}\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#kernel-and-function-directives-alias" }; case "alloca": return { "html": "For more information, visit alloca .

Stack Manipulation Instructions: alloca

\n\n\n

Dynamically allocate memory on stack.

\n

Syntax

\n
alloca.type  ptr, size{, immAlign};\n\n.type = { .u32, .u64 };\n
\n
\n

Description

\n

The alloca instruction dynamically allocates memory on the stack frame of the current function\nand updates the stack pointer accordingly. The returned pointer ptr points to local memory and\ncan be used in the address operand of ld.local and st.local instructions.

\n

If sufficient memory is unavailable for allocation on the stack, then execution of alloca may\nresult in stack overflow. In such cases, attempting to access the allocated memory with ptr will\nresult in undefined program behavior.

\n

The memory allocated by alloca is deallocated in the following ways:

It is automatically deallocated when the function exits.

\n

It can be explicitly deallocated using the stacksave and stackrestore instructions: stacksave can be used\nto save the value of the stack pointer before executing alloca, and stackrestore can be used after\nalloca to restore the stack pointer to the original value which was previously saved with stacksave.\nNote that accessing deallocated memory after executing stackrestore results in undefined behavior.

\n\n

size is an unsigned value which specifies the amount of memory in number of bytes to be\nallocated on stack. size = 0 may not lead to a valid memory allocation.

\n

Both ptr and size have the same type as the instruction type.

\n

immAlign is a 32-bit value which specifies the alignment requirement, in bytes, for the\nmemory allocated by alloca. It must be an integer constant that is a power of 2 and does not exceed\n2^23. immAlign is an optional argument whose default value is 8, which is the minimum\nguaranteed alignment.

\n

Semantics

\n
alloca.type ptr, size, immAlign:\n\na = max(immAlign, frame_align); // frame_align is the minimum guaranteed alignment\n\n// Allocate size bytes of stack memory with alignment a and update the stack pointer.\n// Since the stack grows down, the updated stack pointer contains a lower address.\nstackptr = alloc_stack_mem(size, a);\n\n// Return the new value of stack pointer as ptr. Since ptr is the lowest address of the memory\n// allocated by alloca, the memory can be accessed using ptr up to (ptr + size of allocated memory).\nstacksave ptr;\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.3.

\n
\n
Preview Feature:

alloca is a preview feature in PTX ISA version 7.3. All details are subject to change with no\nguarantees of backward compatibility on future PTX ISA versions or SM architectures.

\n
\n
\n

Target ISA Notes

\n

alloca requires sm_52 or higher.

\n

Examples

\n
.reg .u32 ra, stackptr, ptr, size;\n\nstacksave.u32 stackptr;     // Save the current stack pointer\nalloca ptr, size, 8;        // Allocate stack memory\nst.local.u32 [ptr], ra;     // Use the allocated stack memory\nstackrestore.u32 stackptr;  // Deallocate memory by restoring the stack pointer\n
\n
\n
", "tooltip": "Dynamically allocate memory on stack.\n\nSyntax\n\nalloca.type ptr, size{, immAlign};\n\n.type = { .u32, .u64 };\n\nDescription\n\nThe alloca instruction dynamically allocates memory on the stack frame of the current function\n\nand updates the stack pointer accordingly. The returned pointer ptr points to local memory and\n\ncan be used in the address operand of ld.local and st.local instructions.\n\nIf sufficient memory is unavailable for allocation on the stack, then execution of alloca may\n\nresult in stack overflow. In such cases, attempting to access the allocated memory with ptr will\n\nresult in undefined program behavior.\n\nThe memory allocated by alloca is deallocated in the following ways:\n\nIt is automatically deallocated when the function exits.\n\nIt can be explicitly deallocated using stacksave and stackrestore instructions:\n\nstacksave can be used to save the value of stack pointer before executing alloca, and\n\nstackrestore can be used after alloca to restore stack pointer to the original value which\n\nwas previously saved with stacksave. Note that accessing deallocated memory after executing\n\nstackrestore results in undefined behavior.\n\nsize is an unsigned value which specifies the amount of memory in number of bytes to be\n\nallocated on stack. size = 0 may not lead to a valid memory allocation.\n\nBoth ptr and size have the same type as the instruction type.\n\nimmAlign is a 32-bit value which specifies the alignment requirement in number of bytes for the\n\nmemory allocated by alloca. It is an integer constant, must be a power of 2 and must not exceed\n\n2^23. immAlign is an optional argument with default value being 8 which is the minimum\n\nguaranteed alignment.\n\nSemantics\n\nalloca.type ptr, size, immAlign:\n\na = max(immAlign, frame_align); // frame_align is the minimum guaranteed alignment\n\n// Allocate size bytes of stack memory with alignment a and update the stack pointer.\n\n// Since the stack grows down, the updated stack pointer contains a lower address.\n\nstackptr = alloc_stack_mem(size, a);\n\n// Return the new value of stack pointer as ptr. Since ptr is the lowest address of the memory\n\n// allocated by alloca, the memory can be accessed using ptr up to (ptr + size of allocated memory).\n\nstacksave ptr;\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.3.\n\nPreview Feature:alloca is a preview feature in PTX ISA version 7.3. All details are subject to change with no\n\nguarantees of backward compatibility on future PTX ISA versions or SM architectures.\n\nTarget ISA Notes\n\nalloca requires sm_52 or higher.\n\nExamples\n\n.reg .u32 ra, stackptr, ptr, size;\n\nstacksave.u32 stackptr; // Save the current stack pointer\n\nalloca ptr, size, 8; // Allocate stack memory\n\nst.local.u32 [ptr], ra; // Use the allocated stack memory\n\nstackrestore.u32 stackptr; // Deallocate memory by restoring the stack pointer\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca" }; case "and": return { "html": "For more information, visit and .

Logic and Shift Instructions: and

\n\n\n

Bitwise AND.

\n

Syntax

\n
and.type d, a, b;\n\n.type = { .pred, .b16, .b32, .b64 };\n
\n
\n

Description

\n

Compute the bit-wise and operation for the bits in a and b.

\n

Semantics

\n
d = a & b;\n
\n
\n

Notes

\n

The size of the operands must match, but not necessarily the type.

\n

Allowed types include predicate registers.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
and.b32  x,q,r;\nand.b32  sign,fpvalue,0x80000000;\n
\n
\n
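The sign-extraction example above looks like this from CUDA C++ (a sketch; the helper name is hypothetical): reinterpreting the float's bits and masking compiles to an and.b32:

// Hypothetical helper: extract the IEEE 754 sign bit of a float.
__device__ unsigned sign_bit(float f) {
    unsigned bits = __float_as_uint(f);  // reinterpret bits, no numeric conversion
    return bits & 0x80000000u;           // compiles to and.b32
}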
", "tooltip": "Bitwise AND.\n\nSyntax\n\nand.type d, a, b;\n\n.type = { .pred, .b16, .b32, .b64 };\n\nDescription\n\nCompute the bit-wise and operation for the bits in a and b.\n\nSemantics\n\nd = a & b;\n\nNotes\n\nThe size of the operands must match, but not necessarily the type.\n\nAllowed types include predicate registers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nand.b32 x,q,r;\n\nand.b32 sign,fpvalue,0x80000000;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and" }; case "applypriority": return { "html": "For more information, visit applypriority .

Data Movement and Conversion Instructions: applypriority

\n\n\n

Apply the cache eviction priority to the specified address in the specified cache level.

\n

Syntax

\n
applypriority{.global}.level::eviction_priority  [a], size;\n\n.level::eviction_priority = { .L2::evict_normal };\n
\n
\n

Description

\n

The applypriority instruction applies the cache eviction priority specified by the\n.level::eviction_priority qualifier to the address range [a..a+size) in the specified cache\nlevel.

\n

If no state space is specified then Generic Addressing is\nused. If the specified address does not fall within the address window of .global state space\nthen the behavior is undefined.

\n

The operand size is an integer constant that specifies the amount of data, in bytes, in the\nspecified cache level on which the priority is to be applied. The only supported value for the\nsize operand is 128.

\n

Supported addressing modes for operand a are described in Addresses as Operands. a must be aligned to 128 bytes.

\n

If the data pointed to by address a is not already present in the specified cache level, then\nthe data will be prefetched before applying the specified priority.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.4.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Examples

\n
applypriority.global.L2::evict_normal [ptr], 128;\n
\n
\n
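A usage sketch (hypothetical helper name; requires sm_80): the pointer must be 128-byte aligned and, with .global, must point into global memory; the "l" constraint passes a 64-bit address:

// Hypothetical helper: apply L2 evict_normal priority to one 128-byte line.
__device__ void apply_evict_normal(const void* p) {  // p: 128-byte-aligned global pointer
    asm volatile("applypriority.global.L2::evict_normal [%0], 128;" :: "l"(p));
}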
", "tooltip": "Apply the cache eviction priority to the specified address in the specified cache level.\n\nSyntax\n\nappplypriority{.global}.level::eviction_priority [a], size;\n\n.level::eviction_priority = { .L2::evict_normal };\n\nDescription\n\nThe applypriority instruction applies the cache eviction priority specified by the\n\n.level::eviction_priority qualifier to the address range [a..a+size) in the specified cache\n\nlevel.\n\nIf no state space is specified then Generic Addressing is\n\nused. If the specified address does not fall within the address window of .global state space\n\nthen the behavior is undefined.\n\nThe operand size is an integer constant that specifies the amount of data, in bytes, in the\n\nspecified cache level on which the priority is to be applied. The only supported value for the\n\nsize operand is 128.\n\nSupported addressing modes for operand a are described in Addresses as Operands. a must be aligned to 128 bytes.\n\nIf the data pointed to by address a is not already present in the specified cache level, then\n\nthe data will be prefetched before applying the specified priority.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.4.\n\nTarget ISA Notes\n\nRequires sm_80 or higher.\n\nExamples\n\napplypriority.global.L2::evict_normal [ptr], 128;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority" }; case "atom": return { "html": "For more information, visit atom .

Parallel Synchronization and Communication Instructions: atom

\n\n\n

Atomic reduction operations for thread-to-thread communication.

\n

Syntax

\n

Atomic operation with scalar type:

\n
atom{.sem}{.scope}{.space}.op{.level::cache_hint}.type d, [a], b{, cache-policy};\natom{.sem}{.scope}{.space}.op.type d, [a], b, c;\n\natom{.sem}{.scope}{.space}.cas.b16 d, [a], b, c;\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16     d, [a], b{, cache-policy};\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16x2   d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.bf16    d, [a], b{, cache-policy};\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.bf16x2  d, [a], b{, cache-policy};\n\n.space =              { .global, .shared{::cta, ::cluster} };\n.sem =                { .relaxed, .acquire, .release, .acq_rel };\n.scope =              { .cta, .cluster, .gpu, .sys };\n\n.op =                 { .and, .or, .xor,\n                        .cas, .exch,\n                        .add, .inc, .dec,\n                        .min, .max };\n.level::cache_hint =  { .L2::cache_hint };\n.type =               { .b32, .b64, .u32, .u64, .s32, .s64, .f32, .f64 };\n
\n
\n

Atomic operation with vector type:

\n
atom{.sem}{.scope}{.global}.add{.level::cache_hint}.vec_32_bit.f32                  d, [a], b{, cache-policy};\natom{.sem}{.scope}{.global}.op.noftz{.level::cache_hint}.vec_16_bit.half_word_type  d, [a], b{, cache-policy};\natom{.sem}{.scope}{.global}.op.noftz{.level::cache_hint}.vec_32_bit.packed_type     d, [a], b{, cache-policy};\n\n.sem =               { .relaxed, .acquire, .release, .acq_rel };\n.scope =             { .cta, .gpu, .sys };\n.op =                { .add, .min, .max };\n.half_word_type =    { .f16, .bf16 };\n.packed_type =       { .f16x2, .bf16x2 };\n.vec_16_bit =        { .v2, .v4, .v8 }\n.vec_32_bit =        { .v2, .v4 };\n.level::cache_hint = { .L2::cache_hint }\n
\n
\n

Description

\n

Atomically loads the original value at location a into destination register d, performs a\nreduction operation with operand b and the value in location a, and stores the result of the\nspecified operation at location a, overwriting the original value. Operand a specifies a\nlocation in the specified state space. If no state space is given, perform the memory accesses using\nGeneric Addressing. atom with scalar type may be used only\nwith .global and .shared spaces and with generic addressing, where the address points to\n.global or .shared space. atom with vector type may be used only with .global space\nand with generic addressing where the address points to .global space.

\n

For atom with vector type, operands d and b are brace-enclosed vector expressions, size\nof which is equal to the size of vector qualifier.

\n

If no sub-qualifier is specified with .shared state space, then ::cta is assumed by default.

\n

The optional .sem qualifier specifies a memory synchronizing effect as described in the Memory\nConsistency Model. If the .sem qualifier is absent,\n.relaxed is assumed by default.

\n

The optional .scope qualifier specifies the set of threads that can directly observe the memory\nsynchronizing effect of this operation, as described in the Memory Consistency Model. If the .scope qualifier is absent, .gpu scope is\nassumed by default.

\n

For atom with vector type, the supported combinations of vector qualifier and types, and atomic\noperations supported on these combinations are depicted in the following table:

\n

Vector qualifier | Types: .f16/.bf16 | .f16x2/.bf16x2    | .f32
.v2              | .add, .min, .max  | .add, .min, .max  | .add
.v4              | .add, .min, .max  | .add, .min, .max  | .add
.v8              | .add, .min, .max  | Not supported     | Not supported

\n

Two atomic operations {atom or red} are performed atomically with respect to each other only\nif each operation specifies a scope that includes the other. When this condition is not met, each\noperation observes the other operation being performed as if it were split into a read followed by a\ndependent write.

\n

An atom instruction on a packed or vector type accesses adjacent scalar elements in memory. In\nsuch cases, atomicity is guaranteed separately for each of the individual scalar elements; the\nentire atom is not guaranteed to be atomic as a single access.

\n

For sm_6x and earlier architectures, atom operations on .shared state space do not\nguarantee atomicity with respect to normal store instructions to the same address. It is the\nprogrammer\u2019s responsibility to guarantee correctness of programs that use shared memory atomic\ninstructions, e.g., by inserting barriers between normal stores and atomic operations to a common\naddress, or by using atom.exch to store to locations accessed by other atomic operations.

\n

Supported addressing modes for operand a and alignment requirements are described in Addresses\nas Operands

\n

The bit-size operations are .and, .or, .xor, .cas (compare-and-swap), and .exch\n(exchange).

\n

The integer operations are .add, .inc, .dec, .min, .max. The .inc and\n.dec operations return a result in the range [0..b].

\n

The floating-point .add operation rounds to nearest even. The current implementation of\natom.add.f32 on global memory flushes subnormal inputs and results to sign-preserving zero,\nwhereas atom.add.f32 on shared memory supports subnormal inputs and results and doesn\u2019t flush\nthem to zero.

\n

The atom.add.f16, atom.add.f16x2, atom.add.bf16 and atom.add.bf16x2 operations require\nthe .noftz qualifier; they preserve subnormal inputs and results, and do not flush them to\nzero.

\n

When the optional argument cache-policy is specified, the qualifier .level::cache_hint is\nrequired. The 64-bit operand cache-policy specifies the cache eviction policy that may be used\nduring the memory access.

\n

The qualifier .level::cache_hint is only supported for .global state space and for generic\naddressing where the address points to the .global state space.

\n

cache-policy is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.

\n

Semantics

\n
atomic {\n    d = *a;\n    *a = (operation == cas) ? operation(*a, b, c)\n                            : operation(*a, b);\n}\nwhere\n    inc(r, s)  = (r >= s) ? 0 : r+1;\n    dec(r, s)  = (r==0 || r > s)  ? s : r-1;\n    exch(r, s) =  s;\n    cas(r,s,t) = (r == s) ? t : r;\n
\n
\n

Notes

\n

Simple reductions may be specified by using the bit bucket destination operand _.

\n

PTX ISA Notes

\n

32-bit atom.global introduced in PTX ISA version 1.1.

\n

atom.shared and 64-bit atom.global.{add,cas,exch} introduced in PTX ISA 1.2.

\n

atom.add.f32 and 64-bit atom.shared.{add,cas,exch} introduced in PTX ISA 2.0.

\n

64-bit atom.{and,or,xor,min,max} introduced in PTX ISA 3.1.

\n

atom.add.f64 introduced in PTX ISA 5.0.

\n

.scope qualifier introduced in PTX ISA 5.0.

\n

.sem qualifier introduced in PTX ISA version 6.0.

\n

atom.add.noftz.f16x2 introduced in PTX ISA 6.2.

\n

atom.add.noftz.f16 and atom.cas.b16 introduced in PTX ISA 6.3.

\n

Per-element atomicity of atom.f16x2 clarified in PTX ISA version 6.3, with retrospective effect\nfrom PTX ISA version 6.2.

\n

Support for .level::cache_hint qualifier introduced in PTX ISA version 7.4.

\n

atom.add.noftz.bf16 and atom.add.noftz.bf16x2 introduced in PTX ISA 7.8.

\n

Support for .cluster scope qualifier introduced in PTX ISA version 7.8.

\n

Support for ::cta and ::cluster sub-qualifiers introduced in PTX ISA version 7.8.

\n

Support for vector types introduced in PTX ISA version 8.1.

\n

Target ISA Notes

\n

atom.global requires sm_11 or higher.

\n

atom.shared requires sm_12 or higher.

\n

64-bit atom.global.{add,cas,exch} require sm_12 or higher.

\n

64-bit atom.shared.{add,cas,exch} require sm_20 or higher.

\n

64-bit atom.{and,or,xor,min,max} require sm_32 or higher.

\n

atom.add.f32 requires sm_20 or higher.

\n

atom.add.f64 requires sm_60 or higher.

\n

.scope qualifier requires sm_60 or higher.

\n

.sem qualifier requires sm_70 or higher.

\n

Use of generic addressing requires sm_20 or higher.

\n

atom.add.noftz.f16x2 requires sm_60 or higher.

\n

atom.add.noftz.f16 and atom.cas.b16 require sm_70 or higher.

\n

Support for .level::cache_hint qualifier requires sm_80 or higher.

\n

atom.add.noftz.bf16 and atom.add.noftz.bf16x2 require sm_90 or higher.

\n

Support for .cluster scope qualifier requires sm_90 or higher.

\n

Sub-qualifier ::cta requires sm_30 or higher.

\n

Sub-qualifier ::cluster requires sm_90 or higher.

\n

Support for vector types requires sm_90 or higher.

\n

Examples

\n
atom.global.add.s32  d,[a],1;\natom.shared::cta.max.u32  d,[x+4],0;\n@p  atom.global.cas.b32  d,[p],my_val,my_new_val;\natom.global.sys.add.u32 d, [a], 1;\natom.global.acquire.sys.inc.u32 ans, [gbl], %r0;\natom.add.noftz.f16x2 d, [a], b;\natom.add.noftz.f16   hd, [ha], hb;\natom.global.cas.b16  hd, [ha], hb, hc;\natom.add.noftz.bf16   hd, [a], hb;\natom.add.noftz.bf16x2 bd, [b], bb;\natom.add.shared::cluster.noftz.f16   hd, [ha], hb;\n\natom.global.cluster.relaxed.add.u32 d, [a], 1;\n\ncreatepolicy.fractional.L2::evict_last.b64 cache-policy, 0.25;\natom.global.add.L2::cache_hint.s32  d, [a], 1, cache-policy;\n\natom.global.v8.f16.max.noftz  {%hd0, %hd1, %hd2, %hd3, %hd4, %hd5, %hd6, %hd7}, [gbl],\n                                              {%h0, %h1, %h2, %h3, %h4, %h5, %h6, %h7};\natom.global.v8.bf16.add.noftz  {%hd0, %hd1, %hd2, %hd3, %hd4, %hd5, %hd6, %hd7}, [gbl],\n                                              {%h0, %h1, %h2, %h3, %h4, %h5, %h6, %h7};\natom.global.v2.f16.add.noftz  {%hd0, %hd1}, [gbl], {%h0, %h1};\natom.global.v2.bf16.add.noftz  {%hd0, %hd1}, [gbl], {%h0, %h1};\natom.global.v4.b16x2.min.noftz  {%hd0, %hd1, %hd2, %hd3}, [gbl], {%h0, %h1, %h2, %h3};\natom.global.v4.f32.add  {%f0, %f1, %f2, %f3}, [gbl], {%f0, %f1, %f2, %f3};\natom.global.v2.f16x2.min.noftz  {%bd0, %bd1}, [g], {%b0, %b1};\natom.global.v2.bf16x2.max.noftz  {%bd0, %bd1}, [g], {%b0, %b1};\natom.global.v2.f32.add  {%f0, %f1}, [g], {%f0, %f1};\n
\n
\n
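For orientation (a sketch, not from the PTX manual; the kernel name is hypothetical): CUDA C++ atomic intrinsics such as atomicAdd lower to atom, or to red when the returned value is unused:

// Hypothetical kernel: per-bin counting with atomicAdd, which lowers to
// atom.global.add.u32 (or red.global.add.u32 when the result is unused).
// Assumes bins has 256 entries.
__global__ void count_bytes(unsigned* bins, const unsigned char* data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) atomicAdd(&bins[data[i]], 1u);
}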
", "tooltip": "Atomic reduction operations for thread-to-thread communication.\n\nSyntax\n\nAtomic operation with scalar type:\n\natom{.sem}{.scope}{.space}.op{.level::cache_hint}.type d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.space}.op.type d, [a], b, c;\n\natom{.sem}{.scope}{.space}.cas.b16 d, [a], b, c;\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16 d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16x2 d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.bf16 d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.bf16x2 d, [a], b{, cache-policy};\n\n.space = { .global, .shared{::cta, ::cluster} };\n\n.sem = { .relaxed, .acquire, .release, .acq_rel };\n\n.scope = { .cta, .cluster, .gpu, .sys };\n\n.op = { .and, .or, .xor,\n\n .cas, .exch,\n\n .add, .inc, .dec,\n\n .min, .max };\n\n.level::cache_hint = { .L2::cache_hint };\n\n.type = { .b32, .b64, .u32, .u64, .s32, .s64, .f32, .f64 };\n\nAtomic operation with vector type:\n\natom{.sem}{.scope}{.global}.add{.level::cache_hint}.vec_32_bit.f32 d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.global}.op.noftz{.level::cache_hint}.vec_16_bit.half_word_type d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.global}.op.noftz{.level::cache_hint}.vec_32_bit.packed_type d, [a], b{, cache-policy};\n\n.sem = { .relaxed, .acquire, .release, .acq_rel };\n\n.scope = { .cta, .gpu, .sys };\n\n.op = { .add, .min, .max };\n\n.half_word_type = { .f16, .bf16 };\n\n.packed_type = { .f16x2, .bf16x2 };\n\n.vec_16_bit = { .v2, .v4, .v8 }\n\n.vec_32_bit = { .v2, .v4 };\n\n.level::cache_hint = { .L2::cache_hint }\n\nDescription\n\nAtomically loads the original value at location a into destination register d, performs a\n\nreduction operation with operand b and the value in location a, and stores the result of the\n\nspecified operation at location a, overwriting the original value. Operand a specifies a\n\nlocation in the specified state space. If no state space is given, perform the memory accesses using\n\nGeneric Addressing. atom with scalar type may be used only\n\nwith .global and .shared spaces and with generic addressing, where the address points to\n\n.global or .shared space. atom with vector type may be used only with .global space\n\nand with generic addressing where the address points to .global space.\n\nFor atom with vector type, operands d and b are brace-enclosed vector expressions, size\n\nof which is equal to the size of vector qualifier.\n\nIf no sub-qualifier is specified with .shared state space, then ::cta is assumed by default.\n\nThe optional .sem qualifier specifies a memory synchronizing effect as described in the Memory\n\nConsistency Model. If the .sem qualifier is absent,\n\n.relaxed is assumed by default.\n\nThe optional .scope qualifier specifies the set of threads that can directly observe the memory\n\nsynchronizing effect of this operation, as described in the Memory Consistency Model. 
If the .scope qualifier is absent, .gpu scope is\n\nassumed by default.\n\nFor atom with vector type, the supported combinations of vector qualifier and types, and atomic\n\noperations supported on these combinations are depicted in the following table:\n\n\n\n\n\nVector qualifier\n\nTypes\n\n.f16/ bf16\n\n.f16x2/ bf16x2\n\n.f32\n\n.v2\n\n.add, .min, .max\n\n.add, .min, .max\n\n.add\n\n.v4\n\n.add, .min, .max\n\n.add, .min, .max\n\n.add\n\n.v8\n\n.add, .min, .max\n\nNot supported\n\nNot Supported\n\nTwo atomic operations {atom or red} are performed atomically with respect to each other only\n\nif each operation specifies a scope that includes the other. When this condition is not met, each\n\noperation observes the other operation being performed as if it were split into a read followed by a\n\ndependent write.\n\natom instruction on packed type or vector type, accesses adjacent scalar elements in memory. In\n\nsuch cases, the atomicity is guaranteed separately fo ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom" }; case "bar": return { "html": "For more information, visit bar , bar.warp.sync .

Parallel Synchronization and Communication Instructions: bar, barrier

\n\n\n

Barrier synchronization.

\n

Syntax

\n
barrier{.cta}.sync{.aligned}      a{, b};\nbarrier{.cta}.arrive{.aligned}    a, b;\n\nbarrier{.cta}.red.popc{.aligned}.u32  d, a{, b}, {!}c;\nbarrier{.cta}.red.op{.aligned}.pred   p, a{, b}, {!}c;\n\nbar{.cta}.sync      a{, b};\nbar{.cta}.arrive    a, b;\n\nbar{.cta}.red.popc.u32  d, a{, b}, {!}c;\nbar{.cta}.red.op.pred   p, a{, b}, {!}c;\n\n.op = { .and, .or };\n
\n
\n

Description

\n

Performs barrier synchronization and communication within a CTA. Each CTA instance has sixteen\nbarriers numbered 0..15.

\n

barrier{.cta} instructions can be used by the threads within the CTA for synchronization and\ncommunication.

\n

Operands a, b, and d have type .u32; operands p and c are predicates. Source\noperand a specifies a logical barrier resource as an immediate constant or register with value\n0 through 15. Operand b specifies the number of threads participating in the barrier. If\nno thread count is specified, all threads in the CTA participate in the barrier. When specifying a\nthread count, the value must be a multiple of the warp size. Note that a non-zero thread count is\nrequired for barrier{.cta}.arrive.

\n

Depending on operand b, either the specified number of threads (a multiple of the warp size) or all\nthreads in the CTA participate in the barrier{.cta} instruction. The barrier{.cta} instructions\nsignal the arrival of the executing threads at the named barrier.

\n

barrier{.cta} instruction causes executing thread to wait for all non-exited threads from its\nwarp and marks warps\u2019 arrival at barrier. In addition to signaling its arrival at the barrier, the\nbarrier{.cta}.red and barrier{.cta}.sync instructions causes executing thread to wait for\nnon-exited threads of all other warps participating in the barrier to\narrive. barrier{.cta}.arrive does not cause executing thread to wait for threads of other\nparticipating warps.

\n

When a barrier completes, the waiting threads are restarted without delay, and the barrier is\nreinitialized so that it can be immediately reused.

\n

The barrier{.cta}.sync or barrier{.cta}.red or barrier{.cta}.arrive instruction\nguarantees that when the barrier completes, prior memory accesses requested by this thread are\nperformed relative to all threads participating in the barrier. The barrier{.cta}.sync and\nbarrier{.cta}.red instruction further guarantees that no new memory access is requested by this\nthread before the barrier completes.

\n

A memory read (e.g., by ld or atom) has been performed when the value read has been\ntransmitted from memory and cannot be modified by another thread participating in the barrier. A\nmemory write (e.g., by st, red or atom) has been performed when the value written has\nbecome visible to other threads participating in the barrier, that is, when the previous value can\nno longer be read.

\n

barrier{.cta}.red performs a reduction operation across threads. The c predicate (or its\ncomplement) from all threads in the CTA are combined using the specified reduction operator. Once\nthe barrier count is reached, the final value is written to the destination register in all threads\nwaiting at the barrier.

\n

The reduction operations for barrier{.cta}.red are population-count (.popc),\nall-threads-True (.and), and any-thread-True (.or). The result of .popc is the number of\nthreads with a True predicate, while .and and .or indicate if all the threads had a\nTrue predicate or if any of the threads had a True predicate.

\n

Instruction barrier{.cta} has optional .aligned modifier. When specified, it indicates that\nall threads in CTA will execute the same barrier{.cta} instruction. In conditionally executed\ncode, an aligned barrier{.cta} instruction should only be used if it is known that all threads\nin CTA evaluate the condition identically, otherwise behavior is undefined.

\n

Different warps may execute different forms of the barrier{.cta} instruction using the same\nbarrier name and thread count. One example mixes barrier{.cta}.sync and barrier{.cta}.arrive\nto implement producer/consumer models. The producer threads execute barrier{.cta}.arrive to\nannounce their arrival at the barrier and continue execution without delay to produce the next\nvalue, while the consumer threads execute the barrier{.cta}.sync to wait for a resource to be\nproduced. The roles are then reversed, using a different barrier, where the producer threads execute\na barrier{.cta}.sync to wait for a resource to consumed, while the consumer threads announce\nthat the resource has been consumed with barrier{.cta}.arrive. Care must be taken to keep a warp\nfrom executing more barrier{.cta} instructions than intended (barrier{.cta}.arrive followed\nby any other barrier{.cta} instruction to the same barrier) prior to the reset of the\nbarrier. barrier{.cta}.red should not be intermixed with barrier{.cta}.sync or\nbarrier{.cta}.arrive using the same active barrier. Execution in this case is unpredictable.

\n

The optional .cta qualifier simply indicates CTA-level applicability of the barrier and it\ndoesn\u2019t change the semantics of the instruction.

\n

bar{.cta}.sync is equivalent to barrier{.cta}.sync.aligned. bar{.cta}.arrive is\nequivalent to barrier{.cta}.arrive.aligned. bar{.cta}.red is equivalent to\nbarrier{.cta}.red.aligned.

\n
\n

Note

\n

For .target sm_6x or below,

\n
\n
  1. A barrier{.cta} instruction without the .aligned modifier is equivalent to the .aligned\nvariant and has the same restrictions as the .aligned variant.

  2. All threads in a warp (except for those that have exited) must execute the barrier{.cta} instruction\nin convergence.
\n
\n

PTX ISA Notes

\n

bar.sync without a thread count introduced in PTX ISA version 1.0.

\n

Register operands, thread count, and bar.{arrive,red} introduced in PTX ISA version 2.0.

\n

barrier instruction introduced in PTX ISA version 6.0.

\n

.cta qualifier introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Register operands, thread count, and bar{.cta}.{arrive,red} require sm_20 or higher.

\n

Only bar{.cta}.sync with an immediate barrier number is supported for sm_1x targets.

\n

barrier{.cta} instruction requires sm_30 or higher.

\n

Examples

\n
// Use bar.sync to arrive at a pre-computed barrier number and\n// wait for all threads in CTA to also arrive:\n    st.shared [r0],r1;  // write my result to shared memory\n    bar.cta.sync  1;    // arrive, wait for others to arrive\n    ld.shared r2,[r3];  // use shared results from other threads\n\n// Use bar.sync to arrive at a pre-computed barrier number and\n// wait for fixed number of cooperating threads to arrive:\n    #define CNT1 (8*12) // Number of cooperating threads\n\n    st.shared [r0],r1;     // write my result to shared memory\n    bar.cta.sync  1, CNT1; // arrive, wait for others to arrive\n    ld.shared r2,[r3];     // use shared results from other threads\n\n// Use bar.red.and to compare results across the entire CTA:\n    setp.eq.u32 p,r1,r2;         // p is True if r1==r2\n    bar.cta.red.and.pred r3,1,p; // r3=AND(p) forall threads in CTA\n\n// Use bar.red.popc to compute the size of a group of threads\n// that have a specific condition True:\n    setp.eq.u32 p,r1,r2;         // p is True if r1==r2\n    bar.cta.red.popc.u32 r3,1,p; // r3=SUM(p) forall threads in CTA\n\n/* Producer/consumer model. The producer deposits a value in\n * shared memory, signals that it is complete but does not wait\n * using bar.arrive, and begins fetching more data from memory.\n * Once the data returns from memory, the producer must wait\n * until the consumer signals that it has read the value from\n * the shared memory location. In the meantime, a consumer\n * thread waits until the data is stored by the producer, reads\n * it, and then signals that it is done (without waiting).\n */\n    // Producer code places produced value in shared memory.\n    st.shared   [r0],r1;\n    bar.arrive  0,64;\n    ld.global   r1,[r2];\n    bar.sync    1,64;\n    ...\n\n    // Consumer code, reads value from shared memory\n    bar.sync   0,64;\n    ld.shared  r1,[r0];\n    bar.arrive 1,64;\n    ...\n\n    // Examples of barrier.cta.sync\n    st.shared         [r0],r1;\n    barrier.cta.sync  0;\n    ld.shared         r1, [r0];\n
\n
\n
\n
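A minimal CUDA C++ sketch of reaching these barrier forms from source code, assuming the standard __syncthreads* intrinsics (nvcc typically lowers __syncthreads() to bar.sync 0 and the reduction variants to bar.red):

#include <cuda_runtime.h>

// Assumes a launch with 256 threads per block.
__global__ void vote_kernel(const int *in, int *out) {
    __shared__ int scratch[256];
    int tid = threadIdx.x;
    scratch[tid] = in[tid];
    __syncthreads();                        // bar.sync 0: arrive and wait

    int p = scratch[(tid + 1) % blockDim.x] > 0;
    // All threads write the same reduced value, so the stores are benign.
    out[0] = __syncthreads_count(p);        // bar.red.popc.u32
    out[1] = __syncthreads_and(p);          // bar.red.and.pred
    out[2] = __syncthreads_or(p);           // bar.red.or.pred
}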

Parallel Synchronization and Communication Instructions: bar.warp.sync

\n\n\n

Barrier synchronization for threads in a warp.

\n

Syntax

\n
bar.warp.sync      membermask;\n
\n
\n

Description

\n

bar.warp.sync will cause the executing thread to wait until all threads corresponding to\nmembermask have executed a bar.warp.sync with the same membermask value before resuming\nexecution.

\n

Operand membermask specifies a 32-bit integer mask indicating the threads participating\nin the barrier, where each bit position corresponds to the thread\u2019s laneid.

\n

The behavior of bar.warp.sync is undefined if the executing thread is not in the membermask.

\n

bar.warp.sync also guarantees memory ordering among threads participating in the barrier. Thus,\nthreads within a warp that wish to communicate via memory can store to memory, execute\nbar.warp.sync, and then safely read values stored by other threads in the warp.

\n
\n

Note

\n

For .target sm_6x or below, all threads in membermask must execute the same\nbar.warp.sync instruction in convergence, and only threads belonging to some membermask\ncan be active when the bar.warp.sync instruction is executed. Otherwise, the behavior is\nundefined.

\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 6.0.

\n

Target ISA Notes

\n

Requires sm_30 or higher.

\n

Examples

\n
st.shared.u32 [r0],r1;         // write my result to shared memory\nbar.warp.sync  0xffffffff;     // arrive, wait for others to arrive\nld.shared.u32 r2,[r3];         // read results written by other threads\n
\n
\n
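For reference, the same store/sync/load pattern from CUDA C++, assuming the __syncwarp() intrinsic (which compiles to bar.warp.sync):

// Assumes one warp (32 threads) per block so buf indexing is race-free.
__global__ void warp_exchange(float *data) {
    __shared__ float buf[32];
    unsigned lane = threadIdx.x & 31u;
    buf[lane] = data[threadIdx.x];              // write my value to shared memory
    __syncwarp(0xffffffff);                     // bar.warp.sync 0xffffffff
    data[threadIdx.x] = buf[(lane + 1) & 31u];  // safely read another lane's store
}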
", "tooltip": "=====Parallel Synchronization and Communication Instructions: bar, barrier\n\n\n\nBarrier synchronization.\n\nSyntax\n\nbarrier{.cta}.sync{.aligned} a{, b};\n\nbarrier{.cta}.arrive{.aligned} a, b;\n\nbarrier{.cta}.red.popc{.aligned}.u32 d, a{, b}, {!}c;\n\nbarrier{.cta}.red.op{.aligned}.pred p, a{, b}, {!}c;\n\nbar{.cta}.sync a{, b};\n\nbar{.cta}.arrive a, b;\n\nbar{.cta}.red.popc.u32 d, a{, b}, {!}c;\n\nbar{.cta}.red.op.pred p, a{, b}, {!}c;\n\n.op = { .and, .or };\n\nDescription\n\nPerform...\n\n=====Parallel Synchronization and Communication Instructions: bar.warp.sync\n\n\n\nBarrier synchronization for threads in a warp.\n\nSyntax\n\nbar.warp.sync membermask;\n\nDescription\n\nbar.warp.sync will cause executing thread to wait until all threads corresponding to\n\nmembermask have executed a bar.warp.sync with the same membermask value before resuming\n\nexecution.\n\nOperand membermask specifies a 32-bit integer which is a mask indicating threads participating\n\nin barrier where the bit... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier" }; case "barrier": return { "html": "For more information, visit barrier , barrier.cluster .

Parallel Synchronization and Communication Instructions: bar, barrier

\n\n\n

Barrier synchronization.

\n

Syntax

\n
barrier{.cta}.sync{.aligned}      a{, b};\nbarrier{.cta}.arrive{.aligned}    a, b;\n\nbarrier{.cta}.red.popc{.aligned}.u32  d, a{, b}, {!}c;\nbarrier{.cta}.red.op{.aligned}.pred   p, a{, b}, {!}c;\n\nbar{.cta}.sync      a{, b};\nbar{.cta}.arrive    a, b;\n\nbar{.cta}.red.popc.u32  d, a{, b}, {!}c;\nbar{.cta}.red.op.pred   p, a{, b}, {!}c;\n\n.op = { .and, .or };\n
\n
\n

Description

\n

Performs barrier synchronization and communication within a CTA. Each CTA instance has sixteen\nbarriers numbered 0..15.

\n

barrier{.cta} instructions can be used by the threads within the CTA for synchronization and\ncommunication.

\n

Operands a, b, and d have type .u32; operands p and c are predicates. Source\noperand a specifies a logical barrier resource as an immediate constant or register with value\n0 through 15. Operand b specifies the number of threads participating in the barrier. If\nno thread count is specified, all threads in the CTA participate in the barrier. When specifying a\nthread count, the value must be a multiple of the warp size. Note that a non-zero thread count is\nrequired for barrier{.cta}.arrive.

\n

Depending on operand b, either the specified number of threads (in multiples of the warp size) or all\nthreads in the CTA participate in the barrier{.cta} instruction. The barrier{.cta} instructions\nsignal the arrival of the executing threads at the named barrier.

\n

The barrier{.cta} instruction causes the executing thread to wait for all non-exited threads from its\nwarp and marks the warp\u2019s arrival at the barrier. In addition to signaling its arrival at the barrier, the\nbarrier{.cta}.red and barrier{.cta}.sync instructions cause the executing thread to wait for\nnon-exited threads of all other warps participating in the barrier to\narrive. barrier{.cta}.arrive does not cause the executing thread to wait for threads of other\nparticipating warps.

\n

When a barrier completes, the waiting threads are restarted without delay, and the barrier is\nreinitialized so that it can be immediately reused.

\n

The barrier{.cta}.sync or barrier{.cta}.red or barrier{.cta}.arrive instruction\nguarantees that when the barrier completes, prior memory accesses requested by this thread are\nperformed relative to all threads participating in the barrier. The barrier{.cta}.sync and\nbarrier{.cta}.red instruction further guarantees that no new memory access is requested by this\nthread before the barrier completes.

\n

A memory read (e.g., by ld or atom) has been performed when the value read has been\ntransmitted from memory and cannot be modified by another thread participating in the barrier. A\nmemory write (e.g., by st, red or atom) has been performed when the value written has\nbecome visible to other threads participating in the barrier, that is, when the previous value can\nno longer be read.

\n

barrier{.cta}.red performs a reduction operation across threads. The c predicate (or its\ncomplement) from all threads in the CTA are combined using the specified reduction operator. Once\nthe barrier count is reached, the final value is written to the destination register in all threads\nwaiting at the barrier.

\n

The reduction operations for barrier{.cta}.red are population-count (.popc),\nall-threads-True (.and), and any-thread-True (.or). The result of .popc is the number of\nthreads with a True predicate, while .and and .or indicate if all the threads had a\nTrue predicate or if any of the threads had a True predicate.

\n

Instruction barrier{.cta} has optional .aligned modifier. When specified, it indicates that\nall threads in CTA will execute the same barrier{.cta} instruction. In conditionally executed\ncode, an aligned barrier{.cta} instruction should only be used if it is known that all threads\nin CTA evaluate the condition identically, otherwise behavior is undefined.

\n

Different warps may execute different forms of the barrier{.cta} instruction using the same\nbarrier name and thread count. One example mixes barrier{.cta}.sync and barrier{.cta}.arrive\nto implement producer/consumer models. The producer threads execute barrier{.cta}.arrive to\nannounce their arrival at the barrier and continue execution without delay to produce the next\nvalue, while the consumer threads execute the barrier{.cta}.sync to wait for a resource to be\nproduced. The roles are then reversed, using a different barrier, where the producer threads execute\na barrier{.cta}.sync to wait for a resource to be consumed, while the consumer threads announce\nthat the resource has been consumed with barrier{.cta}.arrive. Care must be taken to keep a warp\nfrom executing more barrier{.cta} instructions than intended (barrier{.cta}.arrive followed\nby any other barrier{.cta} instruction to the same barrier) prior to the reset of the\nbarrier. barrier{.cta}.red should not be intermixed with barrier{.cta}.sync or\nbarrier{.cta}.arrive using the same active barrier. Execution in this case is unpredictable.

\n

The optional .cta qualifier simply indicates CTA-level applicability of the barrier and it\ndoesn\u2019t change the semantics of the instruction.

\n

bar{.cta}.sync is equivalent to barrier{.cta}.sync.aligned. bar{.cta}.arrive is\nequivalent to barrier{.cta}.arrive.aligned. bar{.cta}.red is equivalent to\nbarrier{.cta}.red.aligned.

\n
\n

Note

\n

For .target sm_6x or below,

\n
    \n
  1. barrier{.cta} instruction without the .aligned modifier is equivalent to the .aligned\nvariant and has the same restrictions as the .aligned variant.

  2. All threads in the warp (except for those that have exited) must execute the barrier{.cta} instruction\nin convergence.
\n
\n

PTX ISA Notes

\n

bar.sync without a thread count introduced in PTX ISA version 1.0.

\n

Register operands, thread count, and bar.{arrive,red} introduced in PTX ISA version 2.0.

\n

barrier instruction introduced in PTX ISA version 6.0.

\n

.cta qualifier introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Register operands, thread count, and bar{.cta}.{arrive,red} require sm_20 or higher.

\n

Only bar{.cta}.sync with an immediate barrier number is supported for sm_1x targets.

\n

barrier{.cta} instruction requires sm_30 or higher.

\n

Examples

\n
// Use bar.sync to arrive at a pre-computed barrier number and\n// wait for all threads in CTA to also arrive:\n    st.shared [r0],r1;  // write my result to shared memory\n    bar.cta.sync  1;    // arrive, wait for others to arrive\n    ld.shared r2,[r3];  // use shared results from other threads\n\n// Use bar.sync to arrive at a pre-computed barrier number and\n// wait for fixed number of cooperating threads to arrive:\n    #define CNT1 (8*12) // Number of cooperating threads\n\n    st.shared [r0],r1;     // write my result to shared memory\n    bar.cta.sync  1, CNT1; // arrive, wait for others to arrive\n    ld.shared r2,[r3];     // use shared results from other threads\n\n// Use bar.red.and to compare results across the entire CTA:\n    setp.eq.u32 p,r1,r2;         // p is True if r1==r2\n    bar.cta.red.and.pred r3,1,p; // r3=AND(p) forall threads in CTA\n\n// Use bar.red.popc to compute the size of a group of threads\n// that have a specific condition True:\n    setp.eq.u32 p,r1,r2;         // p is True if r1==r2\n    bar.cta.red.popc.u32 r3,1,p; // r3=SUM(p) forall threads in CTA\n\n/* Producer/consumer model. The producer deposits a value in\n * shared memory, signals that it is complete but does not wait\n * using bar.arrive, and begins fetching more data from memory.\n * Once the data returns from memory, the producer must wait\n * until the consumer signals that it has read the value from\n * the shared memory location. In the meantime, a consumer\n * thread waits until the data is stored by the producer, reads\n * it, and then signals that it is done (without waiting).\n */\n    // Producer code places produced value in shared memory.\n    st.shared   [r0],r1;\n    bar.arrive  0,64;\n    ld.global   r1,[r2];\n    bar.sync    1,64;\n    ...\n\n    // Consumer code, reads value from shared memory\n    bar.sync   0,64;\n    ld.shared  r1,[r0];\n    bar.arrive 1,64;\n    ...\n\n    // Examples of barrier.cta.sync\n    st.shared         [r0],r1;\n    barrier.cta.sync  0;\n    ld.shared         r1, [r0];\n
\n
\n
\n

Parallel Synchronization and Communication Instructions: barrier.cluster

\n\n\n

Barrier synchronization within a cluster.

\n

Syntax

\n
barrier.cluster.arrive{.sem}{.aligned};\nbarrier.cluster.wait{.acquire}{.aligned};\n\n.sem = {.release, .relaxed}\n
\n
\n

Description

\n

Performs barrier synchronization and communication within a cluster.

\n

barrier.cluster instructions can be used by the threads within the cluster for synchronization\nand communication.

\n

The barrier.cluster.arrive instruction marks the warp\u2019s arrival at the barrier without causing the\nexecuting thread to wait for threads of other participating warps.

\n

barrier.cluster.wait instruction causes the executing thread to wait for all non-exited threads\nof the cluster to perform barrier.cluster.arrive.

\n

In addition, barrier.cluster instructions cause the executing thread to wait for all non-exited\nthreads from its warp.

\n

When all non-exited threads that executed barrier.cluster.arrive have executed\nbarrier.cluster.wait, the barrier completes and is reinitialized so it can be reused\nimmediately. Each thread must arrive at the barrier only once before the barrier completes.

\n

The barrier.cluster.wait instruction guarantees that when it completes the execution, memory\naccesses (except asynchronous operations) requested, in program order, prior to the preceding\nbarrier.cluster.arrive by all threads in the cluster are complete and visible to the executing\nthread.

\n

There is no memory ordering and visibility guarantee for memory accesses requested by the executing\nthread, in program order, after barrier.cluster.arrive and prior to barrier.cluster.wait.

\n

The optional .relaxed qualifier on barrier.cluster.arrive specifies that there are no memory\nordering and visibility guarantees provided for the memory accesses performed prior to\nbarrier.cluster.arrive.

\n

The optional .sem and .acquire qualifiers on instructions barrier.cluster.arrive and\nbarrier.cluster.wait specify the memory synchronization as described in the Memory Consistency\nModel. If the optional .sem qualifier is absent for\nbarrier.cluster.arrive, .release is assumed by default. If the optional .acquire\nqualifier is absent for barrier.cluster.wait, .acquire is assumed by default.

\n

The optional .aligned qualifier indicates that all threads in the warp must execute the same\nbarrier.cluster instruction. In conditionally executed code, an aligned barrier.cluster\ninstruction should only be used if it is known that all threads in the warp evaluate the condition\nidentically, otherwise behavior is undefined.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.8.

\n

Support for .acquire, .relaxed, .release qualifiers introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
// use of arrive followed by wait\nld.shared::cluster.u32 r0, [addr];\nbarrier.cluster.arrive.aligned;\n...\nbarrier.cluster.wait.aligned;\nst.shared::cluster.u32 [addr], r1;\n\n// use memory fence prior to arrive for relaxed barrier\n@cta0 ld.shared::cluster.u32 r0, [addr];\nfence.cluster.acq_rel;\nbarrier.cluster.arrive.relaxed.aligned;\n...\nbarrier.cluster.wait.aligned;\n@cta1 st.shared::cluster.u32 [addr], r1;\n
\n
\n
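A hedged CUDA C++ sketch, assuming sm_90, a cluster launch, and the CUDA 12 cooperative-groups cluster API; cluster.sync() is expected to lower to barrier.cluster.arrive followed by barrier.cluster.wait:

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void __cluster_dims__(2, 1, 1) cluster_kernel(int *data) {
    cg::cluster_group cluster = cg::this_cluster();
    // ... produce values visible to the whole cluster
    //     (e.g. via distributed shared memory) ...
    cluster.sync();   // barrier.cluster.arrive; barrier.cluster.wait
    // ... consume values produced by other CTAs in the cluster ...
    if (cluster.block_rank() == 0 && threadIdx.x == 0) data[0] = 1;
}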
", "tooltip": "=====Parallel Synchronization and Communication Instructions: bar, barrier\n\n\n\nBarrier synchronization.\n\nSyntax\n\nbarrier{.cta}.sync{.aligned} a{, b};\n\nbarrier{.cta}.arrive{.aligned} a, b;\n\nbarrier{.cta}.red.popc{.aligned}.u32 d, a{, b}, {!}c;\n\nbarrier{.cta}.red.op{.aligned}.pred p, a{, b}, {!}c;\n\nbar{.cta}.sync a{, b};\n\nbar{.cta}.arrive a, b;\n\nbar{.cta}.red.popc.u32 d, a{, b}, {!}c;\n\nbar{.cta}.red.op.pred p, a{, b}, {!}c;\n\n.op = { .and, .or };\n\nDescription\n\nPerform...\n\n=====Parallel Synchronization and Communication Instructions: barrier.cluster\n\n\n\nBarrier synchronization within a cluster.\n\nSyntax\n\nbarrier.cluster.arrive{.sem}{.aligned};\n\nbarrier.cluster.wait{.acquire}{.aligned};\n\n.sem = {.release, .relaxed}\n\nDescription\n\nPerforms barrier synchronization and communication within a cluster.\n\nbarrier.cluster instructions can be used by the threads within the cluster for synchronization\n\nand communication.\n\nbarrier.cluster.arrive instruction marks warps... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier" }; case "bfe": return { "html": "For more information, visit bfe(int) .

Integer Arithmetic Instructions: bfe

\n\n\n

Bit Field Extract.

\n

Syntax

\n
bfe.type  d, a, b, c;\n\n.type = { .u32, .u64,\n          .s32, .s64 };\n
\n
\n

Description

\n

Extract bit field from a and place the zero or sign-extended result in d. Source b gives\nthe bit field starting bit position, and source c gives the bit field length in bits.

\n

Operands a and d have the same type as the instruction type. Operands b and c are\ntype .u32, but are restricted to the 8-bit value range 0..255.

\n

The sign bit of the extracted field is defined as:

\n
\n
.u32, .u64:

zero

\n
\n
.s32, .s64:

msb of input a if the extracted field extends beyond the msb of a; msb of the extracted\nfield, otherwise

\n
\n
\n

If the bit field length is zero, the result is zero.

\n

The destination d is padded with the sign bit of the extracted field. If the start position is\nbeyond the msb of the input, the destination d is filled with the replicated sign bit of the\nextracted field.

\n

Semantics

\n
msb = (.type==.u32 || .type==.s32) ? 31 : 63;\npos = b & 0xff;  // pos restricted to 0..255 range\nlen = c & 0xff;  // len restricted to 0..255 range\n\nif (.type==.u32 || .type==.u64 || len==0)\n    sbit = 0;\nelse\n    sbit = a[min(pos+len-1,msb)];\n\nd = 0;\nfor (i=0; i<=msb; i++) {\n    d[i] = (i<len && pos+i<=msb) ? a[pos+i] : sbit;\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

bfe requires sm_20 or higher.

\n

Examples

\n
bfe.b32  d,a,start,len;\n
\n
\n
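The semantics above can be mirrored in plain CUDA C++; the following bfe_s32 helper is a hypothetical illustration of the .s32 case, not a CUDA intrinsic:

// Reimplements the bfe.s32 semantics: extract len bits starting at pos,
// sign-extending with the msb of the extracted field.
__host__ __device__ int bfe_s32(int a, unsigned b, unsigned c) {
    unsigned pos = b & 0xff;                 // start position, 0..255
    unsigned len = c & 0xff;                 // field length, 0..255
    unsigned last = pos + len - 1;           // bit index of the field's msb
    int sbit = (len == 0) ? 0
             : (a >> (last > 31 ? 31 : last)) & 1;
    unsigned d = 0;
    for (unsigned i = 0; i < 32; ++i) {
        unsigned bit = (i < len && pos + i <= 31)
                     ? ((unsigned)a >> (pos + i)) & 1u
                     : (unsigned)sbit;       // pad with the field's sign bit
        d |= bit << i;
    }
    return (int)d;
}
// e.g. bfe_s32(0x80000000, 28, 4) == -8 (top nibble 0b1000, sign-extended)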
", "tooltip": "Bit Field Extract.\n\nSyntax\n\nbfe.type d, a, b, c;\n\n.type = { .u32, .u64,\n\n .s32, .s64 };\n\nDescription\n\nExtract bit field from a and place the zero or sign-extended result in d. Source b gives\n\nthe bit field starting bit position, and source c gives the bit field length in bits.\n\nOperands a and d have the same type as the instruction type. Operands b and c are\n\ntype .u32, but are restricted to the 8-bit value range 0..255.\n\nThe sign bit of the extracted field is defined as:\n\n.u32, .u64:zero\n\n.s32, .s64:msb of input a if the extracted field extends beyond the msb of a msb of extracted\n\nfield, otherwise\n\nIf the bit field length is zero, the result is zero.\n\nThe destination d is padded with the sign bit of the extracted field. If the start position is\n\nbeyond the msb of the input, the destination d is filled with the replicated sign bit of the\n\nextracted field.\n\nSemantics\n\nmsb = (.type==.u32 || .type==.s32) ? 31 : 63;\n\npos = b & 0xff; // pos restricted to 0..255 range\n\nlen = c & 0xff; // len restricted to 0..255 range\n\nif (.type==.u32 || .type==.u64 || len==0)\n\n sbit = 0;\n\nelse\n\n sbit = a[min(pos+len-1,msb)];\n\nd = 0;\n\nfor (i=0; i<=msb; i++) {\n\n d[i] = (ibfi(int) .

Integer Arithmetic Instructions: bfi

\n\n\n

Bit Field Insert.

\n

Syntax

\n
bfi.type  f, a, b, c, d;\n\n.type = { .b32, .b64 };\n
\n
\n

Description

\n

Align and insert a bit field from a into b, and place the result in f. Source c\ngives the starting bit position for the insertion, and source d gives the bit field length in\nbits.

\n

Operands a, b, and f have the same type as the instruction type. Operands c and\nd are type .u32, but are restricted to the 8-bit value range 0..255.

\n

If the bit field length is zero, the result is b.

\n

If the start position is beyond the msb of the input, the result is b.

\n

Semantics

\n
msb = (.type==.b32) ? 31 : 63;\npos = c & 0xff;  // pos restricted to 0..255 range\nlen = d & 0xff;  // len restricted to 0..255 range\n\nf = b;\nfor (i=0; i<len && pos+i<=msb; i++) {\n    f[pos+i] = a[i];\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

bfi requires sm_20 or higher.

\n

Examples

\n
bfi.b32  d,a,b,start,len;\n
\n
\n
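A small CUDA C++ sketch invoking the instruction through inline PTX (assuming the usual asm() constraint syntax); bfi_b32 is an illustrative wrapper, not a CUDA intrinsic:

// Insert the low len bits of a into b starting at bit pos.
__device__ unsigned bfi_b32(unsigned a, unsigned b, unsigned pos, unsigned len) {
    unsigned f;
    asm("bfi.b32 %0, %1, %2, %3, %4;"
        : "=r"(f)
        : "r"(a), "r"(b), "r"(pos), "r"(len));
    return f;
}
// e.g. bfi_b32(0xF, 0x0, 4, 4) == 0x000000F0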
", "tooltip": "Bit Field Insert.\n\nSyntax\n\nbfi.type f, a, b, c, d;\n\n.type = { .b32, .b64 };\n\nDescription\n\nAlign and insert a bit field from a into b, and place the result in f. Source c\n\ngives the starting bit position for the insertion, and source d gives the bit field length in\n\nbits.\n\nOperands a, b, and f have the same type as the instruction type. Operands c and\n\nd are type .u32, but are restricted to the 8-bit value range 0..255.\n\nIf the bit field length is zero, the result is b.\n\nIf the start position is beyond the msb of the input, the result is b.\n\nSemantics\n\nmsb = (.type==.b32) ? 31 : 63;\n\npos = c & 0xff; // pos restricted to 0..255 range\n\nlen = d & 0xff; // len restricted to 0..255 range\n\nf = b;\n\nfor (i=0; ibfind(int) .

Integer Arithmetic Instructions: bfind

\n\n\n

Find most significant non-sign bit.

\n

Syntax

\n
bfind.type           d, a;\nbfind.shiftamt.type  d, a;\n\n.type = { .u32, .u64,\n          .s32, .s64 };\n
\n
\n

Description

\n

Find the bit position of the most significant non-sign bit in a and place the result in\nd. Operand a has the instruction type, and destination d has type .u32. For unsigned\nintegers, bfind returns the bit position of the most significant 1. For signed integers,\nbfind returns the bit position of the most significant 0 for negative inputs and the most\nsignificant 1 for non-negative inputs.

\n

If .shiftamt is specified, bfind returns the shift amount needed to left-shift the found bit\ninto the most-significant bit position.

\n

bfind returns 0xffffffff if no non-sign bit is found.

\n

Semantics

\n
msb = (.type==.u32 || .type==.s32) ? 31 : 63;\n// negate negative signed inputs\nif ( (.type==.s32 || .type==.s64) && (a & (1<<msb)) ) {\n    a = ~a;\n}\n.u32  d = 0xffffffff;\nfor (.s32 i=msb; i>=0; i--) {\n    if (a & (1<<i))  { d = i; break; }\n}\nif (.shiftamt && d != 0xffffffff)  { d = msb - d; }\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

bfind requires sm_20 or higher.

\n

Examples

\n
bfind.u32  d, a;\nbfind.shiftamt.s64  cnt, X;  // cnt is .u32\n
\n
\n
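A CUDA C++ sketch via inline PTX (assuming the usual asm() constraint syntax); bfind_u32 is an illustrative wrapper, not a CUDA intrinsic:

// Position of the most significant set bit, or 0xffffffff if none.
__device__ unsigned bfind_u32(unsigned a) {
    unsigned d;
    asm("bfind.u32 %0, %1;" : "=r"(d) : "r"(a));
    return d;
}
// e.g. bfind_u32(0x00000001) == 0, bfind_u32(0x80000000) == 31,
//      bfind_u32(0) == 0xffffffff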
", "tooltip": "Find most significant non-sign bit.\n\nSyntax\n\nbfind.type d, a;\n\nbfind.shiftamt.type d, a;\n\n.type = { .u32, .u64,\n\n .s32, .s64 };\n\nDescription\n\nFind the bit position of the most significant non-sign bit in a and place the result in\n\nd. Operand a has the instruction type, and destination d has type .u32. For unsigned\n\nintegers, bfind returns the bit position of the most significant 1. For signed integers,\n\nbfind returns the bit position of the most significant 0 for negative inputs and the most\n\nsignificant 1 for non-negative inputs.\n\nIf .shiftamt is specified, bfind returns the shift amount needed to left-shift the found bit\n\ninto the most-significant bit position.\n\nbfind returns 0xffffffff if no non-sign bit is found.\n\nSemantics\n\nmsb = (.type==.u32 || .type==.s32) ? 31 : 63;\n\n// negate negative signed inputs\n\nif ( (.type==.s32 || .type==.s64) && (a & (1<=0; i--) {\n\n if (a & (1<bmsk(int) .

Integer Arithmetic Instructions: bmsk

\n\n\n

Bit Field Mask.

\n

Syntax

\n
bmsk.mode.b32  d, a, b;\n\n.mode = { .clamp, .wrap };\n
\n
\n

Description

\n

Generates a 32-bit mask starting from the bit position specified in operand a, and of the width\nspecified in operand b. The generated bitmask is stored in the destination operand d.

\n

The resulting bitmask is 0 in the following cases:

\n
    \n
  • When the value of a is 32 or higher and .mode is .clamp.

  • When either the specified value of b or the wrapped value of b (when .mode is\nspecified as .wrap) is 0.
\n

Semantics

\n
a1    = a & 0x1f;\nmask0 = (~0) << a1;\nb1    = b & 0x1f;\nsum   = a1 + b1;\nmask1 = (~0) << sum;\n\nsum-overflow          = sum >= 32 ? true : false;\nbit-position-overflow = false;\nbit-width-overflow    = false;\n\nif (.mode == .clamp) {\n    if (a >= 32) {\n        bit-position-overflow = true;\n        mask0 = 0;\n    }\n    if (b >= 32) {\n        bit-width-overflow = true;\n    }\n}\n\nif (sum-overflow || bit-position-overflow || bit-width-overflow) {\n    mask1 = 0;\n} else if (b1 == 0) {\n    mask1 = ~0;\n}\nd = mask0 & ~mask1;\n
\n
\n

Notes

\n

The bitmask width specified by operand b is limited to range 0..32 in .clamp mode and to\nrange 0..31 in .wrap mode.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.6.

\n

Target ISA Notes

\n

bmsk requires sm_70 or higher.

\n

Examples

\n
bmsk.clamp.b32  rd, ra, rb;\nbmsk.wrap.b32   rd, 1, 2; // Creates a bitmask of 0x00000006.\n
\n
\n
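A plain CUDA C++ restatement of the semantics above, handling both modes; bmsk_b32 is an illustrative helper, not a CUDA intrinsic:

// Build a mask of len bits (b) starting at bit position a.
__host__ __device__ unsigned bmsk_b32(unsigned a, unsigned b, bool clamp_mode) {
    unsigned a1 = a & 0x1f, b1 = b & 0x1f;
    unsigned mask0 = ~0u << a1;            // bits at and above the start position
    unsigned sum = a1 + b1;                // <= 62, so test overflow explicitly
    bool overflow = (sum >= 32);
    if (clamp_mode) {
        if (a >= 32) { overflow = true; mask0 = 0; }  // bit-position overflow
        if (b >= 32) { overflow = true; }             // bit-width overflow
    }
    unsigned mask1;
    if (overflow)     mask1 = 0;
    else if (b1 == 0) mask1 = ~0u;         // zero width: empty mask below
    else              mask1 = ~0u << sum;
    return mask0 & ~mask1;
}
// bmsk_b32(1, 2, false) == 0x00000006, matching the bmsk.wrap.b32 example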
", "tooltip": "Bit Field Mask.\n\nSyntax\n\nbmsk.mode.b32 d, a, b;\n\n.mode = { .clamp, .wrap };\n\nDescription\n\nGenerates a 32-bit mask starting from the bit position specified in operand a, and of the width\n\nspecified in operand b. The generated bitmask is stored in the destination operand d.\n\nThe resulting bitmask is 0 in the following cases:\n\nWhen the value of a is 32 or higher and .mode is .clamp.\n\nWhen either the specified value of b or the wrapped value of b (when .mode is\n\nspecified as .wrap) is 0.\n\nSemantics\n\na1 = a & 0x1f;\n\nmask0 = (~0) << a1;\n\nb1 = b & 0x1f;\n\nsum = a1 + b1;\n\nmask1 = (~0) << sum;\n\nsum-overflow = sum >= 32 ? true : false;\n\nbit-position-overflow = false;\n\nbit-width-overflow = false;\n\nif (.mode == .clamp) {\n\n if (a >= 32) {\n\n bit-position-overflow = true;\n\n mask0 = 0;\n\n }\n\n if (b >= 32) {\n\n bit-width-overflow = true;\n\n }\n\n}\n\nif (sum-overflow || bit-position-overflow || bit-width-overflow) {\n\n mask1 = 0;\n\n} else if (b1 == 0) {\n\n mask1 = ~0;\n\n}\n\nd = mask0 & ~mask1;\n\nNotes\n\nThe bitmask width specified by operand b is limited to range 0..32 in .clamp mode and to\n\nrange 0..31 in .wrap mode.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.6.\n\nTarget ISA Notes\n\nbmsk requires sm_70 or higher.\n\nExamples\n\nbmsk.clamp.b32 rd, ra, rb;\n\nbmsk.wrap.b32 rd, 1, 2; // Creates a bitmask of 0x00000006.\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bmsk" }; case "bra": return { "html": "For more information, visit bra .

Control Flow Instructions: bra

\n\n\n

Branch to a target and continue execution there.

\n

Syntax

\n
@p   bra{.uni}  tgt;           // tgt is a label\n     bra{.uni}  tgt;           // unconditional branch\n
\n
\n

Description

\n

Continue execution at the target. Conditional branches are specified by using a guard predicate. The\nbranch target must be a label.

\n

bra.uni is guaranteed to be non-divergent, i.e. all active threads in a warp that are currently\nexecuting this instruction have identical values for the guard predicate and branch target.

\n

Semantics

\n
if (p) {\n    pc = tgt;\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Unimplemented indirect branch introduced in PTX ISA version 2.1 has been removed from the spec.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
bra.uni  L_exit;    // uniform unconditional jump\n@q  bra      L23;   // conditional branch\n
\n
\n
", "tooltip": "Branch to a target and continue execution there.\n\nSyntax\n\n@p bra{.uni} tgt; // tgt is a label\n\n bra{.uni} tgt; // unconditional branch\n\nDescription\n\nContinue execution at the target. Conditional branches are specified by using a guard predicate. The\n\nbranch target must be a label.\n\nbra.uni is guaranteed to be non-divergent, i.e. all active threads in a warp that are currently\n\nexecuting this instruction have identical values for the guard predicate and branch target.\n\nSemantics\n\nif (p) {\n\n pc = tgt;\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nUnimplemented indirect branch introduced in PTX ISA version 2.1 has been removed from the spec.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nbra.uni L_exit; // uniform unconditional jump\n\n@q bra L23; // conditional branch\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra" }; case "branchtargets": return { "html": "For more information, visit branchtargets .

Control Flow Directives: .branchtargets

\n\n\n

Declare a list of potential branch targets.

\n

Syntax

\n
Label:   .branchtargets  list-of-labels ;\n
\n
\n

Description

\n

Declares a list of potential branch targets for a subsequent brx.idx, and associates the list\nwith the label at the start of the line.

\n

All control flow labels in the list must occur within the same function as the declaration.

\n

The list of labels may use the compact, shorthand syntax for enumerating a range of labels having a\ncommon prefix, similar to the syntax described in Parameterized Variable Names.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.1.

\n

Target ISA Notes

\n

Requires sm_20 or higher.

\n

Examples

\n
  .function foo () {\n      .reg .u32 %r0;\n      ...\n      L1:\n      ...\n      L2:\n      ...\n      L3:\n      ...\n      ts: .branchtargets L1, L2, L3;\n      @p brx.idx %r0, ts;\n      ...\n\n.function bar() {\n      .reg .u32 %r0;\n      ...\n      N0:\n      ...\n      N1:\n      ...\n      N2:\n      ...\n      N3:\n      ...\n      N4:\n      ...\n      ts: .branchtargets N<5>;\n      @p brx.idx %r0, ts;\n      ...\n
\n
\n
", "tooltip": "Declare a list of potential branch targets.\n\nSyntax\n\nLabel: .branchtargets list-of-labels ;\n\nDescription\n\nDeclares a list of potential branch targets for a subsequent brx.idx, and associates the list\n\nwith the label at the start of the line.\n\nAll control flow labels in the list must occur within the same function as the declaration.\n\nThe list of labels may use the compact, shorthand syntax for enumerating a range of labels having a\n\ncommon prefix, similar to the syntax described in Parameterized Variable Names.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.1.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\nExamples\n\n .function foo () {\n\n .reg .u32 %r0;\n\n ...\n\n L1:\n\n ...\n\n L2:\n\n ...\n\n L3:\n\n ...\n\n ts: .branchtargets L1, L2, L3;\n\n @p brx.idx %r0, ts;\n\n ...\n\n.function bar() {\n\n .reg .u32 %r0;\n\n ...\n\n N0:\n\n ...\n\n N1:\n\n ...\n\n N2:\n\n ...\n\n N3:\n\n ...\n\n N4:\n\n ...\n\n ts: .branchtargets N<5>;\n\n @p brx.idx %r0, ts;\n\n ...\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-directives-branchtargets" }; case "brev": return { "html": "For more information, visit brev(int) .

Integer Arithmetic Instructions: brev

\n\n\n

Bit reverse.

\n

Syntax

\n
brev.type  d, a;\n\n.type = { .b32, .b64 };\n
\n
\n

Description

\n

Perform bitwise reversal of input.

\n

Semantics

\n
msb = (.type==.b32) ? 31 : 63;\n\nfor (i=0; i<=msb; i++) {\n    d[i] = a[msb-i];\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

brev requires sm_20 or higher.

\n

Examples

\n
brev.b32  d, a;\n
\n
\n
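In CUDA C++ the same operation is available through the __brev()/__brevll() intrinsics, which map to brev.b32/brev.b64:

__global__ void reverse_bits(const unsigned *in32, unsigned *out32,
                             const unsigned long long *in64,
                             unsigned long long *out64) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    out32[i] = __brev(in32[i]);      // brev.b32
    out64[i] = __brevll(in64[i]);    // brev.b64
}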
", "tooltip": "Bit reverse.\n\nSyntax\n\nbrev.type d, a;\n\n.type = { .b32, .b64 };\n\nDescription\n\nPerform bitwise reversal of input.\n\nSemantics\n\nmsb = (.type==.b32) ? 31 : 63;\n\nfor (i=0; i<=msb; i++) {\n\n d[i] = a[msb-i];\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nbrev requires sm_20 or higher.\n\nExamples\n\nbrev.b32 d, a;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev" }; case "brkpt": return { "html": "For more information, visit brkpt .

Miscellaneous Instructions: brkpt

\n\n\n

Breakpoint.

\n

Syntax

\n
brkpt;\n
\n
\n

Description

\n

Suspends execution.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

brkpt requires sm_11 or higher.

\n

Examples

\n
    brkpt;\n@p  brkpt;\n
\n
\n
", "tooltip": "Breakpoint.\n\nSyntax\n\nbrkpt;\n\nDescription\n\nSuspends execution.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nbrkpt requires sm_11 or higher.\n\nExamples\n\n brkpt;\n\n@p brkpt;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-brkpt" }; case "brx": return { "html": "For more information, visit brx.idx .

Control Flow Instructions: brx.idx

\n\n\n

Branch to a label indexed from a list of potential branch targets.

\n

Syntax

\n
@p    brx.idx{.uni} index, tlist;\n      brx.idx{.uni} index, tlist;\n
\n
\n

Description

\n

Index into a list of possible destination labels, and continue execution from the chosen\nlabel. Conditional branches are specified by using a guard predicate.

\n

brx.idx.uni guarantees that the branch is non-divergent, i.e. all active threads in a warp that\nare currently executing this instruction have identical values for the guard predicate and the\nindex argument.

\n

The index operand is a .u32 register. The tlist operand must be the label of a\n.branchtargets directive. It is accessed as a zero-based sequence using index. Behavior is\nundefined if the value of index is greater than or equal to the length of tlist.

\n

The .branchtargets directive must be defined in the local function scope before it is used. It\nmust refer to labels within the current function.

\n

Semantics

\n
if (p) {\n    if (index < length(tlist)) {\n      pc = tlist[index];\n    } else {\n      pc = undefined;\n    }\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 6.0.

\n

Target ISA Notes

\n

Requires sm_30 or higher.

\n

Examples

\n
.function foo () {\n    .reg .u32 %r0;\n    ...\n    L1:\n    ...\n    L2:\n    ...\n    L3:\n    ...\n    ts: .branchtargets L1, L2, L3;\n    @p brx.idx %r0, ts;\n    ...\n}\n
\n
\n
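For orientation only: a dense switch over a small index, as in the CUDA C++ sketch below, is the kind of source construct a backend may lower to a .branchtargets table plus brx.idx, though whether it does so is compiler-dependent:

__global__ void dispatch(const int *op, int *x) {
    int i = threadIdx.x;
    switch (op[i] & 3) {        // dense, zero-based index into four targets
        case 0: x[i] += 1; break;
        case 1: x[i] -= 1; break;
        case 2: x[i] *= 2; break;
        case 3: x[i]  = 0; break;
    }
}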
", "tooltip": "Branch to a label indexed from a list of potential branch targets.\n\nSyntax\n\n@p brx.idx{.uni} index, tlist;\n\n brx.idx{.uni} index, tlist;\n\nDescription\n\nIndex into a list of possible destination labels, and continue execution from the chosen\n\nlabel. Conditional branches are specified by using a guard predicate.\n\nbrx.idx.uni guarantees that the branch is non-divergent, i.e. all active threads in a warp that\n\nare currently executing this instruction have identical values for the guard predicate and the\n\nindex argument.\n\nThe index operand is a .u32 register. The tlist operand must be the label of a\n\n.branchtargets directive. It is accessed as a zero-based sequence using index. Behaviour is\n\nundefined if the value of index is greater than or equal to the length of tlist.\n\nThe .branchtargets directive must be defined in the local function scope before it is used. It\n\nmust refer to labels within the current function.\n\nSemantics\n\nif (p) {\n\n if (index < length(tlist)) {\n\n pc = tlist[index];\n\n } else {\n\n pc = undefined;\n\n }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.0.\n\nTarget ISA Notes\n\nRequires sm_30 or higher.\n\nExamples\n\n.function foo () {\n\n .reg .u32 %r0;\n\n ...\n\n L1:\n\n ...\n\n L2:\n\n ...\n\n L3:\n\n ...\n\n ts: .branchtargets L1, L2, L3;\n\n @p brx.idx %r0, ts;\n\n ...\n\n}\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-brx-idx" }; case "call": return { "html": "For more information, visit call .

Control Flow Instructions: call

\n\n\n

Call a function, recording the return location.

\n

Syntax

\n
// direct call to named function, func is a symbol\ncall{.uni} (ret-param), func, (param-list);\ncall{.uni} func, (param-list);\ncall{.uni} func;\n\n// indirect call via pointer, with full list of call targets\ncall{.uni} (ret-param), fptr, (param-list), flist;\ncall{.uni} fptr, (param-list), flist;\ncall{.uni} fptr, flist;\n\n// indirect call via pointer, with no knowledge of call targets\ncall{.uni} (ret-param), fptr, (param-list), fproto;\ncall{.uni} fptr, (param-list), fproto;\ncall{.uni} fptr, fproto;\n
\n
\n

Description

\n

The call instruction stores the address of the next instruction, so execution can resume at that\npoint after executing a ret instruction. A call is assumed to be divergent unless the\n.uni suffix is present. The .uni suffix indicates that the call is guaranteed to be\nnon-divergent, i.e. all active threads in a warp that are currently executing this instruction have\nidentical values for the guard predicate and call target.

\n

For direct calls, the called location func must be a symbolic function name; for indirect calls,\nthe called location fptr must be an address of a function held in a register. Input arguments\nand return values are optional. Arguments may be registers, immediate constants, or variables in\n.param space. Arguments are pass-by-value.

\n

Indirect calls require an additional operand, flist or fproto, to communicate the list of\npotential call targets or the common function prototype of all call targets,\nrespectively. In the first case, flist gives a complete list of potential call targets and\nthe optimizing backend is free to optimize the calling convention. In the second case, where the\ncomplete list of potential call targets may not be known, the common function prototype is given\nand the call must obey the ABI\u2019s calling convention.

\n

The flist operand is either the name of an array (call table) initialized to a list of function\nnames; or a label associated with a .calltargets directive, which declares a list of potential\ncall targets. In both cases the fptr register holds the address of a function listed in the call\ntable or .calltargets list, and the call operands are type-checked against the type\nsignature of the functions indicated by flist.

\n

The fproto operand is the name of a label associated with a .callprototype directive. This\noperand is used when a complete list of potential targets is not known. The call operands are\ntype-checked against the prototype, and code generation will follow the ABI calling convention. If a\nfunction that doesn\u2019t match the prototype is called, the behavior is undefined.

\n

Call tables may be declared at module scope or local scope, in either the constant or global state\nspace. The .calltargets and .callprototype directives must be declared within a function\nbody. All functions must be declared prior to being referenced in a call table initializer or\n.calltargets directive.

\n

PTX ISA Notes

\n

Direct call introduced in PTX ISA version 1.0. Indirect call introduced in PTX ISA version 2.1.

\n

Target ISA Notes

\n

Direct call supported on all target architectures. Indirect call requires sm_20 or higher.

\n

Examples

\n
// examples of direct call\n    call     init;    // call function 'init'\n    call.uni g, (a);  // call function 'g' with parameter 'a'\n@p  call     (d), h, (a, b);  // return value into register d\n\n// call-via-pointer using jump table\n.func (.reg .u32 rv) foo (.reg .u32 a, .reg .u32 b) ...\n.func (.reg .u32 rv) bar (.reg .u32 a, .reg .u32 b) ...\n.func (.reg .u32 rv) baz (.reg .u32 a, .reg .u32 b) ...\n\n.global .u32 jmptbl[5] = { foo, bar, baz };\n      ...\n@p    ld.global.u32  %r0, [jmptbl+4];\n@p    ld.global.u32  %r0, [jmptbl+8];\n      call  (retval), %r0, (x, y), jmptbl;\n\n// call-via-pointer using .calltargets directive\n.func (.reg .u32 rv) foo (.reg .u32 a, .reg .u32 b) ...\n.func (.reg .u32 rv) bar (.reg .u32 a, .reg .u32 b) ...\n.func (.reg .u32 rv) baz (.reg .u32 a, .reg .u32 b) ...\n      ...\n@p    mov.u32  %r0, foo;\n@q    mov.u32  %r0, baz;\nFtgt: .calltargets foo, bar, baz;\n      call  (retval), %r0, (x, y), Ftgt;\n\n// call-via-pointer using .callprototype directive\n.func dispatch (.reg .u32 fptr, .reg .u32 idx)\n{\n...\nFproto: .callprototype _ (.param .u32 _, .param .u32 _);\n      call  %fptr, (x, y), Fproto;\n...\n
\n
\n
", "tooltip": "Call a function, recording the return location.\n\nSyntax\n\n// direct call to named function, func is a symbol\n\ncall{.uni} (ret-param), func, (param-list);\n\ncall{.uni} func, (param-list);\n\ncall{.uni} func;\n\n// indirect call via pointer, with full list of call targets\n\ncall{.uni} (ret-param), fptr, (param-list), flist;\n\ncall{.uni} fptr, (param-list), flist;\n\ncall{.uni} fptr, flist;\n\n// indirect call via pointer, with no knowledge of call targets\n\ncall{.uni} (ret-param), fptr, (param-list), fproto;\n\ncall{.uni} fptr, (param-list), fproto;\n\ncall{.uni} fptr, fproto;\n\nDescription\n\nThe call instruction stores the address of the next instruction, so execution can resume at that\n\npoint after executing a ret instruction. A call is assumed to be divergent unless the\n\n.uni suffix is present. The .uni suffix indicates that the call is guaranteed to be\n\nnon-divergent, i.e. all active threads in a warp that are currently executing this instruction have\n\nidentical values for the guard predicate and call target.\n\nFor direct calls, the called location func must be a symbolic function name; for indirect calls,\n\nthe called location fptr must be an address of a function held in a register. Input arguments\n\nand return values are optional.\u00a0Arguments may be registers, immediate constants, or variables in\n\n.param space. Arguments are pass-by-value.\n\nIndirect calls require an additional operand, flist or fproto, to communicate the list of\n\npotential call targets or the common function prototype of all call targets,\n\nrespectively. In the first case, flist gives a complete list of potential call targets and\n\nthe optimizing backend is free to optimize the calling convention. In the second case, where the\n\ncomplete list of potential call targets may not be known, the common function prototype is given\n\nand the call must obey the ABI\u2019s calling convention.\n\nThe flist operand is either the name of an array (call table) initialized to a list of function\n\nnames; or a label associated with a .calltargets directive, which declares a list of potential\n\ncall targets. In both cases the fptr register holds the address of a function listed in the call\n\ntable or .calltargets list, and the call operands are type-checked against the type\n\nsignature of the functions indicated by flist.\n\nThe fproto operand is the name of a label associated with a .callprototype directive. This\n\noperand is used when a complete list of potential targets is not known. The call operands are\n\ntype-checked against the prototype, and code generation will follow the ABI calling convention. If a\n\nfunction that doesn\u2019t match the prototype is called, the behavior is undefined.\n\nCall tables may be declared at module scope or local scope, in either the constant or global state\n\nspace. The .calltargets and .callprototype directives must be declared within a function\n\nbody. All functions must be declared prior to being referenced in a call table initializer or\n\n.calltargets directive.\n\nPTX ISA Notes\n\nDirect call introduced in PTX ISA version 1.0. Indirect call introduced in PTX ISA version 2.1.\n\nTarget ISA Notes\n\nDirect call supported on all target architectures. 
Indirect call requires sm_20 or higher.\n\nExamples\n\n// examples of direct call\n\n call init; // call function 'init'\n\n call.uni g, (a); // call function 'g' with parameter 'a'\n\n@p call (d), h, (a, b); // return value into register d\n\n// call-via-pointer using jump table\n\n.func (.reg .u32 rv) foo (.reg .u32 a, .reg .u32 b) ...\n\n.func (.reg .u32 rv) bar (.reg .u32 a, .reg .u32 b) ...\n\n.func (.reg .u32 rv) baz (.reg .u32 a, .reg .u32 b) ...\n\n.global .u32 jmptbl[5] = { foo, bar, baz };\n\n ...\n\n@p ld.global.u32 %r0, [jmptbl+4];\n\n@p ld.global.u32 %r0, [jmptbl+8];\n\n call (retval), %r0, (x, y), jmptbl;\n\n// call-via-pointer using .calltargets directive\n\n.func (.reg .u32 rv) foo (.reg .u32 a, .reg .u32 b) ...\n\n.func (.reg .u32 rv) bar (.reg .u32 a, .reg .u32 b) ...\n\n.func (.reg .u32 rv) baz (.reg .u32 a, .reg .u32 b) ...\n\n ...\n\n@p mov.u32 %r0, foo;\n\n@q mov.u32 %r0, baz;\n\nFtgt: ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call" }; case "callprototype": return { "html": "For more information, visit callprototype .

Control Flow Directives: .callprototype

\n\n\n

Declare a prototype for use in an indirect call.

\n

Syntax

\n
 // no input or return parameters\nlabel: .callprototype _ .noreturn;\n// input params, no return params\nlabel: .callprototype _ (param-list) .noreturn;\n// no input params, // return params\nlabel: .callprototype (ret-param) _ ;\n// input, return parameters\nlabel: .callprototype (ret-param) _ (param-list);\n
\n
\n

Description

\n

Defines a prototype with no specific function name, and associates the prototype with a label. The\nprototype may then be used in indirect call instructions where there is incomplete knowledge of the\npossible call targets.

\n

Parameters may have either base types in the register or parameter state spaces, or array types in\nparameter state space. The sink symbol '_' may be used to avoid dummy parameter names.

\n

An optional .noreturn directive indicates that the function does not return to the caller\nfunction. .noreturn directive cannot be specified on functions which have return parameters. See\nthe description of .noreturn directive in Performance-Tuning Directives: .noreturn.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.1.

\n

Support for .noreturn directive introduced in PTX ISA version 6.4.

\n

Target ISA Notes

\n

Requires sm_20 or higher.

\n

.noreturn directive requires sm_30 or higher.

\n

Examples

\n
Fproto1: .callprototype  _ ;\nFproto2: .callprototype  _ (.param .f32 _);\nFproto3: .callprototype  (.param .u32 _) _ ;\nFproto4: .callprototype  (.param .u32 _) _ (.param .f32 _);\n...\n@p   call  (%val), %r0, (%f1), Fproto4;\n...\n\n// example of array parameter\nFproto5: .callprototype _ (.param .b8 _[12]);\n\nFproto6: .callprototype  _ (.param .f32 _) .noreturn;\n...\n@p   call  %r0, (%f1), Fproto6;\n...\n
\n
\n
", "tooltip": "Declare a prototype for use in an indirect call.\n\nSyntax\n\n // no input or return parameters\n\nlabel: .callprototype _ .noreturn;\n\n// input params, no return params\n\nlabel: .callprototype _ (param-list) .noreturn;\n\n// no input params, // return params\n\nlabel: .callprototype (ret-param) _ ;\n\n// input, return parameters\n\nlabel: .callprototype (ret-param) _ (param-list);\n\nDescription\n\nDefines a prototype with no specific function name, and associates the prototype with a label. The\n\nprototype may then be used in indirect call instructions where there is incomplete knowledge of the\n\npossible call targets.\n\nParameters may have either base types in the register or parameter state spaces, or array types in\n\nparameter state space. The sink symbol '_' may be used to avoid dummy parameter names.\n\nAn optional .noreturn directive indicates that the function does not return to the caller\n\nfunction. .noreturn directive cannot be specified on functions which have return parameters. See\n\nthe description of .noreturn directive in Performance-Tuning Directives: .noreturn.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.1.\n\nSupport for .noreturn directive introduced in PTX ISA version 6.4.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\n.noreturn directive requires sm_30 or higher.\n\nExamples\n\nFproto1: .callprototype _ ;\n\nFproto2: .callprototype _ (.param .f32 _);\n\nFproto3: .callprototype (.param .u32 _) _ ;\n\nFproto4: .callprototype (.param .u32 _) _ (.param .f32 _);\n\n...\n\n@p call (%val), %r0, (%f1), Fproto4;\n\n...\n\n// example of array parameter\n\nFproto5: .callprototype _ (.param .b8 _[12]);\n\nFproto6: .callprototype _ (.param .f32 _) .noreturn;\n\n...\n\n@p call %r0, (%f1), Fproto6;\n\n...\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-directives-callprototype" }; case "calltargets": return { "html": "For more information, visit calltargets .

Control Flow Directives: .calltargets

\n\n\n

Declare a list of potential call targets.

\n

Syntax

\n
Label:   .calltargets  list-of-functions ;\n
\n
\n

Description

\n

Declares a list of potential call targets for a subsequent indirect call, and associates the list\nwith the label at the start of the line.

\n

All functions named in the list must be declared prior to the .calltargets directive, and all\nfunctions must have the same type signature.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.1.

\n

Target ISA Notes

\n

Requires sm_20 or higher.

\n

Examples

\n
calltgt:  .calltargets  fastsin, fastcos;\n...\n@p   call  (%f1), %r0, (%x), calltgt;\n...\n
\n
\n
", "tooltip": "Declare a list of potential call targets.\n\nSyntax\n\nLabel: .calltargets list-of-functions ;\n\nDescription\n\nDeclares a list of potential call targets for a subsequent indirect call, and associates the list\n\nwith the label at the start of the line.\n\nAll functions named in the list must be declared prior to the .calltargets directive, and all\n\nfunctions must have the same type signature.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.1.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\nExamples\n\ncalltgt: .calltargets fastsin, fastcos;\n\n...\n\n@p call (%f1), %r0, (%x), calltgt;\n\n...\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-directives-calltargets" }; case "clock": return { "html": "For more information, visit clock .

Special Registers: %clock, %clock_hi

\n\n\n
\n
%clock

A predefined, read-only 32-bit unsigned cycle counter.

\n
\n
%clock_hi

The upper 32-bits of %clock64 special register.

\n
\n
\n

Syntax (predefined)

\n
.sreg .u32 %clock;\n.sreg .u32 %clock_hi;\n
\n
\n

Description

\n

Special registers %clock and %clock_hi are unsigned 32-bit read-only cycle counters that wrap\nsilently.

\n

PTX ISA Notes

\n

%clock introduced in PTX ISA version 1.0.

\n

%clock_hi introduced in PTX ISA version 5.0.

\n

Target ISA Notes

\n

%clock supported on all target architectures.

\n

%clock_hi requires sm_20 or higher.

\n

Examples

\n
mov.u32 r1,%clock;\nmov.u32 r2, %clock_hi;\n
\n
\n
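From CUDA C++, %clock is read via the clock() intrinsic; %clock_hi has no dedicated intrinsic, but a one-instruction inline-PTX mov (an assumed but common idiom) works:

__device__ unsigned read_clock_hi() {
    unsigned hi;
    asm volatile("mov.u32 %0, %%clock_hi;" : "=r"(hi));
    return hi;
}

__global__ void time_region(unsigned *elapsed) {
    unsigned start = (unsigned)clock();   // reads %clock
    // ... region being timed ...
    unsigned stop = (unsigned)clock();
    elapsed[threadIdx.x] = stop - start;  // wrap-safe for short regions
}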
", "tooltip": "%clockA predefined, read-only 32-bit unsigned cycle counter.\n\n%clock_hiThe upper 32-bits of %clock64 special register.\n\nSyntax (predefined)\n\n.sreg .u32 %clock;\n\n.sreg .u32 %clock_hi;\n\nDescription\n\nSpecial register %clock and %clock_hi are unsigned 32-bit read-only cycle counters that wrap\n\nsilently.\n\nPTX ISA Notes\n\n%clock introduced in PTX ISA version 1.0.\n\n%clock_hi introduced in PTX ISA version 5.0.\n\nTarget ISA Notes\n\n%clock supported on all target architectures.\n\n%clock_hi requires sm_20 or higher.\n\nExamples\n\nmov.u32 r1,%clock;\n\nmov.u32 r2, %clock_hi;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clock-clock-hi" }; case "clock64": return { "html": "For more information, visit clock64 .

Special Registers: %clock64

\n\n\n

A predefined, read-only 64-bit unsigned cycle counter.

\n

Syntax (predefined)

\n
.sreg .u64 %clock64;\n
\n
\n

Description

\n

Special register %clock64 is an unsigned 64-bit read-only cycle counter that wraps silently.

\n

Notes

\n

The lower 32-bits of %clock64 are identical to %clock.

\n

The upper 32-bits of %clock64 are identical to %clock_hi.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

%clock64 requires sm_20 or higher.

\n

Examples

\n
mov.u64  r1,%clock64;\n
\n
\n
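From CUDA C++, %clock64 is read via the clock64() intrinsic:

__global__ void time_region64(long long *elapsed) {
    long long start = clock64();              // reads %clock64
    // ... region being timed ...
    elapsed[threadIdx.x] = clock64() - start; // 64-bit counter, wraps rarely
}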
", "tooltip": "A predefined, read-only 64-bit unsigned cycle counter.\n\nSyntax (predefined)\n\n.sreg .u64 %clock64;\n\nDescription\n\nSpecial register %clock64 is an unsigned 64-bit read-only cycle counter that wraps silently.\n\nNotes\n\nThe lower 32-bits of %clock64 are identical to %clock.\n\nThe upper 32-bits of %clock64 are identical to %clock_hi.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%clock64 requires sm_20 or higher.\n\nExamples\n\nmov.u64 r1,%clock64;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clock64" }; case "clock_hi": return { "html": "For more information, visit clock_hi .

Special Registers: %clock, %clock_hi

\n\n\n
\n
%clock

A predefined, read-only 32-bit unsigned cycle counter.

\n
\n
%clock_hi

The upper 32-bits of %clock64 special register.

\n
\n
\n

Syntax (predefined)

\n
.sreg .u32 %clock;\n.sreg .u32 %clock_hi;\n
\n
\n

Description

\n

Special registers %clock and %clock_hi are unsigned 32-bit read-only cycle counters that wrap\nsilently.

\n

PTX ISA Notes

\n

%clock introduced in PTX ISA version 1.0.

\n

%clock_hi introduced in PTX ISA version 5.0.

\n

Target ISA Notes

\n

%clock supported on all target architectures.

\n

%clock_hi requires sm_20 or higher.

\n

Examples

\n
mov.u32 r1,%clock;\nmov.u32 r2, %clock_hi;\n
\n
\n
", "tooltip": "%clockA predefined, read-only 32-bit unsigned cycle counter.\n\n%clock_hiThe upper 32-bits of %clock64 special register.\n\nSyntax (predefined)\n\n.sreg .u32 %clock;\n\n.sreg .u32 %clock_hi;\n\nDescription\n\nSpecial register %clock and %clock_hi are unsigned 32-bit read-only cycle counters that wrap\n\nsilently.\n\nPTX ISA Notes\n\n%clock introduced in PTX ISA version 1.0.\n\n%clock_hi introduced in PTX ISA version 5.0.\n\nTarget ISA Notes\n\n%clock supported on all target architectures.\n\n%clock_hi requires sm_20 or higher.\n\nExamples\n\nmov.u32 r1,%clock;\n\nmov.u32 r2, %clock_hi;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clock-clock-hi" }; case "cluster_ctaid": return { "html": "For more information, visit cluster_ctaid .

Special Registers: %cluster_ctaid

\n\n\n

CTA identifier within a cluster.

\n

Syntax (predefined)

\n
.sreg .v4 .u32 %cluster_ctaid;\n.sreg .u32 %cluster_ctaid.x, %cluster_ctaid.y, %cluster_ctaid.z;\n
\n
\n

Description

\n

A predefined, read-only special register initialized with the CTA identifier in a cluster in each\ndimension. Each CTA in a cluster has a unique CTA identifier.

\n

The %cluster_ctaid special register contains a 1D, 2D, or 3D vector, depending upon the shape of\nthe cluster. The fourth element is unused and always returns zero.

\n

It is guaranteed that:

\n
0  <=  %cluster_ctaid.x <  %cluster_nctaid.x\n0  <=  %cluster_ctaid.y <  %cluster_nctaid.y\n0  <=  %cluster_ctaid.z <  %cluster_nctaid.z\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
.reg .b32 %r<2>;\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %cluster_ctaid.x;\nmov.u32     %r1, %cluster_ctaid.z;\nmov.v4.u32  %rx, %cluster_ctaid;\n
\n
\n
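A hedged CUDA C++ sketch, assuming sm_90, a cluster launch, and the CUDA 12 cooperative-groups cluster API, where block_index() corresponds to %cluster_ctaid, dim_blocks() to %cluster_nctaid, and block_rank() to %cluster_ctarank:

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void __cluster_dims__(4, 1, 1)
show_cluster_ids(uint3 *ids, uint3 *shape) {
    cg::cluster_group cluster = cg::this_cluster();
    dim3 cta = cluster.block_index();            // %cluster_ctaid.{x,y,z}
    if (threadIdx.x == 0) {
        ids[cluster.block_rank()] =              // %cluster_ctarank
            make_uint3(cta.x, cta.y, cta.z);
        if (cluster.block_rank() == 0) {
            dim3 db = cluster.dim_blocks();      // %cluster_nctaid.{x,y,z}
            *shape = make_uint3(db.x, db.y, db.z);
        }
    }
}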
", "tooltip": "CTA identifier within a cluster.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %cluster_ctaid;\n\n.sreg .u32 %cluster_ctaid.x, %cluster_ctaid.y, %cluster_ctaid.z;\n\nDescription\n\nA predefined, read-only special register initialized with the CTA identifier in a cluster in each\n\ndimension. Each CTA in a cluster has a unique CTA identifier.\n\nThe %cluster_ctaid special register contains a 1D, 2D, or 3D vector, depending upon the shape of\n\nthe cluster. The fourth element is unused and always returns zero.\n\nIt is guaranteed that:\n\n0 <= %cluster_ctaid.x < %cluster_nctaid.x\n\n0 <= %cluster_ctaid.y < %cluster_nctaid.y\n\n0 <= %cluster_ctaid.z < %cluster_nctaid.z\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r<2>;\n\n.reg .v4 .b32 %rx;\n\nmov.u32 %r0, %cluster_ctaid.x;\n\nmov.u32 %r1, %cluster_ctaid.z;\n\nmov.v4.u32 %rx, %cluster_ctaid;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctaid" }; case "cluster_ctarank": return { "html": "For more information, visit cluster_ctarank .

Special Registers: %cluster_ctarank

\n\n\n

CTA identifier in a cluster across all dimensions.

\n

Syntax (predefined)

\n
.sreg .u32 %cluster_ctarank;\n
\n
\n

Description

\n

A predefined, read-only special register initialized with the CTA rank within a cluster across all\ndimensions.

\n

It is guaranteed that:

\n
0  <=  %cluster_ctarank <  %cluster_nctarank\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
.reg .b32 %r;\n\nmov.u32  %r, %cluster_ctarank;\n
\n
\n
", "tooltip": "CTA identifier in a cluster across all dimensions.\n\nSyntax (predefined)\n\n.sreg .u32 %cluster_ctarank;\n\nDescription\n\nA predefined, read-only special register initialized with the CTA rank within a cluster across all\n\ndimensions.\n\nIt is guaranteed that:\n\n0 <= %cluster_ctarank < %cluster_nctarank\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r;\n\nmov.u32 %r, %cluster_ctarank;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctarank" }; case "cluster_nctaid": return { "html": "For more information, visit cluster_nctaid .

Special Registers: %cluster_nctaid

\n\n\n

Number of CTA identifiers per cluster.

\n

Syntax (predefined)

\n
.sreg .v4 .u32 %cluster_nctaid;\n.sreg .u32 %cluster_nctaid.x, %cluster_nctaid.y, %cluster_nctaid.z;\n
\n
\n

Description

\n

A predefined, read-only special register initialized with the number of CTAs in a cluster in each\ndimension.

\n

The %cluster_nctaid special register contains a 3D grid shape vector that holds the cluster\ndimensions in terms of CTAs. The fourth element is unused and always returns zero.

\n

Refer to the CUDA Programming Guide for details on the maximum values of\n%cluster_nctaid.{x,y,z}.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
.reg .b32 %r<2>;\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %cluster_nctaid.x;\nmov.u32     %r1, %cluster_nctaid.z;\nmov.v4.u32  %rx, %cluster_nctaid;\n
\n
\n
", "tooltip": "Number of CTA identifiers per cluster.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %cluster_nctaid;\n\n.sreg .u32 %cluster_nctaid.x, %cluster_nctaid.y, %cluster_nctaid.z;\n\nDescription\n\nA predefined, read-only special register initialized with the number of CTAs in a cluster in each\n\ndimension.\n\nThe %cluster_nctaid special register contains a 3D grid shape vector that holds the cluster\n\ndimensions in terms of CTAs. The fourth element is unused and always returns zero.\n\nRefer to the Cuda Programming Guide for details on the maximum values of\n\n%cluster_nctaid.{x,y,z}.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r<2>;\n\n.reg .v4 .b32 %rx;\n\nmov.u32 %r0, %cluster_nctaid.x;\n\nmov.u32 %r1, %cluster_nctaid.z;\n\nmov.v4.u32 %rx, %cluster_nctaid;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctaid" }; case "cluster_nctarank": return { "html": "For more information, visit cluster_nctarank .

Special Registers: %cluster_nctarank

\n\n\n

Number of CTA identifiers in a cluster across all dimensions.

\n

Syntax (predefined)

\n
.sreg .u32 %cluster_nctarank;\n
\n
\n

Description

\n

A predefined, read-only special register initialized with the number of CTAs within a cluster across\nall dimensions.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
.reg .b32 %r;\n\nmov.u32  %r, %cluster_nctarank;\n
\n
\n
", "tooltip": "Number of CTA identifiers in a cluster across all dimensions.\n\nSyntax (predefined)\n\n.sreg .u32 %cluster_nctarank;\n\nDescription\n\nA predefined, read-only special register initialized with the nunber of CTAs within a cluster across\n\nall dimensions.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r;\n\nmov.u32 %r, %cluster_nctarank;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctarank" }; case "clusterid": return { "html": "For more information, visit clusterid .

Special Registers: %clusterid

\n\n\n

Cluster identifier within a grid.

\n

Syntax (predefined)

\n
.sreg .v4 .u32 %clusterid;\n.sreg .u32 %clusterid.x, %clusterid.y, %clusterid.z;\n
\n
\n

Description

\n

A predefined, read-only special register initialized with the cluster identifier in a grid in each\ndimension. Each cluster in a grid has a unique identifier.

\n

The %clusterid special register contains a 1D, 2D, or 3D vector, depending upon the shape and\nrank of the cluster. The fourth element is unused and always returns zero.

\n

It is guaranteed that:

\n
0  <=  %clusterid.x <  %nclusterid.x\n0  <=  %clusterid.y <  %nclusterid.y\n0  <=  %clusterid.z <  %nclusterid.z\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
.reg .b32 %r<2>;\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %clusterid.x;\nmov.u32     %r1, %clusterid.z;\nmov.v4.u32  %rx, %clusterid;\n
\n
\n
", "tooltip": "Cluster identifier within a grid.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %clusterid;\n\n.sreg .u32 %clusterid.x, %clusterid.y, %clusterid.z;\n\nDescription\n\nA predefined, read-only special register initialized with the cluster identifier in a grid in each\n\ndimension. Each cluster in a grid has a unique identifier.\n\nThe %clusterid special register contains a 1D, 2D, or 3D vector, depending upon the shape and\n\nrank of the cluster. The fourth element is unused and always returns zero.\n\nIt is guaranteed that:\n\n0 <= %clusterid.x < %nclusterid.x\n\n0 <= %clusterid.y < %nclusterid.y\n\n0 <= %clusterid.z < %nclusterid.z\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r<2>;\n\n.reg .v4 .b32 %rx;\n\nmov.u32 %r0, %clusterid.x;\n\nmov.u32 %r1, %clusterid.z;\n\nmov.v4.u32 %rx, %clusterid;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clusterid" }; case "clz": return { "html": "For more information, visit clz(int) .

Integer Arithmetic Instructions: clz

\n\n\n

Count leading zeros.

\n

Syntax

\n
clz.type  d, a;\n\n.type = { .b32, .b64 };\n
\n
\n

Description

\n

Count the number of leading zeros in a starting with the most-significant bit and place the\nresult in 32-bit destination register d.\u00a0Operand a has the instruction type, and destination\nd has type .u32. For .b32 type, the number of leading zeros is between 0 and 32,\ninclusive. For .b64 type, the number of leading zeros is between 0 and 64, inclusive.

\n

Semantics

\n
.u32  d = 0;\nif (.type == .b32)   { max = 32; mask = 0x80000000; }\nelse                 { max = 64; mask = 0x8000000000000000; }\n\nwhile (d < max && ((a & mask) == 0)) {\n    d++;\n    a = a << 1;\n}\n
\n
\n
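
A brief worked illustration of the inclusive result range (the input values are illustrative):

\n
// a = 0x00010000 -> d = 15   (bits 31..17 are zero)\n// a = 0x00000000 -> d = 32   (the inclusive upper bound for .b32)\nclz.b32  d, a;\n
\n
\n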

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

clz requires sm_20 or higher.

\n

Examples

\n
clz.b32  d, a;\nclz.b64  cnt, X;  // cnt is .u32\n
\n
\n
", "tooltip": "Count leading zeros.\n\nSyntax\n\nclz.type d, a;\n\n.type = { .b32, .b64 };\n\nDescription\n\nCount the number of leading zeros in a starting with the most-significant bit and place the\n\nresult in 32-bit destination register d.\u00a0Operand a has the instruction type, and destination\n\nd has type .u32. For .b32 type, the number of leading zeros is between 0 and 32,\n\ninclusively. For.b64 type, the number of leading zeros is between 0 and 64, inclusively.\n\nSemantics\n\n.u32 d = 0;\n\nif (.type == .b32) { max = 32; mask = 0x80000000; }\n\nelse { max = 64; mask = 0x8000000000000000; }\n\nwhile (d < max && (a&mask == 0) ) {\n\n d++;\n\n a = a << 1;\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nclz requires sm_20 or higher.\n\nExamples\n\nclz.b32 d, a;\n\nclz.b64 cnt, X; // cnt is .u32\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz" }; case "cnot": return { "html": "For more information, visit cnot .

Logic and Shift Instructions: cnot

\n\n\n

C/C++ style logical negation.

\n

Syntax

\n
cnot.type d, a;\n\n.type = { .b16, .b32, .b64 };\n
\n
\n

Description

\n

Compute the logical negation using C/C++ semantics.

\n

Semantics

\n
d = (a==0) ? 1 : 0;\n
\n
\n

Notes

\n

The size of the operands must match, but not necessarily the type.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
cnot.b32 d,a;\n
\n
\n
", "tooltip": "C/C++ style logical negation.\n\nSyntax\n\ncnot.type d, a;\n\n.type = { .b16, .b32, .b64 };\n\nDescription\n\nCompute the logical negation using C/C++ semantics.\n\nSemantics\n\nd = (a==0) ? 1 : 0;\n\nNotes\n\nThe size of the operands must match, but not necessarily the type.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\ncnot.b32 d,a;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-cnot" }; case "common": return { "html": "For more information, visit common .

Linking Directives: .common

\n\n\n

Visible (externally) symbol declaration.

\n

Syntax

\n
.common identifier\n
\n
\n

Description

\n

Declares identifier to be globally visible but \u201ccommon\u201d.

\n

Common symbols are similar to globally visible symbols. However, multiple object files may declare\nthe same common symbol, possibly with different types and sizes; references to the symbol get\nresolved against a common symbol with the largest size.

\n

Only one object file can initialize a common symbol, and that definition must have the largest size\namong all definitions of that common symbol from different object files.

\n

The .common linking directive can be used only on variables with .global storage. It cannot be\nused on function symbols or on symbols with opaque type.

\n
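
As a sketch of the linking behavior described above (module names are hypothetical), two object files may declare the same common symbol with different types and sizes; references resolve against the largest definition.

\n
// module_a.ptx\n.common .global .u32 gbl;               // 4-byte declaration\n\n// module_b.ptx -- largest definition; references to gbl resolve here\n.common .global .align 4 .b8 gbl[8];\n
\n
\n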

PTX ISA Notes

\n

Introduced in PTX ISA version 5.0.

\n

Target ISA Notes

\n

.common directive requires sm_20 or higher.

\n

Examples

\n
.common .global .u32 gbl;\n
\n
\n
", "tooltip": "Visible (externally) symbol declaration.\n\nSyntax\n\n.common identifier\n\nDescription\n\nDeclares identifier to be globally visible but \u201ccommon\u201d.\n\nCommon symbols are similar to globally visible symbols. However multiple object files may declare\n\nthe same common symbol and they may have different types and sizes and references to a symbol get\n\nresolved against a common symbol with the largest size.\n\nOnly one object file can initialize a common symbol and that must have the largest size among all\n\nother definitions of that common symbol from different object files.\n\n.common linking directive can be used only on variables with .global storage. It cannot be\n\nused on function symbols or on symbols with opaque type.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 5.0.\n\nTarget ISA Notes\n\n.common directive requires sm_20 or higher.\n\nExamples\n\n.common .global .u32 gbl;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#linking-directives-common" }; case "copysign": return { "html": "For more information, visit copysign(fp) .

Floating Point Instructions: copysign

\n\n\n

Copy sign of one input to another.

\n

Syntax

\n
copysign.type  d, a, b;\n\n.type = { .f32, .f64 };\n
\n
\n

Description

\n

Copy the sign bit of a into the value of b, and return the result as d.

\n
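
For instance (the values are illustrative, not from the specification):

\n
// a = -5.0, b = +3.0  =>  d = -3.0   (sign of a, magnitude of b)\ncopysign.f32  d, a, b;\n
\n
\n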

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

Requires sm_20 or higher.

\n

Examples

\n
copysign.f32  x, y, z;\ncopysign.f64  A, B, C;\n
\n
\n
", "tooltip": "Copy sign of one input to another.\n\nSyntax\n\ncopysign.type d, a, b;\n\n.type = { .f32, .f64 };\n\nDescription\n\nCopy sign bit of a into value of b, and return the result as d.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\nExamples\n\ncopysign.f32 x, y, z;\n\ncopysign.f64 A, B, C;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign" }; case "cos": return { "html": "For more information, visit cos(fp) .

Floating Point Instructions: cos

\n\n\n

Find the cosine of a value.

\n

Syntax

\n
cos.approx{.ftz}.f32  d, a;\n
\n
\n

Description

\n

Find the cosine of the angle a (in radians).

\n

Semantics

\n
d = cos(a);\n
\n
\n

Notes

\n

cos.approx.f32 implements a fast approximation to cosine.

\n

Input      | Result
-----------|-------
-Inf       | NaN
-subnormal | +1.0
-0.0       | +1.0
+0.0       | +1.0
+subnormal | +1.0
+Inf       | NaN
NaN        | NaN

\n

The maximum absolute error is 2^-20.9 in quadrant 00.

\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

cos.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
sm_1x

Subnormal inputs and results are flushed to sign-preserving zero.

\n
\n
\n

PTX ISA Notes

\n

cos.f32 introduced in PTX ISA version 1.0. Explicit modifiers .approx and .ftz\nintroduced in PTX ISA version 1.4.

\n

For PTX ISA version 1.4 and later, the .approx modifier is required.

\n

For PTX ISA versions 1.0 through 1.3, cos.f32 defaults to cos.approx.ftz.f32.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
cos.approx.ftz.f32  ca, a;\n
\n
\n
", "tooltip": "Find the cosine of a value.\n\nSyntax\n\ncos.approx{.ftz}.f32 d, a;\n\nDescription\n\nFind the cosine of the angle a (in radians).\n\nSemantics\n\nd = cos(a);\n\nNotes\n\ncos.approx.f32 implements a fast approximation to cosine.\n\n\n\nInput\n\nResult\n\n-Inf\n\nNaN\n\n-subnormal\n\n+1.0\n\n-0.0\n\n+1.0\n\n+0.0\n\n+1.0\n\n+subnormal\n\n+1.0\n\n+Inf\n\nNaN\n\nNaN\n\nNaN\n\nThe maximum absolute error is 2-20.9 in quadrant 00.\n\nSubnormal numbers:\n\nsm_20+By default, subnormal numbers are supported.\n\ncos.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nsm_1xSubnormal inputs and results to sign-preserving zero.\n\nPTX ISA Notes\n\ncos.f32 introduced in PTX ISA version 1.0. Explicit modifiers .approx and .ftz\n\nintroduced in PTX ISA version 1.4.\n\nFor PTX ISA version 1.4 and later, the .approx modifier is required.\n\nFor PTX ISA versions 1.0 through 1.3, cos.f32 defaults to cos.approx.ftz.f32.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\ncos.approx.ftz.f32 ca, a;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos" }; case "cp": return { "html": "For more information, visit cp.async , cp.async.bulk , cp.async.bulk.commit_group , cp.async.bulk.prefetch , cp.async.bulk.prefetch.tensor , cp.async.bulk.tensor , cp.async.bulk.wait_group , cp.async.commit_group , cp.async.mbarrier.arrive , cp.async.wait_all , cp.async.wait_group , cp.reduce.async.bulk , cp.reduce.async.bulk.tensor .

Data Movement and Conversion Instructions: cp.async

\n\n\n

Initiates an asynchronous copy operation from one state space to another.

\n

Syntax

\n
cp.async.ca.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n                         [dst], [src], cp-size{, src-size}{, cache-policy} ;\ncp.async.cg.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n                         [dst], [src], 16{, src-size}{, cache-policy} ;\ncp.async.ca.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n                         [dst], [src], cp-size{, ignore-src}{, cache-policy} ;\ncp.async.cg.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n                         [dst], [src], 16{, ignore-src}{, cache-policy} ;\n\n.level::cache_hint =     { .L2::cache_hint }\n.level::prefetch_size =  { .L2::64B, .L2::128B, .L2::256B }\ncp-size =                { 4, 8, 16 }\n
\n
\n

Description

\n

cp.async is a non-blocking instruction which initiates an asynchronous copy operation of data\nfrom the location specified by source address operand src to the location specified by\ndestination address operand dst. Operand src specifies a location in the global state space\nand dst specifies a location in the shared state space.

\n

Operand cp-size is an integer constant which specifies the size of data in bytes to be copied to\nthe destination dst. cp-size can only be 4, 8, or 16.

\n

Instruction cp.async allows optionally specifying a 32-bit integer operand src-size. Operand\nsrc-size represents the size of the data in bytes to be copied from src to dst and must\nbe less than cp-size. In that case, the remaining bytes in destination dst are filled with\nzeros. Specifying src-size larger than cp-size results in undefined behavior.

\n

The optional and non-immediate predicate argument ignore-src specifies whether the data from the\nsource location src should be ignored completely. If the source data is ignored then zeros will\nbe copied to destination dst. If the argument ignore-src is not specified then it defaults\nto False.

\n

Supported alignment requirements and addressing modes for operand src and dst are described\nin Addresses as Operands.

\n

The mandatory .async qualifier indicates that the cp instruction will initiate the memory\ncopy operation asynchronously and control will return to the executing thread before the copy\noperation is complete. The executing thread can then use cp.async.wait_all or\ncp.async.wait_group or mbarrier instructions to wait for\ncompletion of the asynchronous copy operation. No other synchronization mechanisms described in\nMemory Consistency Model can be used to guarantee the\ncompletion of the asynchronous copy operations.

\n
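
A minimal sketch of that issue/commit/wait sequence (addresses and register names are illustrative):

\n
cp.async.ca.shared.global [shrd],      [gbl],      4;\ncp.async.cg.shared.global [shrd + 16], [gbl + 16], 16;\ncp.async.commit_group;       // batch the two copies into one cp.async-group\ncp.async.wait_group 0;       // wait until that group has completed\nbar.sync 0;                  // make the copied data visible across the CTA\nld.shared.u32 %r0, [shrd];   // now safe to read\n
\n
\n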

There is no ordering guarantee between two cp.async operations if they are not explicitly\nsynchronized using cp.async.wait_all or cp.async.wait_group or mbarrier instructions.

\n

As described in Cache Operators, the .cg qualifier indicates\ncaching of data only at global level cache L2 and not at L1, whereas the .ca qualifier indicates\ncaching of data at all levels, including L1 cache. Cache operators are treated as performance hints\nonly.

\n

cp.async is treated as a weak memory operation in the Memory Consistency Model.

\n

The .level::prefetch_size qualifier is a hint to fetch additional data of the specified size\ninto the respective cache level. The sub-qualifier prefetch_size can be set to 64B, 128B,\nor 256B, thereby allowing the prefetch size to be 64 bytes, 128 bytes, or 256 bytes\nrespectively.

\n

The qualifier .level::prefetch_size may only be used with .global state space and with\ngeneric addressing where the address points to .global state space. If the generic address does\nnot fall within the address window of the global memory, then the prefetching behavior is undefined.

\n

The .level::prefetch_size qualifier is treated as a performance hint only.

\n

When the optional argument cache-policy is specified, the qualifier .level::cache_hint is\nrequired. The 64-bit operand cache-policy specifies the cache eviction policy that may be used\nduring the memory access.

\n

The qualifier .level::cache_hint is only supported for .global state space and for generic\naddressing where the address points to the .global state space.

\n

cache-policy is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

Support for .level::cache_hint and .level::prefetch_size qualifiers introduced in PTX ISA\nversion 7.4.

\n

Support for ignore-src operand introduced in PTX ISA version 7.5.

\n

Support for sub-qualifier ::cta introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Sub-qualifier ::cta requires sm_30 or higher.

\n

Examples

\n
cp.async.ca.shared.global  [shrd],    [gbl + 4], 4;\ncp.async.ca.shared::cta.global  [%r0 + 8], [%r1],     8;\ncp.async.cg.shared.global  [%r2],     [%r3],     16;\n\ncp.async.cg.shared.global.L2::64B   [%r2],      [%r3],     16;\ncp.async.cg.shared.global.L2::128B  [%r0 + 16], [%r1],      8;\ncp.async.cg.shared.global.L2::256B  [%r2 + 32], [%r3],     16;\n\ncreatepolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 cache-policy, 0.25;\ncp.async.ca.shared.global.L2::cache_hint [%r2], [%r1], 4, cache-policy;\n\ncp.async.ca.shared.global                   [shrd], [gbl], 4, p;\ncp.async.cg.shared.global.L2::cache_hint    [%r0], [%r2], 16, q, cache-policy;\n
\n
\n
\n

Data Movement and Conversion Instructions: cp.async.bulk

\n\n\n

Initiates an asynchronous copy operation from one state space to another.

\n

Syntax

\n
cp.async.bulk.dst.src.completion_mechanism{.multicast}{.level::cache_hint}\n                      [dstMem], [srcMem], size, [mbar] {, ctaMask} {, cache-policy}\n\n.dst =                  { .shared::cluster }\n.src =                  { .global }\n.completion_mechanism = { .mbarrier::complete_tx::bytes }\n.level::cache_hint =    { .L2::cache_hint }\n.multicast =            { .multicast::cluster  }\n\n\ncp.async.bulk.dst.src.completion_mechanism [dstMem], [srcMem], size, [mbar]\n\n.dst =                  { .shared::cluster }\n.src =                  { .shared::cta }\n.completion_mechanism = { .mbarrier::complete_tx::bytes }\n\n\ncp.async.bulk.dst.src.completion_mechanism{.level::cache_hint} [dstMem], [srcMem], size{, cache-policy}\n\n.dst =                  { .global }\n.src =                  { .shared::cta }\n.completion_mechanism = { .bulk_group }\n.level::cache_hint =    { .L2::cache_hint }\n
\n
\n

Description

\n

cp.async.bulk is a non-blocking instruction which initiates an asynchronous bulk-copy operation\nfrom the location specified by source address operand srcMem to the location specified by\ndestination address operand dstMem.

\n

The direction of bulk-copy is from the state space specified by the .src modifier to the state\nspace specified by the .dst modifiers.

\n

The 32-bit operand size specifies the amount of memory to be copied, in terms of number of\nbytes. size must be a multiple of 16. If the value is not a multiple of 16, then the behavior is\nundefined. The memory range [dstMem, dstMem + size - 1] must not overflow the destination memory\nspace and the memory range [srcMem, srcMem + size - 1] must not overflow the source memory\nspace. Otherwise, the behavior is undefined. The addresses dstMem and srcMem must be aligned\nto 16 bytes.

\n

When the source of the copy is .shared::cta and the destination is .shared::cluster, the\ndestination has to be in the shared memory of a different CTA within the cluster.

\n

The modifier .completion_mechanism specifies the completion mechanism that is supported on the\ninstruction variant. The completion mechanisms that are supported for different variants are\nsummarized in the following table:

\n

Completion mechanism          | .dst             | .src         | Description
------------------------------|------------------|--------------|--------------------------------------------
.mbarrier::complete_tx::bytes | .shared::cluster | .global      | mbarrier based completion mechanism
.mbarrier::complete_tx::bytes | .shared::cluster | .shared::cta | mbarrier based completion mechanism
.bulk_group                   | .global          | .shared::cta | Bulk async-group based completion mechanism

\n

The modifier .mbarrier::complete_tx::bytes specifies that the cp.async.bulk variant uses\nmbarrier based completion mechanism. The complete-tx\noperation, with completeCount argument equal to amount of data copied in bytes, will be\nperformed on the mbarrier object specified by the operand mbar.

\n
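
A minimal sketch of this mbarrier-based completion flow for a single producer thread (names such as mbar, dstMem and srcMem are illustrative):

\n
mbarrier.init.shared::cta.b64 [mbar], 1;\n// announce the expected byte count, then issue the bulk copy\nmbarrier.arrive.expect_tx.shared::cta.b64 state, [mbar], size;\ncp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [mbar];\n\nwaitLoop:\nmbarrier.try_wait.parity.shared::cta.b64 p, [mbar], 0;\n@!p bra waitLoop;\n
\n
\n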

The modifier .bulk_group specifies that the cp.async.bulk variant uses bulk async-group\nbased completion mechanism.

\n

The optional modifier .multicast::cluster allows copying of data from global memory to shared\nmemory of multiple CTAs in the cluster. Operand ctaMask specifies the destination CTAs in the\ncluster such that each bit position in the 16-bit ctaMask operand corresponds to the %ctaid\nof the destination CTA. The source data is multicast to the same CTA-relative offset as dstMem\nin the shared memory of each destination CTA. The mbarrier signal is also multicast to the same\nCTA-relative offset as mbar in the shared memory of the destination CTA.

\n

When the optional argument cache-policy is specified, the qualifier .level::cache_hint is\nrequired. The 64-bit operand cache-policy specifies the cache eviction policy that may be used\nduring the memory access.

\n

cache-policy is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program. The\nqualifier .level::cache_hint is only supported when at least one of the .src or .dst\nstatespaces is .global state space.

\n

The copy operation in cp.async.bulk is treated as a weak memory operation and the complete-tx\noperation on the mbarrier has .release semantics at the .cluster scope as described in the\nMemory Consistency Model.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
// .global -> .shared::cluster:\ncp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [mbar];\n\ncp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster\n                                             [dstMem], [srcMem], size, [mbar], ctaMask;\n\ncp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint\n                                             [dstMem], [srcMem], size, [mbar], cache-policy;\n\n\n// .shared::cta -> .shared::cluster (strictly remote):\ncp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [mbar];\n\n// .shared::cta -> .global:\ncp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size;\n\ncp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [dstMem], [srcMem], size, cache-policy;\n
\n
\n
\n

Data Movement and Conversion Instructions: cp.async.bulk.commit_group

\n\n\n

Commits all prior initiated but uncommitted cp.async.bulk instructions into a\ncp.async.bulk-group.

\n

Syntax

\n
cp.async.bulk.commit_group;\n
\n
\n

Description

\n

cp.async.bulk.commit_group instruction creates a new per-thread bulk async-group and batches\nall prior cp{.reduce}.async.bulk.{.prefetch}{.tensor} instructions satisfying the following\nconditions into the new bulk async-group:

\n
    \n
  • The prior cp{.reduce}.async.bulk.{.prefetch}{.tensor} instructions use bulk_group based\ncompletion mechanism, and

  • \n
  • They are initiated by the executing thread but not committed to any bulk async-group.

  • \n
\n

If there are no uncommitted cp{.reduce}.async.bulk.{.prefetch}{.tensor} instructions then\ncp.async.bulk.commit_group results in an empty bulk async-group.

\n

An executing thread can wait for the completion of all\ncp{.reduce}.async.bulk.{.prefetch}{.tensor} operations in a bulk async-group using\ncp.async.wait_group.

\n

There is no memory ordering guarantee provided between any two\ncp{.reduce}.async.bulk.{.prefetch}{.tensor} operations within the same bulk async-group.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
cp.async.bulk.commit_group;\n
\n
\n
\n

Data Movement and Conversion Instructions: cp.async.bulk.prefetch

\n\n\n

Provides a hint to the system to initiate the asynchronous prefetch of data to the cache.

\n

Syntax

\n
cp.async.bulk.prefetch.L2.src{.level::cache_hint}   [srcMem], size {, cache-policy}\n\n.src =                { .global }\n.level::cache_hint =  { .L2::cache_hint }\n
\n
\n

Description

\n

cp.async.bulk.prefetch is a non-blocking instruction which may initiate an asynchronous prefetch\nof data from the location specified by source address operand srcMem, in .src statespace, to\nthe L2 cache.

\n
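
As a hedged illustration, a program might warm the L2 cache ahead of a later bulk copy of the same region (operand names are illustrative):

\n
cp.async.bulk.prefetch.L2.global [srcMem], 128;\n\n// ... independent work while the prefetch proceeds ...\n\ncp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], 128, [mbar];\n
\n
\n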

The 32-bit operand size specifies the amount of memory to be prefetched in terms of number of\nbytes. size must be a multiple of 16. If the value is not a multiple of 16, then the behavior is\nundefined. The memory range [srcMem, srcMem + size - 1] must not overflow the source memory\nspace. Otherwise, the behavior is undefined. The address srcMem must be aligned to 16 bytes.

\n

When the optional argument cache-policy is specified, the qualifier .level::cache_hint is\nrequired. The 64-bit operand cache-policy specifies the cache eviction policy that may be used\nduring the memory access.

\n

cache-policy is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
cp.async.bulk.prefetch.L2.global                 [srcMem], size;\n\ncp.async.bulk.prefetch.L2.global.L2::cache_hint  [srcMem], size, policy;\n
\n
\n
\n

Data Movement and Conversion Instructions: cp.async.bulk.prefetch.tensor

\n\n\n

Provides a hint to the system to initiate the asynchronous prefetch of tensor data to the cache.

\n

Syntax

\n
cp.async.bulk.prefetch.tensor.dim.L2.src{.load_mode}{.level::cache_hint} [tensorMap, tensorCoords]\n                                                             {, im2colOffsets } {, cache-policy}\n\n.src =                { .global }\n.dim =                { .1d, .2d, .3d, .4d, .5d }\n.load_mode =          { .tile, .im2col }\n.level::cache_hint =  { .L2::cache_hint }\n
\n
\n

Description

\n

cp.async.bulk.prefetch.tensor is a non-blocking instruction which may initiate an asynchronous\nprefetch of tensor data from the location in .src statespace to the L2 cache.

\n

The operand tensorMap is the generic address of the opaque tensor-map object which resides\neither in .param space or .const space. The operand tensorMap specifies the properties\nof the tensor copy operation, as described in Tensor-map. Refer to\nthe CUDA programming guide for creating the tensor-map objects on the host side.

\n

The dimension of the tensor data is specified by the .dim modifier.

\n

The vector operand tensorCoords specifies the starting coordinates in the tensor data in the\nglobal memory from or to which the copy operation has to be performed. The number of tensor\ncoordinates in the vector argument tensorCoords should be equal to the dimension specified by\nthe modifier .dim. The individual tensor coordinates in tensorCoords are of type .s32.

\n

The qualifier .load_mode specifies how the data in the source location is copied into the\ndestination location. If .load_mode is not specified, it defaults to .tile. In .tile\nmode, the multi-dimensional layout of the source tensor is preserved at the destination. In\n.im2col mode, some dimensions of the source tensors are unrolled into a single dimensional column\nat the destination. Details of the im2col mode are described in Im2col mode. In .im2col mode, the tensor has to be at least\n3-dimensional. The vector operand im2colOffsets can be specified only when .load_mode is\n.im2col. The length of the vector operand im2colOffsets is two less than the dimension\n.dim of the tensor operation.

\n

When the optional argument cache-policy is specified, the qualifier .level::cache_hint is\nrequired. The 64-bit operand cache-policy specifies the cache eviction policy that may be used\nduring the memory access.

\n

cache-policy is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.

\n

cp.async.bulk.prefetch.tensor is treated as a weak memory operation in the Memory Consistency\nModel.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
.reg .b16 ctaMask;\n.reg .u16 i2cOffW, i2cOffH, i2cOffD;\n.reg .b64 l2CachePolicy;\n\ncp.async.bulk.prefetch.tensor.1d.L2.global.tile  [tensorMap0, {tc0}];\n\n@p cp.async.bulk.prefetch.tensor.2d.L2.global    [tensorMap1, {tc0, tc1}];\n\n@p cp.async.bulk.prefetch.tensor.5d.L2.global.im2col\n                      [tensorMap2, {tc0, tc1, tc2, tc3, tc4}], {i2cOffW, i2cOffH, i2cOffD};\n\n@p cp.async.bulk.prefetch.tensor.3d.L2.global.im2col.L2::cache_hint\n                      [tensorMap3, {tc0, tc1, tc2}], {i2cOffW}, policy;\n
\n
\n
\n

Data Movement and Conversion Instructions: cp.async.bulk.tensor

\n\n\n

Initiates an asynchronous copy operation on the tensor data from one state space to another.

\n

Syntax

\n
// global -> shared::cluster:\ncp.async.bulk.tensor.dim.dst.src{.load_mode}.completion_mechanism{.multicast}{.level::cache_hint}\n                                   [dstMem], [tensorMap, tensorCoords], [mbar]{, im2colOffsets}\n                                   {, ctaMask} {, cache-policy}\n\n.dst =                  { .shared::cluster }\n.src =                  { .global }\n.dim =                  { .1d, .2d, .3d, .4d, .5d }\n.completion_mechanism = { .mbarrier::complete_tx::bytes }\n.load_mode =            { .tile, .im2col }\n.level::cache_hint =    { .L2::cache_hint }\n.multicast =            { .multicast::cluster  }\n\n\n// shared::cta -> global:\ncp.async.bulk.tensor.dim.dst.src{.load_mode}.completion_mechanism{.level::cache_hint}\n                                   [tensorMap, tensorCoords], [srcMem] {, cache-policy}\n\n.dst =                  { .global }\n.src =                  { .shared::cta }\n.dim =                  { .1d, .2d, .3d, .4d, .5d }\n.completion_mechanism = { .bulk_group }\n.load_mode =            { .tile, .im2col_no_offs }\n.level::cache_hint =    { .L2::cache_hint }\n
\n
\n

Description

\n

cp.async.bulk.tensor is a non-blocking instruction which initiates an asynchronous copy\noperation of tensor data from the location in .src state space to the location in the .dst\nstate space.

\n

The operand dstMem specifies the location in the .dst state space into which the tensor data\nhas to be copied and srcMem specifies the location in the .src state space from which the\ntensor data has to be copied.

\n

The operand tensorMap is the generic address of the opaque tensor-map object which resides\neither in .param space or .const space. The operand tensorMap specifies the properties\nof the tensor copy operation, as described in Tensor-map. Refer to\nthe CUDA programming guide for creating the tensor-map objects on the host side.

\n

The dimension of the tensor data is specified by the .dim modifier.

\n

The vector operand tensorCoords specifies the starting coordinates in the tensor data in the\nglobal memory from or to which the copy operation has to be performed. The number of tensor\ncoordinates in the vector argument tensorCoords should be equal to the dimension specified by\nthe modifier .dim. The individual tensor coordinates in tensorCoords are of type .s32.

\n

The modifier .completion_mechanism specifies the completion mechanism that is supported on the\ninstruction variant. The completion mechanisms that are supported for different variants are\nsummarized in the following table:

\n

Completion mechanism          | .dst             | .src         | Description
------------------------------|------------------|--------------|--------------------------------------------
.mbarrier::complete_tx::bytes | .shared::cluster | .global      | mbarrier based completion mechanism
.mbarrier::complete_tx::bytes | .shared::cluster | .shared::cta | mbarrier based completion mechanism
.bulk_group                   | .global          | .shared::cta | Bulk async-group based completion mechanism

\n

The modifier .mbarrier::complete_tx::bytes specifies that the cp.async.bulk.tensor variant\nuses mbarrier based completion mechanism. The complete-tx\noperation, with completeCount argument equal to amount of data copied in bytes, will be\nperformed on the mbarrier object specified by the operand mbar.

\n

The modifier .bulk_group specifies that the cp.async.bulk.tensor variant uses bulk\nasync-group based completion mechanism.

\n

The qualifier .load_mode specifies how the data in the source location is copied into the\ndestination location. If .load_mode is not specified, it defaults to .tile. In .tile\nmode, the multi-dimensional layout of the source tensor is preserved at the destination. In\n.im2col mode, some dimensions of the source tensors are unrolled into a single dimensional column\nat the destination. Details of the im2col mode are described in Im2col mode. In .im2col mode, the tensor has to be at least\n3-dimensional. The vector operand im2colOffsets can be specified only when .load_mode is\n.im2col. The length of the vector operand im2colOffsets is two less than the dimension\n.dim of the tensor operation. The modifier .im2col_no_offs is the same as .im2col mode\nexcept there is no im2colOffsets vector involved.

\n

The optional modifier .multicast::cluster allows copying of data from global memory to shared\nmemory of multiple CTAs in the cluster. Operand ctaMask specifies the destination CTAs in the\ncluster such that each bit position in the 16-bit ctaMask operand corresponds to the %ctaid\nof the destination CTA. The source data is multicast to the same offset as dstMem in the shared\nmemory of each destination CTA. The mbarrier signal is also multicast to the same offset as mbar\nin the shared memory of the destination CTA.

\n

When the optional argument cache-policy is specified, the qualifier .level::cache_hint is\nrequired. The 64-bit operand cache-policy specifies the cache eviction policy that may be used\nduring the memory access.

\n

cache-policy is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program. The\nqualifier .level::cache_hint is only supported when at least one of the .src or .dst\nstatespaces is .global state space.

\n

The copy operation in cp.async.bulk.tensor is treated as a weak memory operation and the\ncomplete-tx\noperation on the mbarrier has .release semantics at the .cluster scope as described in the\nMemory Consistency Model.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
.reg .b16 ctaMask;\n.reg .u16 i2cOffW, i2cOffH, i2cOffD;\n.reg .b64 l2CachePolicy;\n\ncp.async.bulk.tensor.1d.shared::cluster.global.tile  [sMem0], [tensorMap0, {tc0}], [mbar0];\n\n@p cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster\n                     [sMem1], [tensorMap1, {tc0, tc1}], [mbar2], ctaMask;\n\n@p cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes\n                     [sMem2], [tensorMap2, {tc0, tc1, tc2, tc3, tc4}], [mbar2], {i2cOffW, i2cOffH, i2cOffD};\n\n@p cp.async.bulk.tensor.3d.im2col.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint\n                     [sMem3], [tensorMap3, {tc0, tc1, tc2}], [mbar3], {i2cOffW}, policy;\n\n@p cp.async.bulk.tensor.1d.global.shared::cta.bulk_group  [tensorMap3, {tc0}], [sMem3];\n
\n
\n
\n

Data Movement and Conversion Instructions: cp.async.bulk.wait_group

\n\n\n

Wait for completion of bulk async-groups.

\n

Syntax

\n
cp.async.bulk.wait_group{.read} N;\n
\n
\n

Description

\n

cp.async.bulk.wait_group instruction will cause the executing thread to wait until only N or\nfewer of the most recent bulk async-groups are pending and all the prior bulk async-groups\ncommitted by the executing threads are complete. For example, when N is 0, the executing thread\nwaits on all the prior bulk async-groups to complete. Operand N is an integer constant.

\n

By default, the cp.async.bulk.wait_group instruction will cause the executing thread to wait\nuntil all the bulk async operations in the specified bulk async-group have completed all of the\nfollowing:

\n
    \n
  • Reading from the source locations.

  • \n
  • Writing to their respective destination locations.

  • \n
  • Writes being made visible to the executing thread.

  • \n
\n

The optional .read modifier indicates that the waiting has to be done until all the bulk async\noperations in the specified bulk async-group have completed reading from their source locations.

\n
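
For example, a hedged sketch of reusing a shared-memory staging buffer as soon as its reads are done, even though the corresponding global writes may still be in flight (buffer names are illustrative):

\n
cp.async.bulk.global.shared::cta.bulk_group [gblA], [stageA], size;\ncp.async.bulk.commit_group;        // bulk async-group 0\ncp.async.bulk.global.shared::cta.bulk_group [gblB], [stageB], size;\ncp.async.bulk.commit_group;        // bulk async-group 1\n\n// After this wait, at most 1 group is pending and group 0 has finished\n// *reading* [stageA]; the buffer may be overwritten even if the write to\n// [gblA] has not completed yet.\ncp.async.bulk.wait_group.read 1;\nst.shared.u32 [stageA], %r0;\n
\n
\n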

PTX ISA Notes

\n

Introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
cp.async.bulk.wait_group.read   0;\ncp.async.bulk.wait_group        2;\n
\n
\n
\n

Data Movement and Conversion Instructions: cp.async.commit_group

\n\n\n

Commits all prior initiated but uncommitted cp.async instructions into a cp.async-group.

\n

Syntax

\n
cp.async.commit_group ;\n
\n
\n

Description

\n

cp.async.commit_group instruction creates a new cp.async-group per thread and batches all\nprior cp.async instructions initiated by the executing thread but not committed to any\ncp.async-group into the new cp.async-group. If there are no uncommitted cp.async\ninstructions then cp.async.commit_group results in an empty cp.async-group.

\n

An executing thread can wait for the completion of all cp.async operations in a cp.async-group\nusing cp.async.wait_group.

\n

There is no memory ordering guarantee provided between any two cp.async operations within the\nsame cp.async-group. So two or more cp.async operations within a cp.async-group copying data\nto the same location result in undefined behavior.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Examples

\n
// Example 1:\ncp.async.ca.shared.global [shrd], [gbl], 4;\ncp.async.commit_group ; // Marks the end of a cp.async group\n\n// Example 2:\ncp.async.ca.shared.global [shrd1],   [gbl1],   8;\ncp.async.cg.shared.global [shrd1+8], [gbl1+8], 8;\ncp.async.commit_group ; // Marks the end of cp.async group 1\n\ncp.async.ca.shared.global [shrd2],    [gbl2],    16;\ncp.async.cg.shared.global [shrd2+16], [gbl2+16], 16;\ncp.async.commit_group ; // Marks the end of cp.async group 2\n
\n
\n
\n

Parallel Synchronization and Communication Instructions: cp.async.mbarrier.arrive

\n\n\n

Makes the mbarrier object track all prior cp.async operations initiated by the\nexecuting thread.

\n

Syntax

\n
cp.async.mbarrier.arrive{.noinc}{.shared{::cta}}.b64 [addr];\n
\n
\n

Description

\n

Causes an arrive-on operation to be\ntriggered by the system on the mbarrier object upon the completion of all prior cp.async operations initiated by the\nexecuting thread. The mbarrier object is at the location specified by the operand addr. The\narrive-on operation is\nasynchronous to execution of cp.async.mbarrier.arrive.

\n

When .noinc modifier is not specified, the pending count of the mbarrier object is incremented\nby 1 prior to the asynchronous arrive-on operation. This\nresults in a zero-net change for the pending count from the asynchronous arrive-on operation\nduring the current phase. The pending count of the mbarrier object after the increment should not\nexceed the limit as mentioned in Contents of the mbarrier object. Otherwise,\nthe behavior is undefined.

\n

When the .noinc modifier is specified, the increment to the pending count of the mbarrier\nobject is not performed. Hence the decrement of the pending count done by the asynchronous\narrive-on operation must be\naccounted for in the initialization of the mbarrier object.

\n

If no state space is specified then Generic Addressing is\nused. If the address specified by addr does not fall within the address window of\n.shared::cta state space then the behavior is undefined.

\n

Supported addressing modes for operand addr are as described in Addresses as Operands. Alignment for operand addr is as described in the Size\nand alignment of mbarrier object.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

Support for sub-qualifier ::cta on .shared introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Examples

\n
// Example 1: no .noinc\nmbarrier.init.shared.b64 [shMem], threadCount;\n....\ncp.async.ca.shared.global [shard1], [gbl1], 4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n....\n// Absence of .noinc accounts for arrive-on from completion of prior cp.async operations.\n// So mbarrier.init must only account for arrive-on from mbarrier.arrive.\ncp.async.mbarrier.arrive.shared.b64 [shMem];\n....\nmbarrier.arrive.shared.b64 state, [shMem];\n\nwaitLoop:\nmbarrier.test_wait.shared.b64 p, [shMem], state;\n@!p bra waitLoop;\n\n\n\n// Example 2: with .noinc\n\n// Tracks arrive-on from mbarrier.arrive and cp.async.mbarrier.arrive.\n\n// All threads participating in the mbarrier perform cp.async\nmov.b32 copyOperationCnt, threadCount;\n\n// 3 arrive-on operations will be triggered per-thread\nmul.lo.u32 copyArrivalCnt, copyOperationCnt, 3;\n\nadd.u32 totalCount, threadCount, copyArrivalCnt;\n\nmbarrier.init.shared.b64 [shMem], totalCount;\n....\ncp.async.ca.shared.global [shard1], [gbl1], 4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n...\n// Presence of .noinc requires mbarrier initialization to have accounted for arrive-on from cp.async\ncp.async.mbarrier.arrive.noinc.shared.b64 [shMem]; // 1st instance\n....\ncp.async.ca.shared.global [shard3], [gbl3], 4;\ncp.async.ca.shared.global [shard4], [gbl4], 16;\ncp.async.mbarrier.arrive.noinc.shared::cta.b64 [shMem]; // 2nd instance\n....\ncp.async.ca.shared.global [shard5], [gbl5], 4;\ncp.async.cg.shared.global [shard6], [gbl6], 16;\ncp.async.mbarrier.arrive.noinc.shared.b64 [shMem]; // 3rd and last instance\n....\nmbarrier.arrive.shared.b64 state, [shMem];\n\nwaitLoop:\nmbarrier.test_wait.shared.b64 p, [shMem], state;\n@!p bra waitLoop;\n
\n
\n
\n

Data Movement and Conversion Instructions: cp.async.wait_group / cp.async.wait_all

\n\n\n

Wait for completion of prior asynchronous copy operations.

\n

Syntax

\n
cp.async.wait_group N;\ncp.async.wait_all ;\n
\n
\n

Description

\n

The cp.async.wait_group instruction will cause the executing thread to wait until only N or fewer\nof the most recent cp.async-groups are pending and all the prior cp.async-groups committed by\nthe executing threads are complete. For example, when N is 0, the executing thread waits on all\nthe prior cp.async-groups to complete. Operand N is an integer constant.

\n

cp.async.wait_all is equivalent to:

\n
cp.async.commit_group;\ncp.async.wait_group 0;\n
\n
\n

An empty cp.async-group is considered to be trivially complete.

\n

Writes performed by cp.async operations are made visible to the executing thread only after:

\n
    \n
  1. The completion of cp.async.wait_all or

  2. \n
  3. The completion of cp.async.wait_group on the cp.async-group in which the cp.async\nbelongs to or

  4. \n
  5. mbarrier.test_wait\nreturns True on an mbarrier object which is tracking the completion of the cp.async\noperation.

  6. \n
\n

There is no ordering between two cp.async operations that are not synchronized with\ncp.async.wait_all or cp.async.wait_group or mbarrier objects.

\n

cp.async.wait_group and cp.async.wait_all do not provide any ordering and visibility\nguarantees for any other memory operation apart from cp.async.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Examples

\n
// Example of .wait_all:\ncp.async.ca.shared.global [shrd1], [gbl1], 4;\ncp.async.cg.shared.global [shrd2], [gbl2], 16;\ncp.async.wait_all;  // waits for all prior cp.async to complete\n\n// Example of .wait_group :\ncp.async.ca.shared.global [shrd3], [gbl3], 8;\ncp.async.commit_group;  // End of group 1\n\ncp.async.cg.shared.global [shrd4], [gbl4], 16;\ncp.async.commit_group;  // End of group 2\n\ncp.async.cg.shared.global [shrd5], [gbl5], 16;\ncp.async.commit_group;  // End of group 3\n\ncp.async.wait_group 1;  // waits for group 1 and group 2 to complete\n
\n
\n
\n

Data Movement and Conversion Instructions: cp.reduce.async.bulk

\n\n\n

Initiates an asynchronous reduction operation.

\n

Syntax

\n
cp.reduce.async.bulk.dst.src.completion_mechanism.redOp.type\n              [dstMem], [srcMem], size, [mbar]\n\n.dst =                  { .shared::cluster }\n.src =                  { .shared::cta }\n.completion_mechanism = { .mbarrier::complete_tx::bytes }\n.redOp=                 { .and, .or, .xor,\n                          .add, .inc, .dec,\n                          .min, .max }\n.type =                 { .b32, .u32, .s32, .b64, .u64 }\n\n\ncp.reduce.async.bulk.dst.src.completion_mechanism{.level::cache_hint}.redOp.type\n               [dstMem], [srcMem], size{, cache-policy}\n\n.dst =                  { .global      }\n.src =                  { .shared::cta }\n.completion_mechanism = { .bulk_group }\n.level::cache_hint    = { .L2::cache_hint }\n.redOp=                 { .and, .or, .xor,\n                          .add, .inc, .dec,\n                          .min, .max }\n.type =                 { .f16, .bf16, .b32, .u32, .s32, .b64, .u64, .s64, .f32, .f64 }\n\n\ncp.reduce.async.bulk.dst.src.completion_mechanism{.level::cache_hint}.add.noftz.type\n               [dstMem], [srcMem], size{, cache-policy}\n.dst  =                 { .global }\n.src  =                 { .shared::cta }\n.completion_mechanism = { .bulk_group }\n.type =                 { .f16, .bf16 }\n
\n
\n

Description

\n

cp.reduce.async.bulk is a non-blocking instruction which initiates an asynchronous reduction\noperation on an array of memory locations specified by the destination address operand dstMem\nwith the source array whose location is specified by the source address operand srcMem. The size\nof the source and the destination array must be the same and is specified by the operand size.

\n

Each data element in the destination array is reduced inline with the corresponding data element in\nthe source array with the reduction operation specified by the modifier .redOp. The type of each\ndata element in the source and the destination array is specified by the modifier .type.

\n

The source address operand srcMem is located in the state space specified by .src and the\ndestination address operand dstMem is located in the state space specified by .dst.

\n

The 32-bit operand size specifies the amount of memory to be copied from the source location and\nused in the reduction operation, in terms of number of bytes. size must be a multiple of 16. If\nthe value is not a multiple of 16, then the behavior is undefined. The memory range [dstMem,\ndstMem + size - 1] must not overflow the destination memory space and the memory range [srcMem,\nsrcMem + size - 1] must not overflow the source memory space. Otherwise, the behavior is\nundefined. The addresses dstMem and srcMem must be aligned to 16 bytes.

\n

The operations supported by .redOp are classified as follows:

\n
    \n
  • The bit-size operations are .and, .or, and .xor.

  • \n
  • The integer operations are .add, .inc, .dec, .min, and .max. The .inc and\n.dec operations return a result in the range [0..x] where x is the value at the source\nstate space.

  • \n
  • The floating point operation .add rounds to the nearest even. The current implementation of\ncp.reduce.async.bulk.add.f32 flushes subnormal inputs and results to sign-preserving zero. The\ncp.reduce.async.bulk.add.f16 and cp.reduce.async.bulk.add.bf16 operations require\n.noftz qualifier. It preserves input and result subnormals, and does not flush them to zero.

  • \n
\n

The following table describes the valid combinations of .redOp and element type:

\n

.dst             | .redOp          | Element type
-----------------|-----------------|-------------------------------------------
.shared::cluster | .add            | .u32, .s32, .u64
.shared::cluster | .min, .max      | .u32, .s32
.shared::cluster | .inc, .dec      | .u32
.shared::cluster | .and, .or, .xor | .b32
.global          | .add            | .u32, .s32, .u64, .f32, .f64, .f16, .bf16
.global          | .min, .max      | .u32, .s32, .u64, .s64, .f16, .bf16
.global          | .inc, .dec      | .u32
.global          | .and, .or, .xor | .b32, .b64

\n

The modifier .completion_mechanism specifies the completion mechanism that is supported on the\ninstruction variant. The completion mechanisms that are supported for different variants are\nsummarized in the following table:

\n

Completion mechanism          | .dst             | .src         | Description
------------------------------|------------------|--------------|--------------------------------------------
.mbarrier::complete_tx::bytes | .shared::cluster | .global      | mbarrier based completion mechanism
.mbarrier::complete_tx::bytes | .shared::cluster | .shared::cta | mbarrier based completion mechanism
.bulk_group                   | .global          | .shared::cta | Bulk async-group based completion mechanism

\n

The modifier .mbarrier::complete_tx::bytes specifies that the cp.reduce.async.bulk variant\nuses mbarrier based completion mechanism. The complete-tx\noperation, with completeCount argument equal to amount of data copied in bytes, will be\nperformed on the mbarrier object specified by the operand mbar.

\n

The modifier .bulk_group specifies that the cp.reduce.async.bulk variant uses bulk\nasync-group based completion mechanism.

\n

When the optional argument cache-policy is specified, the qualifier .level::cache_hint is\nrequired. The 64-bit operand cache-policy specifies the cache eviction policy that may be used\nduring the memory access.

\n

cache-policy is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program. The\nqualifier .level::cache_hint is only supported when at least one of the .src or .dst\nstatespaces is .global state space.

\n

Each reduction operation performed by the cp.reduce.async.bulk has individually .relaxed.gpu\nmemory ordering semantics. The load operations in cp.reduce.async.bulk are treated as weak\nmemory operation and the complete-tx\noperation on the mbarrier has .release semantics at the .cluster scope as described in the\nMemory Consistency Model.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64\n                                                                  [dstMem], [srcMem], size, [mbar];\n\ncp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32\n                                                                  [dstMem], [srcMem], size, [mbar];\n\ncp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [dstMem], [srcMem], size;\n\ncp.reduce.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.xor.s32 [dstMem], [srcMem], size, policy;\n\ncp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [dstMem], [srcMem], size;\n
\n
\n
\n

Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor

\n\n\n

Initiates an asynchronous reduction operation on the tensor data.

\n

Syntax

\n
// shared::cta -> global:\ncp.reduce.async.bulk.tensor.dim.dst.src.redOp{.load_mode}.completion_mechanism{.level::cache_hint}\n                                          [tensorMap, tensorCoords], [srcMem] {,cache-policy}\n\n.dst =                  { .global }\n.src =                  { .shared::cta }\n.dim =                  { .1d, .2d, .3d, .4d, .5d }\n.completion_mechanism = { .bulk_group }\n.load_mode =            { .tile, .im2col_no_offs }\n.redOp =                { .add, .min, .max, .inc, .dec, .and, .or, .xor}\n
\n
\n

Description

\n

cp.reduce.async.bulk.tensor is a non-blocking instruction which initiates an asynchronous\nreduction operation of tensor data in the .dst state space with tensor data in the .src\nstate space.

\n

The operand srcMem specifies the location of the tensor data in the .src state space using\nwhich the reduction operation has to be performed.

\n

The operand tensorMap is the generic address of the opaque tensor-map object which resides\neither in .param space or .const space. The operand tensorMap specifies the properties\nof the tensor reduce operation, as described in Tensor-map. Refer\nto the CUDA programming guide for creating the tensor-map objects on the host side.

\n

Each element of the tensor data in the .dst state space is reduced inline with the corresponding\nelement from the tensor data in the .src state space. The modifier .redOp specifies the\nreduction operation used for the inline reduction. The type of each tensor data element in the\nsource and the destination tensor is specified in Tensor-map.

\n

The dimension of the tensor is specified by the .dim modifier.

\n

The vector operand tensorCoords specifies the starting coordinates of the tensor data in the\nglobal memory on which the reduce operation is to be performed. The number of tensor coordinates in\nthe vector argument tensorCoords should be equal to the dimension specified by the modifier\n.dim. The individual tensor coordinates are of the type .s32.

\n

The following table describes the valid combinations of .redOp and element type:

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

.redOp

Element type

.add

.u32, .s32, .u64, .f32, .f16, .bf16

.min, .max

.u32, .s32, .u64, .s64, .f16, .bf16

.inc, .dec

.u32

.and, .or, .xor

.b32, .b64

\n

The modifier .completion_mechanism specifies the completion mechanism that is supported on the\ninstruction variant. Value .bulk_group of the modifier .completion_mechanism specifies that\ncp.reduce.async.bulk.tensor instruction uses bulk async-group based completion mechanism.

\n

The qualifier .load_mode specifies how the data in the source location is copied into the\ndestination location. If .load_mode is not specified, it defaults to .tile. In .tile\nmode, the multi-dimensional layout of the source tensor is preserved at the destination. In\n.im2col_no_offs mode, some dimensions of the source tensor are unrolled into a single dimensional\ncolumn at the destination. Details of the im2col mode are described in Im2col mode. In .im2col mode, the tensor has to be at least\n3-dimensional.

\n

When the optional argument cache-policy is specified, the qualifier .level::cache_hint is\nrequired. The 64-bit operand cache-policy specifies the cache eviction policy that may be used\nduring the memory access.

\n

cache-policy is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program. The\nqualifier .level::cache_hint is only supported when at least one of the .src or .dst\nstate spaces is the .global state space.

\n

Each reduction operation performed by cp.reduce.async.bulk.tensor has individually\n.relaxed.gpu memory ordering semantics. The load operations in cp.reduce.async.bulk.tensor\nare treated as weak memory operations and the complete-tx\noperation on the mbarrier has .release semantics at the .cluster scope as described in the\nMemory Consistency Model.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group\n                                             [tensorMap0, {tc0}], [sMem0];\n\ncp.reduce.async.bulk.tensor.2d.global.shared::cta.and.bulk_group.L2::cache_hint\n                                             [tensorMap1, {tc0, tc1}], [sMem1] , policy;\n\ncp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.im2col.bulk_group\n                                             [tensorMap2, {tc0, tc1, tc2}], [sMem2]\n
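A minimal, hedged sketch combining the tensor reduction with its bulk async-group completion; the tensor-map, coordinate registers, and shared-memory address are placeholders:

cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group\n                                             [tensorMap0, {tc0, tc1}], [sMem0];\ncp.async.bulk.commit_group;\ncp.async.bulk.wait_group.read 0;\n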
\n
\n
", "tooltip": "=====Data Movement and Conversion Instructions: cp.async\n\n\n\nInitiates an asynchronous copy operation from one state space to another.\n\nSyntax\n\ncp.async.ca.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n\n [dst], [src], cp-size{, src-size}{, cache-policy} ;\n\ncp.async.cg.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n\n [dst], [src], 16{, src-size}{, cache-policy} ;\n\ncp.async.ca.shared...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk\n\n\n\nInitiates an asynchronous copy operation from one state space to another.\n\nSyntax\n\ncp.async.bulk.dst.src.completion_mechanism{.multicast}{.level::cache_hint}\n\n [dstMem], [srcMem], size, [mbar] {, ctaMask} {, cache-policy}\n\n.dst = { .shared::cluster }\n\n.src = { .global }\n\n.completion_mechanism = { .mbarrier::complete_tx::bytes }\n\n.level::cache_hint = ...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk.commit_group\n\n\n\nCommits all prior initiated but uncommitted cp.async.bulk instructions into a\n\ncp.async.bulk-group.\n\nSyntax\n\ncp.async.bulk.commit_group;\n\nDescription\n\ncp.async.bulk.commit_group instruction creates a new per-thread bulk async-group and batches\n\nall prior cp{.reduce}.async.bulk.{.prefetch}{.tensor} instructions satisfying the following\n\nconditions into the new bulk async-group:\n\nThe prior cp{.reduce}.async...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk.prefetch\n\n\n\nProvides a hint to the system to initiate the asynchronous prefetch of data to the cache.\n\nSyntax\n\ncp.async.bulk.prefetch.L2.src{.level::cache_hint} [srcMem], size {, cache-policy}\n\n.src = { .global }\n\n.level::cache_hint = { .L2::cache_hint }\n\nDescription\n\ncp.async.bulk.prefetch is a non-blocking instruction which may initiate an asynchronous prefetch\n\nof data from the location specifie...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk.prefetch.tensor\n\n\n\nProvides a hint to the system to initiate the asynchronous prefetch of tensor data to the cache.\n\nSyntax\n\n// global -> shared::cluster:\n\ncp.async.bulk.prefetch.tensor.dim.L2.src{.load_mode}{.level::cache_hint} [tensorMap, tensorCoords]\n\n {, im2colOffsets } {, cache-policy}\n\n.src = { .global }\n\n.dim = { .1d, .2d, .3...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk.tensor\n\n\n\nInitiates an asynchronous copy operation on the tensor data from one state space to another.\n\nSyntax\n\n// global -> shared::cluster:\n\ncp.async.bulk.tensor.dim.dst.src{.load_mode}.completion_mechanism{.multicast}{.level::cache_hint}\n\n [dstMem], [tensorMap, tensorCoords], [mbar]{, im2colOffsets}\n\n {, ctaMask} {, cache-policy}\n\n.dst = ...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk.wait_group\n\n\n\nWait for completion of bulk async-groups.\n\nSyntax\n\ncp.async.bulk.wait_group{.read} N;\n\nDescription\n\ncp.async.bulk.wait_group instruction will cause the executing thread to wait until only N or\n\nfewer of the most recent bulk async-groups are pending and all the prior bulk async-groups\n\ncommitted by the executing threads are complete. 
For example, when N is 0, the executing thread\n\nwaits on all the prior b...\n\n=====Data Movement and Conversion Instructions: cp.async.commit_group\n\n\n\nCommits all prior initiated but uncommitted cp.async instructions into a cp.async-group.\n\nSyntax\n\ncp.async.commit_group ;\n\nDescription\n\ncp.async.commit_group instruction creates a new cp.async-group per thread and batches all\n\nprior cp.async instructions initiated by the executing thread but not committed to any\n\ncp.async-group into the new cp.async-group. If there are no uncommitted cp.async\n\ninstructio...\n\n=====Parallel Synchronization and Communication Instructions: cp.async.mbarrier.arrive\n\n\n\nMakes the mbarrier object track all prior cp.async operations initiated by the\n\nexecuting thread.\n\nSyntax\n\ncp.async.mbar ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async" }; case "createpolicy": return { "html": "For more information, visit createpolicy .

Data Movement and Conversion Instructions: createpolicy

\n\n\n

Create a cache eviction policy for the specified cache level.

\n

Syntax

\n
// Range-based policy\ncreatepolicy.range{.global}.level::primary_priority{.level::secondary_priority}.b64\n                                   cache-policy, [a], primary-size, total-size;\n\n// Fraction-based policy\ncreatepolicy.fractional.level::primary_priority{.level::secondary_priority}.b64\n                                   cache-policy{, fraction};\n\n// Converting the access property from CUDA APIs\ncreatepolicy.cvt.L2.b64            cache-policy, access-property;\n\n.level::primary_priority =   { .L2::evict_last, .L2::evict_normal,\n                               .L2::evict_first, .L2::evict_unchanged };\n.level::secondary_priority = { .L2::evict_first, .L2::evict_unchanged };\n
\n
\n

Description

\n

The createpolicy instruction creates a cache eviction policy for the specified cache level in an\nopaque 64-bit register specified by the destination operand cache-policy. The cache eviction\npolicy specifies how cache eviction priorities are applied to global memory addresses used in memory\noperations with .level::cache_hint qualifier.

\n

There are two types of cache eviction policies:

\n
    \n
  • Range-based policy

    \n

    The cache eviction policy created using createpolicy.range specifies the cache eviction\nbehaviors for the following three address ranges:

    \n
      \n
    • [a .. a + (primary-size - 1)] referred to as primary range.

    • \n
    • [a + primary-size .. a + (total-size - 1)] referred to as trailing secondary range.

    • \n
    • [a - (total-size - primary-size) .. (a - 1)] referred to as preceding secondary range.

    • \n
    \n

    When a range-based cache eviction policy is used in a memory operation with\n.level::cache_hint qualifier, the eviction priorities are applied as follows:

    \n
      \n
    • If the memory address falls in the primary range, the eviction priority specified by\n.L2::primary_priority is applied.

    • \n
    • If the memory address falls in any of the secondary ranges, the eviction priority specified by\n.L2::secondary_priority is applied.

    • \n
    • If the memory address does not fall in either of the above ranges, then the applied eviction\npriority is unspecified.

    • \n
    \n

    The 32-bit operand primary-size specifies the size, in bytes, of the primary range. The\n32-bit operand total-size specifies the combined size, in bytes, of the address range\nincluding primary and secondary ranges. The value of primary-size must be less than or equal\nto the value of total-size. Maximum allowed value of total-size is 4GB.

    \n

    If .L2::secondary_priority is not specified, then it defaults to .L2::evict_unchanged.

    \n

    If no state space is specified then Generic Addressing is\nused. If the specified address does not fall within the address window of .global state space\nthen the behavior is undefined.

    \n
  • \n
  • Fraction-based policy

    \n

A memory operation with .level::cache_hint qualifier can use the fraction-based cache\neviction policy to request the cache eviction priority specified by .L2::primary_priority to\nbe applied to a fraction of cache accesses specified by the 32-bit floating point operand\nfraction. The remainder of the cache accesses get the eviction priority specified by\n.L2::secondary_priority. This implies that in a memory operation that uses a fraction-based\ncache policy, the memory access has a probability specified by the operand fraction of\ngetting the cache eviction priority specified by .L2::primary_priority.

    \n

The valid range of values for the operand fraction is (0.0, 1.0]. If the operand\nfraction is not specified, it defaults to 1.0.

    \n

    If .L2::secondary_priority is not specified, then it defaults to .L2::evict_unchanged.

    \n
  • \n
\n

The access property created using the CUDA APIs can be converted into a cache eviction policy by\nthe instruction createpolicy.cvt. The source operand access-property is a 64-bit opaque\nregister. Refer to the CUDA programming guide for more details.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.4.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Examples

\n
createpolicy.fractional.L2::evict_last.b64                      policy, 1.0;\ncreatepolicy.fractional.L2::evict_last.L2::evict_unchanged.b64  policy, 0.5;\n\ncreatepolicy.range.L2::evict_last.L2::evict_first.b64\n                                            policy, [ptr], 0x100000, 0x200000;\n\n// access-prop is created by CUDA APIs.\ncreatepolicy.cvt.L2.b64 policy, access-prop;\n
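A hedged sketch of consuming the created policy (register and address names assumed): the opaque 64-bit policy register is passed as the trailing operand of a memory instruction carrying the .level::cache_hint qualifier.

createpolicy.fractional.L2::evict_first.b64  policy, 0.25;\nld.global.L2::cache_hint.f32  %f0, [gptr], policy;  // ~25% of accesses get evict_first priority\n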
\n
\n
", "tooltip": "Create a cache eviction policy for the specified cache level.\n\nSyntax\n\n// Range-based policy\n\ncreatepolicy.range{.global}.level::primary_priority{.level::secondary_priority}.b64\n\n cache-policy, [a], primary-size, total-size;\n\n// Fraction-based policy\n\ncreatepolicy.fractional.level::primary_priority{.level::secondary_priority}.b64\n\n cache-policy{, fraction};\n\n// Converting the access property from CUDA APIs\n\ncreatepolicy.cvt.L2.b64 cache-policy, access-property;\n\n.level::primary_priority = { .L2::evict_last, .L2::evict_normal,\n\n .L2::evict_first, .L2::evict_unchanged };\n\n.level::secondary_priority = { .L2::evict_first, .L2::evict_unchanged };\n\nDescription\n\nThe createpolicy instruction creates a cache eviction policy for the specified cache level in an\n\nopaque 64-bit register specified by the destination operand cache-policy. The cache eviction\n\npolicy specifies how cache eviction priorities are applied to global memory addresses used in memory\n\noperations with .level::cache_hint qualifier.\n\nThere are two types of cache eviction policies:\n\nRange-based policy\n\nThe cache eviction policy created using createpolicy.range specifies the cache eviction\n\nbehaviors for the following three address ranges:\n\n[a .. a + (primary-size - 1)] referred to as primary range.\n\n[a + primary-size .. a + (total-size - 1)] referred to as trailing secondary range.\n\n[a - (total-size - primary-size) .. (a - 1)] referred to as preceding secondary range.\n\nWhen a range-based cache eviction policy is used in a memory operation with\n\n.level::cache_hint qualifier, the eviction priorities are applied as follows:\n\nIf the memory address falls in the primary range, the eviction priority specified by\n\n.L2::primary_priority is applied.\n\nIf the memory address falls in any of the secondary ranges, the eviction priority specified by\n\n.L2::secondary_priority is applied.\n\nIf the memory address does not fall in either of the above ranges, then the applied eviction\n\npriority is unspecified.\n\nThe 32-bit operand primary-size specifies the size, in bytes, of the primary range. The\n\n32-bit operand total-size specifies the combined size, in bytes, of the address range\n\nincluding primary and secondary ranges. The value of primary-size must be less than or equal\n\nto the value of total-size. Maximum allowed value of total-size is 4GB.\n\nIf .L2::secondary_priority is not specified, then it defaults to .L2::evict_unchanged.\n\nIf no state space is specified then Generic Addressing is\n\nused. If the specified address does not fall within the address window of .global state space\n\nthen the behavior is undefined.\n\nFraction-based policy\n\nA memory operation with .level::cache_hint qualifier can use the fraction-based cache\n\neviction policy to request the cache eviction priority specified by .L2:primary_priority to\n\nbe applied to a fraction of cache accesses specified by the 32-bit floating point operand\n\nfraction. The remainder of the cache accesses get the eviction priority specified by\n\n.L2::secondary_priority. This implies that in a memory operation that uses a fraction-based\n\ncache policy, the memory access has a probability specified by the operand fraction of\n\ngetting the cache eviction priority specified by .L2::primary_priority.\n\nThe valid range of values for the operand fraction is (0.0,.., 1.0]. 
If the operand\n\nfraction is not specified, it defaults to 1.0.\n\nIf .L2::secondary_priority is not specified, then it defaults to .L2::evict_unchanged.\n\nThe access property created using the CUDA APIs can be converted into cache eviction policy by the\n\ninstruction createpolicy.cvt. The source operand access-property is a 64-bit opaque\n\nregister. Refer to CUDA programming guide for more details.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.4.\n\nTarget ISA Notes\n\nRequires sm_80 or higher.\n\nExamples\n\ncreatepolicy.fractional.L2::evict_last.b64 policy, 1.0;\n\ncreatepolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 polic ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-createpolicy" }; case "ctaid": return { "html": "For more information, visit ctaid .

Special Registers: %ctaid

\n\n\n

CTA identifier within a grid.

\n

Syntax (predefined)

\n
.sreg .v4 .u32 %ctaid;                      // CTA id vector\n.sreg .u32 %ctaid.x, %ctaid.y, %ctaid.z;    // CTA id components\n
\n
\n

Description

\n

A predefined, read-only special register initialized with the CTA identifier within the CTA\ngrid. The %ctaid special register contains a 1D, 2D, or 3D vector, depending on the shape and\nrank of the CTA grid. The fourth element is unused and always returns zero.

\n

It is guaranteed that:

\n
0  <=  %ctaid.x <  %nctaid.x\n0  <=  %ctaid.y <  %nctaid.y\n0  <=  %ctaid.z <  %nctaid.z\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0 with type .v4.u16.

\n

Redefined as type .v4.u32 in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\nmov and cvt instructions may be used to read the lower 16-bits of each component of\n%ctaid.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
mov.u32  %r0,%ctaid.x;\nmov.u16  %rh,%ctaid.y;   // legacy code\n
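A common usage sketch (register names assumed): combining %ctaid with %ntid and %tid to form a flat global x-index.

mov.u32     %r1, %ctaid.x;\nmov.u32     %r2, %ntid.x;\nmov.u32     %r3, %tid.x;\nmad.lo.u32  %r4, %r1, %r2, %r3;   // %r4 = %ctaid.x * %ntid.x + %tid.x\n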
\n
\n
", "tooltip": "CTA identifier within a grid.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %ctaid; // CTA id vector\n\n.sreg .u32 %ctaid.x, %ctaid.y, %ctaid.z; // CTA id components\n\nDescription\n\nA predefined, read-only special register initialized with the CTA identifier within the CTA\n\ngrid. The %ctaid special register contains a 1D, 2D, or 3D vector, depending on the shape and\n\nrank of the CTA grid. The fourth element is unused and always returns zero.\n\nIt is guaranteed that:\n\n0 <= %ctaid.x < %nctaid.x\n\n0 <= %ctaid.y < %nctaid.y\n\n0 <= %ctaid.z < %nctaid.z\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0 with type .v4.u16.\n\nRedefined as type .v4.u32 in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\n\nmov and cvt instructions may be used to read the lower 16-bits of each component of\n\n%ctaid.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u32 %r0,%ctaid.x;\n\nmov.u16 %rh,%ctaid.y; // legacy code\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-ctaid" }; case "current_graph_exec": return { "html": "For more information, visit current_graph_exec .

Special Registers: %current_graph_exec

\n\n\n

An identifier for the currently executing CUDA device graph.

\n

Syntax (predefined)

\n
.sreg .u64 %current_graph_exec;\n
\n
\n

Description

\n

A predefined, read-only special register initialized with the identifier referring to the CUDA\ndevice graph being currently executed. This register is 0 if the executing kernel is not part of a\nCUDA device graph.

\n

Refer to the CUDA Programming Guide for more details on CUDA device graphs.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_50 or higher.

\n

Examples

\n
mov.u64  r1, %current_graph_exec;\n
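A hedged sketch (registers and label are placeholders): branching on whether the kernel was launched from a device graph, using the documented zero value for non-graph launches.

mov.u64      %rd1, %current_graph_exec;\nsetp.eq.u64  %p1, %rd1, 0;   // true when not part of a device graph\n@%p1 bra     NOT_IN_GRAPH;\n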
\n
\n
", "tooltip": "An Identifier for currently executing CUDA device graph.\n\nSyntax (predefined)\n\n.sreg .u64 %current_graph_exec;\n\nDescription\n\nA predefined, read-only special register initialized with the identifier referring to the CUDA\n\ndevice graph being currently executed. This register is 0 if the executing kernel is not part of a\n\nCUDA device graph.\n\nRefer to the CUDA Programming Guide for more details on CUDA device graphs.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 8.0.\n\nTarget ISA Notes\n\nRequires sm_50 or higher.\n\nExamples\n\nmov.u64 r1, %current_graph_exec;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-current-graph-exec" }; case "cvt": return { "html": "For more information, visit cvt , cvt.pack .

Data Movement and Conversion Instructions: cvt

\n\n\n

Convert a value from one type to another.

\n

Syntax

\n
cvt{.irnd}{.ftz}{.sat}.dtype.atype         d, a;  // integer rounding\ncvt{.frnd}{.ftz}{.sat}.dtype.atype         d, a;  // fp rounding\ncvt.frnd2{.relu}{.satfinite}.f16.f32       d, a;\ncvt.frnd2{.relu}{.satfinite}.f16x2.f32     d, a, b;\ncvt.frnd2{.relu}{.satfinite}.bf16.f32      d, a;\ncvt.frnd2{.relu}{.satfinite}.bf16x2.f32    d, a, b;\ncvt.rna{.satfinite}.tf32.f32               d, a;\ncvt.frnd2{.relu}.tf32.f32                  d, a;\ncvt.rn.satfinite{.relu}.f8x2type.f32       d, a, b;\ncvt.rn.satfinite{.relu}.f8x2type.f16x2     d, a;\ncvt.rn.{.relu}.f16x2.f8x2type              d, a;\n\n.irnd   = { .rni, .rzi, .rmi, .rpi };\n.frnd   = { .rn,  .rz,  .rm,  .rp  };\n.frnd2  = { .rn,  .rz };\n.dtype = .atype = { .u8,   .u16, .u32, .u64,\n                    .s8,   .s16, .s32, .s64,\n                    .bf16, .f16, .f32, .f64 };\n.f8x2type = { .e4m3x2, .e5m2x2 };\n
\n
\n

Description

\n

Convert between different types and sizes.

\n

For .f16x2 and .bf16x2 instruction types, two inputs a and b of .f32 type are\nconverted into .f16 or .bf16 type and the converted values are packed in the destination\nregister d, such that the value converted from input a is stored in the upper half of d\nand the value converted from input b is stored in the lower half of d.

\n

For .f16x2 instruction type, destination operand d has .f16x2 or .b32 type. For\n.bf16 instruction type, operand d has .b16 type. For .bf16x2 instruction type,\noperand d has .b32 type. For .tf32 instruction type, operand d has .b32 type.

\n

When converting to .e4m3x2/.e5m2x2 data formats, the destination operand d has .b16\ntype. When converting two .f32 inputs to .e4m3x2/.e5m2x2, each input is converted to the\nspecified format, and the converted values are packed in the destination operand d such that the\nvalue converted from input a is stored in the upper 8 bits of d and the value converted from\ninput b is stored in the lower 8 bits of d. When converting an .f16x2 input to\n.e4m3x2/ .e5m2x2, each .f16 input from operand a is converted to the specified\nformat. The converted values are packed in the destination operand d such that the value\nconverted from the upper 16 bits of input a is stored in the upper 8 bits of d and the value\nconverted from the lower 16 bits of input a is stored in the lower 8 bits of d.

\n

When converting from .e4m3x2/.e5m2x2 to .f16x2, source operand a has .b16\ntype. Each 8-bit input value in operand a is converted to .f16 type. The converted values\nare packed in the destination operand d such that the value converted from the upper 8 bits of\na is stored in the upper 16 bits of d and the value converted from the lower 8 bits of a\nis stored in the lower 16 bits of d.

\n

Rounding modifier is mandatory in all of the following cases:

\n
    \n
  • float-to-float conversions, when destination type is smaller than source type

  • \n
  • All float-to-int conversions

  • \n
  • All int-to-float conversions

  • \n
  • All conversions involving .f16x2, .e4m3x2, .e5m2x2, .bf16x2 and .tf32 instruction\ntypes.

  • \n
\n

.satfinite modifier is only supported for conversions involving the following types:

\n
    \n
  • .e4m3x2 and .e5m2x2 destination types. .satfinite modifier is mandatory for such\nconversions.

  • \n
  • .f16, .bf16, .f16x2, .bf16x2 as destination types.

  • \n
  • .tf32 as destination type with rounding mode specified as round to nearest, ties away from\nzero.

  • \n
\n

Semantics

\n
if (/* inst type is .f16x2 or .bf16x2 */) {\n    d[31:16] = convert(a);\n    d[15:0]  = convert(b);\n} else {\n    d = convert(a);\n}\n
\n
\n

Integer Notes

\n

Integer rounding is required for float-to-integer conversions, and for same-size float-to-float\nconversions where the value is rounded to an integer. Integer rounding is illegal in all other\ninstances.

\n

Integer rounding modifiers:

\n
\n
.rni

round to nearest integer, choosing even integer if source is equidistant between two integers

\n
\n
.rzi

round to nearest integer in the direction of zero

\n
\n
.rmi

round to nearest integer in direction of negative infinity

\n
\n
.rpi

round to nearest integer in direction of positive infinity

\n
\n
\n

In float-to-integer conversion, NaN inputs are converted to 0.

\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

For cvt.ftz.dtype.f32 float-to-integer conversions and cvt.ftz.f32.f32 float-to-float\nconversions with integer rounding, subnormal inputs are flushed to sign-preserving zero. Modifier\n.ftz can only be specified when either .dtype or .atype is .f32 and applies only\nto single precision (.f32) inputs and results.

\n
\n
sm_1x

For cvt.ftz.dtype.f32 float-to-integer conversions and cvt.ftz.f32.f32\nfloat-to-float conversions with integer rounding, subnormal inputs are flushed to sign-preserving\nzero. The optional .ftz modifier may be specified in these cases for clarity.

\n

Note: In PTX ISA versions 1.4 and earlier, the cvt instruction did not flush single-precision\nsubnormal inputs or results to zero if the destination type size was 64-bits. The compiler will\npreserve this behavior for legacy PTX code.

\n
\n
\n

Saturation modifier:

\n
\n
.sat

For integer destination types, .sat limits the result to MININT..MAXINT for the size of\nthe operation. Note that saturation applies to both signed and unsigned integer types.

\n

The saturation modifier is allowed only in cases where the destination type\u2019s value range is not\na superset of the source type\u2019s value range; i.e., the .sat modifier is illegal in cases\nwhere saturation is not possible based on the source and destination types.

\n

For float-to-integer conversions, the result is clamped to the destination range by default; i.e,\n.sat is redundant.

\n
\n
\n

Floating Point Notes

\n

Floating-point rounding is required for float-to-float conversions that result in loss of precision,\nand for integer-to-float conversions. Floating-point rounding is illegal in all other instances.

\n

Floating-point rounding modifiers:

\n
\n
.rn

mantissa LSB rounds to nearest even

\n
\n
.rna

mantissa LSB rounds to nearest, ties away from zero

\n
\n
.rz

mantissa LSB rounds towards zero

\n
\n
.rm

mantissa LSB rounds towards negative infinity

\n
\n
.rp

mantissa LSB rounds towards positive infinity

\n
\n
\n

A floating-point value may be rounded to an integral value using the integer rounding modifiers (see\nInteger Notes). The operands must be of the same size. The result is an integral value, stored in\nfloating-point format.

\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported. Modifier .ftz may be specified to flush\nsingle-precision subnormal inputs and results to sign-preserving zero. Modifier .ftz can only\nbe specified when either .dtype or .atype is .f32 and applies only to single\nprecision (.f32) inputs and results.

\n
\n
sm_1x

Single-precision subnormal inputs and results are flushed to sign-preserving zero. The optional\n.ftz modifier may be specified in these cases for clarity.

\n
\n
\n

Note: In PTX ISA versions 1.4 and earlier, the cvt instruction did not flush\nsingle-precision subnormal inputs or results to zero if either source or destination type was\n.f64. The compiler will preserve this behavior for legacy PTX code. Specifically, if the PTX\nISA version is 1.4 or earlier, single-precision subnormal inputs and results are flushed to\nsign-preserving zero only for cvt.f32.f16, cvt.f16.f32, and cvt.f32.f32 instructions.

\n

Saturation modifier:

\n
\n
.sat:

For floating-point destination types, .sat limits the result to the range [0.0, 1.0]. NaN\nresults are flushed to positive zero. Applies to .f16, .f32, and .f64 types.

\n
\n
.relu:

For .f16, .f16x2, .bf16, .bf16x2, .e4m3x2, .e5m2x2 and .tf32\ndestination types, .relu clamps the result to 0 if negative. NaN results are converted to\ncanonical NaN.

\n
\n
.satfinite:

For .f16, .f16x2, .bf16, .bf16x2, .e4m3x2, .e5m2x2 and .tf32\ndestination formats, if the input value is NaN, then the result is NaN in the specified\ndestination format. If the absolute value of input (ignoring sign) is greater than MAX_NORM of\nthe specified destination format, then the result is sign-preserved MAX_NORM of the destination\nformat.

\n
\n
\n

Notes

\n

A source register wider than the specified type may be used, except when the source operand has\n.bf16 or .bf16x2 format. The lower n bits corresponding to the instruction-type width\nare used in the conversion. See Operand Size Exceeding Instruction-Type Size for a description of these relaxed\ntype-checking rules.

\n

A destination register wider than the specified type may be used, except when the destination\noperand has .bf16, .bf16x2 or .tf32 format. The result of conversion is sign-extended to\nthe destination register width for signed integers, and is zero-extended to the destination register\nwidth for unsigned, bit-size, and floating-point types. See Operand Size Exceeding Instruction-Type\nSize for a description of these relaxed\ntype-checking rules.

\n

For cvt.f32.bf16, NaN input yields unspecified NaN.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

.relu modifier and {.f16x2, .bf16, .bf16x2, .tf32} destination formats\nintroduced in PTX ISA version 7.0.

\n

cvt.bf16.{u8/s8/u16/s16/u32/s32/u64/s64/f16/f64/bf16},\ncvt.{u8/s8/u16/s16/u32/s32/u64/s64/f16/f64}.bf16, and cvt.tf32.f32.{relu}.{rn/rz} introduced\nin PTX ISA 7.8.

\n

cvt with .e4m3x2/.e5m2x2 for sm_90 or higher introduced in PTX ISA version 7.8.

\n

cvt.satfinite.{e4m3x2, e5m2x2}.{f32, f16x2} for sm_90 or higher introduced in PTX ISA version 7.8.

\n

cvt with .e4m3x2/.e5m2x2 for sm_89 introduced in PTX ISA version 8.1.

\n

cvt.satfinite.{e4m3x2, e5m2x2}.{f32, f16x2} for sm_89 introduced in PTX ISA version 8.1.

\n

cvt.satfinite.{f16, bf16, f16x2, bf16x2, tf32}.f32 introduced in PTX ISA version 8.1.

\n

Target ISA Notes

\n

cvt to or from .f64 requires sm_13 or higher.

\n

.relu modifier and {.f16x2, .bf16, .bf16x2, .tf32} destination formats require\nsm_80 or higher.

\n

cvt.bf16.{u8/s8/u16/s16/u32/s32/u64/s64/f16/f64/bf16},\ncvt.{u8/s8/u16/s16/u32/s32/u64/s64/f16/f64}.bf16, and cvt.tf32.f32.{relu}.{rn/rz} require\nsm_90 or higher.

\n

cvt with .e4m3x2/.e5m2x2 requires sm_89 or higher.

\n

cvt.satfinite.{e4m3x2, e5m2x2}.{f32, f16x2} requires sm_89 or higher.

\n

Examples

\n
cvt.f32.s32 f,i;\ncvt.s32.f64 j,r;     // float-to-int saturates by default\ncvt.rni.f32.f32 x,y; // round to nearest int, result is fp\ncvt.f32.f32 x,y;     // note .ftz behavior for sm_1x targets\ncvt.rn.relu.f16.f32      b, f;        // result is saturated with .relu saturation mode\ncvt.rz.f16x2.f32         b1, f, f1;   // convert two fp32 values to packed fp16 outputs\ncvt.rn.relu.satfinite.f16x2.f32    b1, f, f1;   // convert two fp32 values to packed fp16 outputs with .relu saturation on each output\ncvt.rn.bf16.f32          b, f;        // convert fp32 to bf16\ncvt.rz.relu.satfinite.bf16.f32     b, f;        // convert fp32 to bf16 with .relu and .satfinite saturation\ncvt.rz.satfinite.bf16x2.f32        b1, f, f1;   // convert two fp32 values to packed bf16 outputs\ncvt.rn.relu.bf16x2.f32   b1, f, f1;   // convert two fp32 values to packed bf16 outputs with .relu saturation on each output\ncvt.rna.satfinite.tf32.f32         b1, f;       // convert fp32 to tf32 format\ncvt.rn.relu.tf32.f32     d, a;        // convert fp32 to tf32 format\ncvt.f64.bf16.rp          f, b;        // convert bf16 to f64 format\ncvt.bf16.f16.rz          b, f;        // convert f16 to bf16 format\ncvt.bf16.u64.rz          b, u;        // convert u64 to bf16 format\ncvt.s8.bf16.rpi          s, b;        // convert bf16 to s8 format\ncvt.bf16.bf16.rpi        b1, b2;      // convert bf16 to corresponding int represented in bf16 format\ncvt.rn.satfinite.e4m3x2.f32 d, a, b;  // convert a, b to .e4m3 and pack as .e4m3x2 output\ncvt.rn.relu.satfinite.e5m2x2.f16x2 d, a; // unpack a and convert the values to .e5m2 outputs with .relu\n                                         // saturation on each output and pack as .e5m2x2\ncvt.rn.f16x2.e4m3x2 d, a;             // unpack a, convert two .e4m3 values to packed f16x2 output\n
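Two hedged worked cases (input values assumed) illustrating the saturation and integer-rounding notes above:

cvt.sat.u8.s32   d, a;   // a = 300 -> d = 255 (clamped to the u8 range)\ncvt.rni.f32.f32  x, y;   // y = 2.5 -> x = 2.0 (ties round to even)\n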
\n
\n
\n

Data Movement and Conversion Instructions: cvt.pack

\n\n\n

Convert two integer values from one integer type to another and pack the results.

\n

Syntax

\n
cvt.pack.sat.convertType.abType  d, a, b;\n    .convertType  = { .u16, .s16 }\n    .abType       = { .s32 }\n\ncvt.pack.sat.convertType.abType.cType  d, a, b, c;\n    .convertType  = { .u2, .s2, .u4, .s4, .u8, .s8 }\n    .abType       = { .s32 }\n    .cType        = { .b32 }\n
\n
\n

Description

\n

Convert two 32-bit integers a and b into specified type and pack the results into d.

\n

Destination d is an unsigned 32-bit integer. Source operands a and b are integers of\ntype .abType and the source operand c is an integer of type .cType.

\n

The inputs a and b are converted to values of type specified by .convertType with\nsaturation and the results after conversion are packed into lower bits of d.

\n

If operand c is specified then the remaining bits of d are copied from the lower bits of c.

\n

Semantics

\n
ta = a < MIN(convertType) ? MIN(convertType) : a;\nta = a > MAX(convertType) ? MAX(convertType) : a;\ntb = b < MIN(convertType) ? MIN(convertType) : b;\ntb = b > MAX(convertType) ? MAX(convertType) : b;\n\nsize = sizeInBits(convertType);\ntd = tb ;\nfor (i = size; i <= 2 * size - 1; i++) {\n    td[i] = ta[i - size];\n}\n\nif (isU16(convertType) || isS16(convertType)) {\n    d = td;\n} else {\n    for (i = 0; i < 2 * size; i++) {\n        d[i] = td[i];\n    }\n    for (i = 2 * size; i <= 31; i++) {\n        d[i] = c[i - 2 * size];\n    }\n}\n
\n
\n

.sat modifier limits the converted values to MIN(convertType)..MAX(convertType) (no\noverflow) if the corresponding inputs are not in the range of the data type specified as\n.convertType.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 6.5.

\n

Target ISA Notes

\n

Requires sm_72 or higher.

\n

Sub byte types (.u4/.s4 and .u2/.s2) requires sm_75 or higher.

\n

Examples

\n
cvt.pack.sat.s16.s32      %r1, %r2, %r3;           // 32-bit to 16-bit conversion\ncvt.pack.sat.u8.s32.b32   %r4, %r5, %r6, 0;        // 32-bit to 8-bit conversion\ncvt.pack.sat.u8.s32.b32   %r7, %r8, %r9, %r4;      // %r7 = { %r5, %r6, %r8, %r9 }\ncvt.pack.sat.u4.s32.b32   %r10, %r12, %r13, %r14;  // 32-bit to 4-bit conversion\ncvt.pack.sat.s2.s32.b32   %r15, %r16, %r17, %r18;  // 32-bits to 2-bit conversion\n
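A hedged worked example (input values assumed) of the packing semantics above:

cvt.pack.sat.u8.s32.b32  %r7, %r5, %r6, %r4;\n// with %r5 = 300, %r6 = -5, %r4 = 0x0000BEEF:\n//   300 saturates to 0xFF, -5 saturates to 0x00\n//   %r7 = 0xBEEFFF00  (c's low 16 bits packed above the two converted bytes)\n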
\n
\n
", "tooltip": "=====Data Movement and Conversion Instructions: cvt\n\n\n\nConvert a value from one type to another.\n\nSyntax\n\ncvt{.irnd}{.ftz}{.sat}.dtype.atype d, a; // integer rounding\n\ncvt{.frnd}{.ftz}{.sat}.dtype.atype d, a; // fp rounding\n\ncvt.frnd2{.relu}{.satfinite}.f16.f32 d, a;\n\ncvt.frnd2{.relu}{.satfinite}.f16x2.f32 d, a, b;\n\ncvt.frnd2{.relu}{.satfinite}.bf16.f32 d, a;\n\ncvt.frnd2{.relu}{.satfinite}.bf16x2.f32 d, a, b;\n\ncvt.rna{.satfi...\n\n=====Data Movement and Conversion Instructions: cvt.pack\n\n\n\nConvert two integer values from one integer type to another and pack the results.\n\nSyntax\n\ncvt.pack.sat.convertType.abType d, a, b;\n\n .convertType = { .u16, .s16 }\n\n .abType = { .s32 }\n\ncvt.pack.sat.convertType.abType.cType d, a, b, c;\n\n .convertType = { .u2, .s2, .u4, .s4, .u8, .s8 }\n\n .abType = { .s32 }\n\n .cType = { .b32 }\n\nDescription\n\nConvert two 32-bit integers a a... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt" }; case "cvta": return { "html": "For more information, visit cvta .

Data Movement and Conversion Instructions: cvta

\n\n\n

Convert address from .const, Kernel Function Parameters (.param), .global, .local, or .shared\nstate space to generic, or vice-versa. Take the generic address of a variable declared in\n.const, Kernel Function Parameters (.param),\n.global, .local, or .shared state space.

\n

Syntax

\n
// convert const, global, local, or shared address to generic address\ncvta.space.size  p, a;        // source address in register a\ncvta.space.size  p, var;      // get generic address of var\ncvta.space.size  p, var+imm;  // generic address of var+offset\n\n// convert generic address to const, global, local, or shared address\ncvta.to.space.size  p, a;\n\n.space = { .const, .global, .local, .shared{::cta, ::cluster}, .param };\n.size  = { .u32, .u64 };\n
\n
\n

Description

\n

Convert a const, Kernel Function Parameters\n(.param), global, local, or shared address to a generic address, or vice-versa. The\nsource and destination addresses must be the same size. Use cvt.u32.u64 or cvt.u64.u32 to\ntruncate or zero-extend addresses.

\n

For variables declared in .const, Kernel Function Parameters (.param), .global, .local, or .shared\nstate space, the generic address of the variable may be taken using cvta. The source is either a\nregister or a variable defined in const, Kernel Function Parameters (.param), global, local, or shared memory\nwith an optional offset.

\n

When converting a generic address into a const, Kernel Function Parameters (.param), global, local, or shared\naddress, the resulting address is undefined in cases where the generic address does not fall within\nthe address window of the specified state space. A program may use isspacep to guard against\nsuch incorrect behavior.

\n

For cvta with .shared state space, the address must belong to the space specified by\n::cta or ::cluster sub-qualifier, otherwise the behavior is undefined. If no sub-qualifier\nis specified with .shared state space, then ::cta is assumed by default.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

cvta.const and cvta.to.const introduced in PTX ISA version 3.1.

\n

cvta.param and cvta.to.param introduced in PTX ISA version 7.7.

\n

Note: The current implementation does not allow generic pointers to const space variables in\nprograms that contain pointers to constant buffers passed as kernel parameters.

\n

Support for ::cta and ::cluster sub-qualifiers introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

cvta requires sm_20 or higher.

\n

cvta.param and cvta.to.param requires sm_70 or higher.

\n

Sub-qualifier ::cta requires sm_30 or higher.

\n

Sub-qualifier ::cluster requires sm_90 or higher.

\n

Examples

\n
cvta.const.u32   ptr,cvar;\ncvta.local.u32   ptr,lptr;\ncvta.shared::cta.u32  p,As+4;\ncvta.shared::cluster.u32 ptr, As;\ncvta.to.global.u32  p,gptr;\ncvta.param.u64   ptr,pvar;\n
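A hedged sketch of the guard pattern mentioned in the description (predicate and register names assumed):

isspacep.shared  p, gptr;            // does the generic address fall in the shared window?\n@p cvta.to.shared.u64  sptr, gptr;   // convert only when the guard holds\n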
\n
\n
", "tooltip": "Convert address from .const, Kernel Function Parameters (.param), .global, .local, or .shared\n\nstate space to generic, or vice-versa. Take the generic address of a variable declared in\n\n.const, Kernel Function Parameters (.param),\n\n.global, .local, or .shared state space.\n\nSyntax\n\n// convert const, global, local, or shared address to generic address\n\ncvta.space.size p, a; // source address in register a\n\ncvta.space.size p, var; // get generic address of var\n\ncvta.space.size p, var+imm; // generic address of var+offset\n\n// convert generic address to const, global, local, or shared address\n\ncvta.to.space.size p, a;\n\n.space = { .const, .global, .local, .shared{::cta, ::cluster}, .param };\n\n.size = { .u32, .u64 };\n\nDescription\n\nConvert a const, Kernel Function Parameters\n\n(.param), global, local, or shared address to a generic address, or vice-versa. The\n\nsource and destination addresses must be the same size. Use cvt.u32.u64 or cvt.u64.u32 to\n\ntruncate or zero-extend addresses.\n\nFor variables declared in .const, Kernel Function Parameters (.param), .global, .local, or .shared\n\nstate space, the generic address of the variable may be taken using cvta. The source is either a\n\nregister or a variable defined in const, Kernel Function Parameters (.param), global, local, or shared memory\n\nwith an optional offset.\n\nWhen converting a generic address into a const, Kernel Function Parameters (.param), global, local, or shared\n\naddress, the resulting address is undefined in cases where the generic address does not fall within\n\nthe address window of the specified state space. A program may use isspacep to guard against\n\nsuch incorrect behavior.\n\nFor cvta with .shared state space, the address must belong to the space specified by\n\n::cta or ::cluster sub-qualifier, otherwise the behavior is undefined. If no sub-qualifier\n\nis specified with .shared state space, then ::cta is assumed by default.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\ncvta.const and cvta.to.const introduced in PTX ISA version 3.1.\n\ncvta.param and cvta.to.param introduced in PTX ISA version 7.7.\n\nNote: The current implementation does not allow generic pointers to const space variables in\n\nprograms that contain pointers to constant buffers passed as kernel parameters.\n\nSupport for ::cta and ::cluster sub-qualifiers introduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\ncvta requires sm_20 or higher.\n\ncvta.param and cvta.to.param requires sm_70 or higher.\n\nSub-qualifier ::cta requires sm_30 or higher.\n\nSub-qualifier ::cluster requires sm_90 or higher.\n\nExamples\n\ncvta.const.u32 ptr,cvar;\n\ncvta.local.u32 ptr,lptr;\n\ncvta.shared::cta.u32 p,As+4;\n\ncvta.shared::cluster.u32 ptr, As;\n\ncvta.to.global.u32 p,gptr;\n\ncvta.param.u64 ptr,pvar;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta" }; case "discard": return { "html": "For more information, visit discard .

Data Movement and Conversion Instructions: discard

\n\n\n

Invalidate the data in cache at the specified address and cache level.

\n

Syntax

\n
discard{.global}.level  [a], size;\n\n.level = { .L2 };\n
\n
\n

Description

\n

The discard instruction invalidates the data at the address range [a .. a + (size - 1)] in\nthe cache level specified by the .level qualifier without writing back the data in the cache to\nthe memory. Therefore after the discard operation, the data at the address range [a .. a + (size -\n1)] has an undetermined value.

\n

The operand size is an integer constant that specifies the amount of data, in bytes, in the\ncache level specified by the .level qualifier to be discarded. The only supported value for the\nsize operand is 128.

\n

If no state space is specified then Generic Addressing is\nused. If the specified address does not fall within the address window of .global state space\nthen the behavior is undefined.

\n

Supported addressing modes for address operand a are described in Addresses as Operands. a must be aligned to 128 bytes.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.4.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Examples

\n
discard.global.L2 [ptr], 128;\n
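Since the only supported size is 128, larger ranges take multiple discard operations; a hedged sketch for 256 bytes (128-byte alignment of ptr assumed):

discard.global.L2 [ptr],     128;\ndiscard.global.L2 [ptr+128], 128;\n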
\n
\n
", "tooltip": "Invalidate the data in cache at the specified address and cache level.\n\nSyntax\n\ndiscard{.global}.level [a], size;\n\n.level = { .L2 };\n\nDescription\n\nThe discard instruction invalidates the data at the address range [a .. a + (size - 1)] in\n\nthe cache level specified by the .level qualifier without writing back the data in the cache to\n\nthe memory. Therefore after the discard operation, the data at the address range [a .. a+ (size -\n\n1)] has undetermined value.\n\nThe operand size is an integer constant that specifies the amount of data, in bytes, in the\n\ncache level specified by the .level qualifier to be discarded. The only supported value for the\n\nsize operand is 128.\n\nIf no state space is specified then Generic Addressing is\n\nused. If the specified address does not fall within the address window of .global state space\n\nthen the behavior is undefined.\n\nSupported addressing modes for address operand a are described in Addresses as Operands. a must be aligned to 128 bytes.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.4.\n\nTarget ISA Notes\n\nRequires sm_80 or higher.\n\nExamples\n\ndiscard.global.L2 [ptr], 128;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-discard" }; case "div": return { "html": "For more information, visit div(fp) , div(int) .

Floating Point Instructions: div

\n\n\n

Divide one value by another.

\n

Syntax

\n
div.approx{.ftz}.f32  d, a, b;  // fast, approximate divide\ndiv.full{.ftz}.f32    d, a, b;  // full-range approximate divide\ndiv.rnd{.ftz}.f32     d, a, b;  // IEEE 754 compliant rounding\ndiv.rnd.f64           d, a, b;  // IEEE 754 compliant rounding\n\n.rnd = { .rn, .rz, .rm, .rp };\n
\n
\n

Description

\n

Divides a by b, stores result in d.

\n

Semantics

\n
d = a / b;\n
\n
\n

Notes

\n

Fast, approximate single-precision divides:

\n
    \n
  • div.approx.f32 implements a fast approximation to divide, computed as d = a * (1/b). For\n|b| in [2^-126, 2^126], the maximum ulp error is 2. For 2^126 <\n|b| < 2^128, if a is infinity, div.approx.f32 returns NaN, otherwise it\nreturns 0.

  • \n
  • div.full.f32 implements a relatively fast, full-range approximation that scales operands to\nachieve better accuracy, but is not fully IEEE 754 compliant and does not support rounding\nmodifiers. The maximum ulp error is 2 across the full range of inputs.

  • \n
  • Subnormal inputs and results are flushed to sign-preserving zero. Fast, approximate division by\nzero creates a value of infinity (with same sign as a).

  • \n
\n

Divide with IEEE 754 compliant rounding:

\n

Rounding modifiers (no default):

\n
\n
.rn

mantissa LSB rounds to nearest even

\n
\n
.rz

mantissa LSB rounds towards zero

\n
\n
.rm

mantissa LSB rounds towards negative infinity

\n
\n
.rp

mantissa LSB rounds towards positive infinity

\n
\n
\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

div.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
sm_1x

div.f64 supports subnormal numbers.

\n

div.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
\n

PTX ISA Notes

\n

div.f32 and div.f64 introduced in PTX ISA version 1.0.

\n

Explicit modifiers .approx, .full, .ftz, and rounding introduced in PTX ISA version 1.4.

\n

For PTX ISA version 1.4 and later, one of .approx, .full, or .rnd is required.

\n

For PTX ISA versions 1.0 through 1.3, div.f32 defaults to div.approx.ftz.f32, and\ndiv.f64 defaults to div.rn.f64.

\n

Target ISA Notes

\n

div.approx.f32 and div.full.f32 supported on all target architectures.

\n

div.rnd.f32 requires sm_20 or higher.

\n

div.rn.f64 requires sm_13 or higher, or .target map_f64_to_f32.

\n

div.{rz,rm,rp}.f64 requires sm_20 or higher.

\n

Examples

\n
div.approx.ftz.f32  diam,circum,3.14159;\ndiv.full.ftz.f32    x, y, z;\ndiv.rn.f64          xd, yd, zd;\n
\n
\n
\n

Integer Arithmetic Instructions: div

\n\n\n

Divide one value by another.

\n

Syntax

\n
div.type  d, a, b;\n\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64 };\n
\n
\n

Description

\n

Divides a by b, stores result in d.

\n

Semantics

\n
d = a / b;\n
\n
\n

Notes

\n

Division by zero yields an unspecified, machine-specific value.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
div.s32  b,n,i;\n
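A hedged companion sketch: PTX exposes the remainder through the separate rem instruction, so quotient and remainder are computed in two steps (register q assumed).

div.s32  q, n, i;\nrem.s32  r, n, i;   // n == q * i + r (for i != 0)\n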
\n
\n
", "tooltip": "=====Floating Point Instructions: div\n\n\n\nDivide one value by another.\n\nSyntax\n\ndiv.approx{.ftz}.f32 d, a, b; // fast, approximate divide\n\ndiv.full{.ftz}.f32 d, a, b; // full-range approximate divide\n\ndiv.rnd{.ftz}.f32 d, a, b; // IEEE 754 compliant rounding\n\ndiv.rnd.f64 d, a, b; // IEEE 754 compliant rounding\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nDivides a by b, stores result in d.\n\nSemantics\n\nd = a / b;\n\nNotes\n\nFast, a...\n\n=====Integer Arithmetic Instructions: div\n\n\n\nDivide one value by another.\n\nSyntax\n\ndiv.type d, a, b;\n\n.type = { .u16, .u32, .u64,\n\n .s16, .s32, .s64 };\n\nDescription\n\nDivides a by b, stores result in d.\n\nSemantics\n\nd = a / b;\n\nNotes\n\nDivision by zero yields an unspecified, machine-specific value.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\ndiv.s32 b,n,i;\n\n... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div" }; case "dp2a": return { "html": "For more information, visit dp2a(int) .

Integer Arithmetic Instructions: dp2a

\n\n\n

Two-way dot product-accumulate.

\n

Syntax

\n
dp2a.mode.atype.btype  d, a, b, c;\n\n.atype = .btype = { .u32, .s32 };\n.mode = { .lo, .hi };\n
\n
\n

Description

\n

Two-way 16-bit to 8-bit dot product which is accumulated in a 32-bit result.

\n

Operands a and b are 32-bit inputs. Operand a holds two 16-bit inputs in packed form, and\noperand b holds four byte inputs in packed form for the dot product.

\n

Depending on the .mode specified, either the lower half or the upper half of operand b will be\nused for the dot product.

\n

Operand c has type .u32 if both .atype and .btype are .u32 else operand c\nhas type .s32.

\n

Semantics

\n
d = c;\n// Extract two 16-bit values from a 32-bit input and sign or zero extend\n// based on input type.\nVa = extractAndSignOrZeroExt_2(a, .atype);\n\n// Extract four 8-bit values from a 32-bit input and sign or zero extend\n// based on input type.\nVb = extractAndSignOrZeroExt_4(b, .btype);\n\nb_select = (.mode == .lo) ? 0 : 2;\n\nfor (i = 0; i < 2; ++i) {\n    d += Va[i] * Vb[b_select + i];\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 5.0.

\n

Target ISA Notes

\n

Requires sm_61 or higher.

\n

Examples

\n
dp2a.lo.u32.u32           d0, a0, b0, c0;\ndp2a.hi.u32.s32           d1, a1, b1, c1;\n
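A hedged worked example (values assumed; low-to-high lane order taken from the extraction helpers above):

dp2a.lo.u32.u32  d, a, b, c;\n// a = 0x00020001 -> Va = {1, 2};  b = 0x04030201 -> Vb = {1, 2, 3, 4}\n// .lo selects Vb[0..1]:  d = c + 1*1 + 2*2 = c + 5\n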
\n
\n
", "tooltip": "Two-way dot product-accumulate.\n\nSyntax\n\ndp2a.mode.atype.btype d, a, b, c;\n\n.atype = .btype = { .u32, .s32 };\n\n.mode = { .lo, .hi };\n\nDescription\n\nTwo-way 16-bit to 8-bit dot product which is accumulated in 32-bit result.\n\nOperand a and b are 32-bit inputs. Operand a holds two 16-bits inputs in packed form and\n\noperand b holds 4 byte inputs in packed form for dot product.\n\nDepending on the .mode specified, either lower half or upper half of operand b will be used\n\nfor dot product.\n\nOperand c has type .u32 if both .atype and .btype are .u32 else operand c\n\nhas type .s32.\n\nSemantics\n\nd = c;\n\n// Extract two 16-bit values from a 32-bit input and sign or zero extend\n\n// based on input type.\n\nVa = extractAndSignOrZeroExt_2(a, .atype);\n\n// Extract four 8-bit values from a 32-bit input and sign or zer extend\n\n// based on input type.\n\nVb = extractAndSignOrZeroExt_4(b, .btype);\n\nb_select = (.mode == .lo) ? 0 : 2;\n\nfor (i = 0; i < 2; ++i) {\n\n d += Va[i] * Vb[b_select + i];\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 5.0.\n\nTarget ISA Notes\n\nRequires sm_61 or higher.\n\nExamples\n\ndp2a.lo.u32.u32 d0, a0, b0, c0;\n\ndp2a.hi.u32.s32 d1, a1, b1, c1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp2a" }; case "dp4a": return { "html": "For more information, visit dp4a(int) .

Integer Arithmetic Instructions: dp4a

\n\n\n

Four-way byte dot product-accumulate.

\n

Syntax

\n
dp4a.atype.btype  d, a, b, c;\n\n.atype = .btype = { .u32, .s32 };\n
\n
\n

Description

\n

Four-way byte dot product which is accumulated in a 32-bit result.

\n

Operands a and b are 32-bit inputs which hold four byte inputs in packed form for the dot\nproduct.

\n

Operand c has type .u32 if both .atype and .btype are .u32 else operand c\nhas type .s32.

\n

Semantics

\n
d = c;\n\n// Extract 4 bytes from a 32-bit input and sign or zero extend\n// based on input type.\nVa = extractAndSignOrZeroExt_4(a, .atype);\nVb = extractAndSignOrZeroExt_4(b, .btype);\n\nfor (i = 0; i < 4; ++i) {\n    d += Va[i] * Vb[i];\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 5.0.

\n

Target ISA Notes

\n

Requires sm_61 or higher.

\n

Examples

\n
dp4a.u32.u32           d0, a0, b0, c0;\ndp4a.u32.s32           d1, a1, b1, c1;\n
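A hedged worked example (values assumed, bytes taken low-to-high):

dp4a.u32.u32  d, a, b, c;\n// a = 0x04030201 -> Va = {1, 2, 3, 4};  b = 0x01010101 -> Vb = {1, 1, 1, 1}\n// d = c + 1 + 2 + 3 + 4 = c + 10\n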
\n
\n
", "tooltip": "Four-way byte dot product-accumulate.\n\nSyntax\n\ndp4a.atype.btype d, a, b, c;\n\n.atype = .btype = { .u32, .s32 };\n\nDescription\n\nFour-way byte dot product which is accumulated in 32-bit result.\n\nOperand a and b are 32-bit inputs which hold 4 byte inputs in packed form for dot product.\n\nOperand c has type .u32 if both .atype and .btype are .u32 else operand c\n\nhas type .s32.\n\nSemantics\n\nd = c;\n\n// Extract 4 bytes from a 32bit input and sign or zero extend\n\n// based on input type.\n\nVa = extractAndSignOrZeroExt_4(a, .atype);\n\nVb = extractAndSignOrZeroExt_4(b, .btype);\n\nfor (i = 0; i < 4; ++i) {\n\n d += Va[i] * Vb[i];\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 5.0.\n\nTarget ISA Notes\n\nRequires sm_61 or higher.\n\nExamples\n\ndp4a.u32.u32 d0, a0, b0, c0;\n\ndp4a.u32.s32 d1, a1, b1, c1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp4a" }; case "dynamic_smem_size": return { "html": "For more information, visit dynamic_smem_size .

Special Registers: %dynamic_smem_size

\n\n\n

Size of shared memory allocated dynamically at kernel launch.

\n

Syntax (predefined)

\n
.sreg .u32 %dynamic_smem_size;\n
\n
\n

Description

\n

Size of shared memory allocated dynamically at kernel launch.

\n

A predefined, read-only special register initialized with the size of shared memory allocated dynamically for the CTA of a kernel at launch time.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 4.1.

\n

Target ISA Notes

\n

Requires sm_20 or higher.

\n

Examples

\n
mov.u32  %r, %dynamic_smem_size;\n
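A hedged sketch (registers and label assumed): skipping dynamic shared-memory setup when nothing was allocated at launch.

mov.u32      %r0, %dynamic_smem_size;\nsetp.eq.u32  %p0, %r0, 0;\n@%p0 bra     SKIP_DYN_SMEM;\n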
\n
\n
", "tooltip": "Size of shared memory allocated dynamically at kernel launch.\n\nSyntax (predefined)\n\n.sreg .u32 %dynamic_smem_size;\n\nDescription\n\nSize of shared memory allocated dynamically at kernel launch.\n\nA predefined, read-only special register initialized with size of shared memory allocated dynamically for the CTA of a kernel at launch time.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 4.1.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\nExamples\n\nmov.u32 %r, %dynamic_smem_size;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-dynamic-smem-size" }; case "elect": return { "html": "For more information, visit elect.sync .

Parallel Synchronization and Communication Instructions: elect.sync

\n\n\n

Elect a leader thread from a set of threads.

\n

Syntax

\n
elect.sync d|p, membermask;\n
\n
\n

Description

\n

elect.sync elects one predicated active leader thread from among a set of threads specified by\nmembermask. laneid of the elected thread is returned in the 32-bit destination operand\nd. The sink symbol \u2018_\u2019 can be used for destination operand d. The predicate destination\np is set to True for the leader thread, and False for all other threads.

\n

Operand membermask specifies a 32-bit integer indicating the set of threads from which a leader\nis to be elected. The behavior is undefined if the executing thread is not in membermask.

\n

Election of a leader thread happens deterministically, i.e. the same leader thread is elected for\nthe same membermask every time.

\n

The mandatory .sync qualifier indicates that elect causes the executing thread to wait until\nall threads in the membermask execute the elect instruction before resuming execution.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
elect.sync    %r0|%p0, 0xffffffff;\n
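A hedged usage sketch: the predicate gates leader-only work, and the sink symbol discards the lane id when it is not needed (the shared variable flag is a placeholder).

elect.sync  _|%p1, 0xffffffff;\n@%p1  st.shared.u32 [flag], 1;   // only the elected leader stores\n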
\n
\n
", "tooltip": "Elect a leader thread from a set of threads.\n\nSyntax\n\nelect.sync d|p, membermask;\n\nDescription\n\nelect.sync elects one predicated active leader thread from among a set of threads specified by\n\nmembermask. laneid of the elected thread is returned in the 32-bit destination operand\n\nd. The sink symbol \u2018_\u2019 can be used for destination operand d. The predicate destination\n\np is set to True for the leader thread, and False for all other threads.\n\nOperand membermask specifies a 32-bit integer indicating the set of threads from which a leader\n\nis to be elected. The behavior is undefined if the executing thread is not in membermask.\n\nElection of a leader thread happens deterministically, i.e. the same leader thread is elected for\n\nthe same membermask every time.\n\nThe mandatory .sync qualifier indicates that elect causes the executing thread to wait until\n\nall threads in the membermask execute the elect instruction before resuming execution.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 8.0.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\nelect.sync %r0|%p0, 0xffffffff;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync" }; case "entry": return { "html": "For more information, visit entry .

Kernel and Function Directives: .entry

\n\n\n

Kernel entry point and body, with optional parameters.

\n

Syntax

\n
.entry kernel-name ( param-list )  kernel-body\n.entry kernel-name  kernel-body\n
\n
\n

Description

\n

Defines a kernel entry point name, parameters, and body for the kernel function.

\n

Parameters are passed via .param space memory and are listed within an optional parenthesized\nparameter list. Parameters may be referenced by name within the kernel body and loaded into\nregisters using ld.param instructions.

\n

In addition to normal parameters, opaque .texref, .samplerref, and .surfref variables\nmay be passed as parameters. These parameters can only be referenced by name within texture and\nsurface load, store, and query instructions and cannot be accessed via ld.param instructions.

\n

The shape and size of the CTA executing the kernel are available in special registers.

\n

Semantics

\n

Specify the entry point for a kernel program.

\n

At kernel launch, the kernel dimensions and properties are established and made available via\nspecial registers, e.g., %ntid, %nctaid, etc.

\n

PTX ISA Notes

\n

For PTX ISA version 1.4 and later, parameter variables are declared in the kernel parameter\nlist. For PTX ISA versions 1.0 through 1.3, parameter variables are declared in the kernel body.

\n

The maximum memory size supported by PTX for normal (non-opaque type) parameters is 32764\nbytes. Depending upon the PTX ISA version, the parameter size limit varies. The following table\nshows the allowed parameter size for a PTX ISA version:

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

PTX ISA Version

Maximum parameter size (In bytes)

PTX ISA version 8.1 and above

32764

PTX ISA version 1.5 and above

4352

PTX ISA version 1.4 and above

256

\n

The CUDA and OpenCL drivers support the following limits for parameter memory:

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Driver

Parameter memory size

CUDA

256 bytes for sm_1x, 4096 bytes for sm_2x and higher,\n32764 bytes for sm_70 and higher

OpenCL

32764 bytes for sm_70 and higher, 4352 bytes on sm_6x\nand lower

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
.entry cta_fft\n.entry filter ( .param .b32 x, .param .b32 y, .param .b32 z )\n{\n    .reg .b32 %r<99>;\n    ld.param.b32  %r1, [x];\n    ld.param.b32  %r2, [y];\n    ld.param.b32  %r3, [z];\n    ...\n}\n\n.entry prefix_sum ( .param .align 4 .s32 pitch[8000] )\n{\n    .reg .s32 %t;\n    ld.param.s32  %t, [pitch];\n    ...\n}\n
\n
\n
", "tooltip": "Kernel entry point and body, with optional parameters.\n\nSyntax\n\n.entry kernel-name ( param-list ) kernel-body\n\n.entry kernel-name kernel-body\n\nDescription\n\nDefines a kernel entry point name, parameters, and body for the kernel function.\n\nParameters are passed via .param space memory and are listed within an optional parenthesized\n\nparameter list. Parameters may be referenced by name within the kernel body and loaded into\n\nregisters using ld.param instructions.\n\nIn addition to normal parameters, opaque .texref, .samplerref, and .surfref variables\n\nmay be passed as parameters. These parameters can only be referenced by name within texture and\n\nsurface load, store, and query instructions and cannot be accessed via ld.param instructions.\n\nThe shape and size of the CTA executing the kernel are available in special registers.\n\nSemantics\n\nSpecify the entry point for a kernel program.\n\nAt kernel launch, the kernel dimensions and properties are established and made available via\n\nspecial registers, e.g., %ntid, %nctaid, etc.\n\nPTX ISA Notes\n\nFor PTX ISA version 1.4 and later, parameter variables are declared in the kernel parameter\n\nlist. For PTX ISA versions 1.0 through 1.3, parameter variables are declared in the kernel body.\n\nThe maximum memory size supported by PTX for normal (non-opaque type) parameters is 32764\n\nbytes. Depending upon the PTX ISA version, the parameter size limit varies. The following table\n\nshows the allowed parameter size for a PTX ISA version:\n\n\n\nPTX ISA Version\n\nMaximum parameter size (In bytes)\n\nPTX ISA version 8.1 and above\n\n32764\n\nPTX ISA version 1.5 and above\n\n4352\n\nPTX ISA version 1.4 and above\n\n256\n\nThe CUDA and OpenCL drivers support the following limits for parameter memory:\n\n\n\nDriver\n\nParameter memory size\n\nCUDA\n\n256 bytes for sm_1x, 4096 bytes for sm_2x and higher,\n\n32764 bytes fo sm_70 and higher\n\nOpenCL\n\n32764 bytes for sm_70 and higher, 4352 bytes on sm_6x\n\nand lower\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.entry cta_fft\n\n.entry filter ( .param .b32 x, .param .b32 y, .param .b32 z )\n\n{\n\n .reg .b32 %r<99>;\n\n ld.param.b32 %r1, [x];\n\n ld.param.b32 %r2, [y];\n\n ld.param.b32 %r3, [z];\n\n ...\n\n}\n\n.entry prefix_sum ( .param .align 4 .s32 pitch[8000] )\n\n{\n\n .reg .s32 %t;\n\n ld.param.s32 %t, [pitch];\n\n ...\n\n}\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#kernel-and-function-directives-entry" }; case "envreg<32>": return { "html": "For more information, visit envreg<32> .

Special Registers: %envreg<32>

\n\n\n

Driver-defined read-only registers.

\n

Syntax (predefined)

\n
.sreg .b32 %envreg<32>;\n
\n
\n

Description

\n

A set of 32 pre-defined read-only registers used to capture execution environment of PTX program\noutside of PTX virtual machine. These registers are initialized by the driver prior to kernel launch\nand can contain cta-wide or grid-wide values.

\n

Precise semantics of these registers is defined in the driver documentation.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.1.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
mov.b32      %r1,%envreg0;  // move envreg0 to %r1\n
\n
\n
", "tooltip": "Driver-defined read-only registers.\n\nSyntax (predefined)\n\n.sreg .b32 %envreg<32>;\n\nDescription\n\nA set of 32 pre-defined read-only registers used to capture execution environment of PTX program\n\noutside of PTX virtual machine. These registers are initialized by the driver prior to kernel launch\n\nand can contain cta-wide or grid-wide values.\n\nPrecise semantics of these registers is defined in the driver documentation.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.1.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.b32 %r1,%envreg0; // move envreg0 to %r1\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-envreg-32" }; case "ex2": return { "html": "For more information, visit ex2(fp) , ex2(fp16) .

Floating Point Instructions: ex2

\n\n\n

Find the base-2 exponential of a value.

\n

Syntax

\n
ex2.approx{.ftz}.f32  d, a;\n
\n
\n

Description

\n

Raise 2 to the power a.

\n

Semantics

\n
d = 2 ^ a;\n
\n
\n

Notes

\n

ex2.approx.f32 implements a fast approximation to 2^a.

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Input

Result

-Inf

+0.0

-subnormal

+1.0

-0.0

+1.0

+0.0

+1.0

+subnormal

+1.0

+Inf

+Inf

NaN

NaN

\n

The maximum absolute error is 2^-22.5 for fraction in the primary range.

\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

ex2.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
sm_1x

Subnormal inputs and results are flushed to sign-preserving zero.

\n
\n
\n

PTX ISA Notes

\n

ex2.f32 introduced in PTX ISA version 1.0. Explicit modifiers .approx and .ftz\nintroduced in PTX ISA version 1.4.

\n

For PTX ISA version 1.4 and later, the .approx modifier is required.

\n

For PTX ISA versions 1.0 through 1.3, ex2.f32 defaults to ex2.approx.ftz.f32.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
ex2.approx.ftz.f32  xa, a;\n
\n
\n
\n

Half Precision Floating Point Instructions: ex2

\n\n\n

Find the base-2 exponent of input.

\n

Syntax

\n
ex2.approx.atype     d, a;\nex2.approx.ftz.btype d, a;\n\n.atype = { .f16,  .f16x2}\n.btype = { .bf16, .bf16x2}\n
\n
\n

Description

\n

Raise 2 to the power a.

\n

The type of operands d and a are as specified by .type.

\n

For .f16x2 or .bf16x2 instruction type, each of the half-word operands are operated in\nparallel and the results are packed appropriately into a .f16x2 or .bf16x2.

\n

Semantics

\n
if (.type == .f16 || .type == .bf16) {\n  d = 2 ^ a\n} else if (.type == .f16x2 || .type == .bf16x2) {\n  fA[0] = a[0:15];\n  fA[1] = a[16:31];\n  d[0] = 2 ^ fA[0]\n  d[1] = 2 ^ fA[1]\n}\n
\n
\n

Notes

\n

ex2.approx.{f16, f16x2, bf16, bf16x2} implement a fast approximation to 2^a.

\n

For the .f16 type, subnormal inputs are supported. ex2.approx.ftz.bf16 flushes subnormal\ninputs and results to sign-preserving zero.

\n

Results of ex2.approx.ftz.bf16 for various corner-case inputs are as follows:

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Input

Result

-Inf

+0.0

-subnormal

+1.0

-0.0

+1.0

+0.0

+1.0

+subnormal

+1.0

+Inf

+Inf

NaN

NaN

\n

Results of ex2.approx.f16 for various corner-case inputs are as follows:

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Input

Result

-Inf

+0.0

-0.0

+1.0

+0.0

+1.0

+Inf

+Inf

NaN

NaN

\n

The maximum relative error for .f16 type is 2^-9.9. The maximum relative error for .bf16 type\nis 2^-7.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

ex2.approx.ftz.{bf16/bf16x2} introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_75 or higher.

\n

ex2.approx.ftz.{bf16/bf16x2} requires sm_90 or higher.

\n

Examples

\n
ex2.approx.f16         h1, h0;\nex2.approx.f16x2       hd1, hd0;\nex2.approx.ftz.bf16    b1, b2;\nex2.approx.ftz.bf16x2  hb1, hb2;\n
\n
\n
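As an illustrative bridge to CUDA C++ (helper name ours; the fast-math exp2f path is believed, not guaranteed, to use this instruction), the f32 form can be invoked directly:

// Approximate 2^x with ex2.approx.ftz.f32; max abs error 2^-22.5 (see above).
__device__ float exp2_approx(float x) {
    float y;
    asm("ex2.approx.ftz.f32 %0, %1;" : "=f"(y) : "f"(x));
    return y;
}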
", "tooltip": "=====Floating Point Instructions: ex2\n\n\n\nFind the base-2 exponential of a value.\n\nSyntax\n\nex2.approx{.ftz}.f32 d, a;\n\nDescription\n\nRaise 2 to the power a.\n\nSemantics\n\nd = 2 ^ a;\n\nNotes\n\nex2.approx.f32 implements a fast approximation to 2a.\n\n\n\nInput\n\nResult\n\n-Inf\n\n+0.0\n\n-subnormal\n\n+1.0\n\n-0.0\n\n+1.0\n\n+0.0\n\n+1.0\n\n+subnormal\n\n+1.0\n\n+Inf\n\n+Inf\n\nNaN\n\nNaN\n\nThe maximum absolute error is 2-22.5 for fraction in the primary range.\n\nSubnormal numbers:\n\nsm_20+By default, subno...\n\n=====Half Precision Floating Point Instructions: ex2\n\n\n\nFind the base-2 exponent of input.\n\nSyntax\n\nex2.approx.atype d, a;\n\nex2.approx.ftz.btype d, a;\n\n.atype = { .f16, .f16x2}\n\n.btype = { .bf16, .bf16x2}\n\nDescription\n\nRaise 2 to the power a.\n\nThe type of operands d and a are as specified by .type.\n\nFor .f16x2 or .bf16x2 instruction type, each of the half-word operands are operated in\n\nparallel and the results are packed appropriately into a .f16x2 or .bf16... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2" }; case "exit": return { "html": "For more information, visit exit .

Control Flow Instructions: exit

\n\n\n

Terminate a thread.

\n

Syntax

\n
exit;\n
\n
\n

Description

\n

Ends execution of a thread.

\n

As threads exit, barriers waiting on all threads are checked to see if the exiting threads are the\nonly threads that have not yet made it to a barrier{.cta} for all threads in the CTA. If the exiting threads are holding up the\nbarrier, the barrier is released.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
    exit;\n@p  exit;\n
\n
\n
", "tooltip": "Terminate a thread.\n\nSyntax\n\nexit;\n\nDescription\n\nEnds execution of a thread.\n\nAs threads exit, barriers waiting on all threads are checked to see if the exiting threads are the\n\nonly threads that have not yet made it to a barrier{.cta} for all threads in the CTA. If the exiting threads are holding up the\n\nbarrier, the barrier is released.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n exit;\n\n@p exit;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit" }; case "explicitcluster": return { "html": "For more information, visit explicitcluster .

Cluster Dimension Directives: .explicitcluster

\n\n\n

Declare that Kernel must be launched with cluster dimensions explicitly specified.

\n

Syntax

\n
.explicitcluster\n
\n
\n

Description

\n

Declares that this Kernel should be launched with cluster dimension explicitly specified.

\n

Semantics

\n

Kernels with the .explicitcluster directive must be launched with the cluster dimension explicitly\nspecified (either at launch time or via .reqnctapercluster); otherwise the program will fail with\na runtime error or kernel launch failure.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
.entry foo .explicitcluster         { . . . }\n
\n
\n
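For context, a hedged host-side sketch (assumes the CUDA 11.8+ runtime and sm_90; the kernel name is hypothetical) of a launch that satisfies this directive by specifying the cluster shape explicitly:

#include <cuda_runtime.h>
__global__ void foo();  // assumed to be compiled with .explicitcluster

void launch_foo() {
    cudaLaunchConfig_t cfg = {};
    cfg.gridDim  = dim3(8);
    cfg.blockDim = dim3(128);
    cudaLaunchAttribute attr = {};
    attr.id = cudaLaunchAttributeClusterDimension;
    attr.val.clusterDim = {2, 1, 1};   // 2 CTAs per cluster
    cfg.attrs    = &attr;
    cfg.numAttrs = 1;
    cudaLaunchKernelEx(&cfg, foo);     // explicit cluster dimension at launch
}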
", "tooltip": "Declare that Kernel must be launched with cluster dimensions explicitly specified.\n\nSyntax\n\n.explicitcluster\n\nDescription\n\nDeclares that this Kernel should be launched with cluster dimension explicitly specified.\n\nSemantics\n\nKernels with .explicitcluster directive must be launched with cluster dimension explicitly\n\nspecified (either at launch time or via .reqnctapercluster), otherwise program will fail with\n\nruntime error or kernel launch failure.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.entry foo .explicitcluster { . . . }\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-explicitcluster" }; case "extern": return { "html": "For more information, visit extern .

Linking Directives: .extern

\n\n\n

External symbol declaration.

\n

Syntax

\n
.extern identifier\n
\n
\n

Description

\n

Declares identifier to be defined external to the current module. The identifier must be declared\n.visible in the module where it is defined.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
.extern .global .b32 foo;  // foo is defined in another module\n
\n
\n
", "tooltip": "External symbol declaration.\n\nSyntax\n\n.extern identifier\n\nDescription\n\nDeclares identifier to be defined external to the current module. The identifier must be declared\n\n.visible in the module where it is defined.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.extern .global .b32 foo; // foo is defined in another module\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#linking-directives-extern" }; case "file": return { "html": "For more information, visit file .

Debugging Directives: .file

\n\n\n

Source file name.

\n

Syntax

\n
.file file_index \"filename\" {, timestamp, file_size}\n
\n
\n

Description

\n

Associates a source filename with an integer index. .loc directives reference source files by\nindex.

\n

.file directive allows optionally specifying an unsigned number representing time of last\nmodification and an unsigned integer representing size in bytes of source file. timestamp and\nfile_size value can be 0 to indicate this information is not available.

\n

timestamp value is in format of C and C++ data type time_t.

\n

file_size is an unsigned 64-bit integer.

\n

The .file directive is allowed only in the outermost scope, i.e., at the same level as kernel\nand device function declarations.

\n

Semantics

\n

If timestamp and file size are not specified, they default to 0.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Timestamp and file size introduced in PTX ISA version 3.2.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
.file 1 \"example.cu\"\n.file 2 \"kernel.cu\"\n.file 1 \u201ckernel.cu\u201d, 1339013327, 64118\n
\n
\n
", "tooltip": "Source file name.\n\nSyntax\n\n.file file_index \"filename\" {, timestamp, file_size}\n\nDescription\n\nAssociates a source filename with an integer index. .loc directives reference source files by\n\nindex.\n\n.file directive allows optionally specifying an unsigned number representing time of last\n\nmodification and an unsigned integer representing size in bytes of source file. timestamp and\n\nfile_size value can be 0 to indicate this information is not available.\n\ntimestamp value is in format of C and C++ data type time_t.\n\nfile_size is an unsigned 64-bit integer.\n\nThe .file directive is allowed only in the outermost scope, i.e., at the same level as kernel\n\nand device function declarations.\n\nSemantics\n\nIf timestamp and file size are not specified, they default to 0.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTimestamp and file size introduced in PTX ISA version 3.2.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.file 1 \"example.cu\"\n\n.file 2 \"kernel.cu\"\n\n.file 1 \u201ckernel.cu\u201d, 1339013327, 64118\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-file" }; case "fma": return { "html": "For more information, visit fma(fp) , fma(fp16) .

Floating Point Instructions: fma

\n\n\n

Fused multiply-add.

\n

Syntax

\n
fma.rnd{.ftz}{.sat}.f32  d, a, b, c;\nfma.rnd.f64              d, a, b, c;\n\n.rnd = { .rn, .rz, .rm, .rp };\n
\n
\n

Description

\n

Performs a fused multiply-add with no loss of precision in the intermediate product and addition.

\n

Semantics

\n
d = a*b + c;\n
\n
\n

Notes

\n

fma.f32 computes the product of a and b to infinite precision and then adds c to\nthis product, again in infinite precision. The resulting value is then rounded to single precision\nusing the rounding mode specified by .rnd.

\n

fma.f64 computes the product of a and b to infinite precision and then adds c to\nthis product, again in infinite precision. The resulting value is then rounded to double precision\nusing the rounding mode specified by .rnd.

\n

fma.f64 is the same as mad.f64.

\n

Rounding modifiers (no default):

\n
\n
.rn

mantissa LSB rounds to nearest even

\n
\n
.rz

mantissa LSB rounds towards zero

\n
\n
.rm

mantissa LSB rounds towards negative infinity

\n
\n
.rp

mantissa LSB rounds towards positive infinity

\n
\n
\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

fma.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
sm_1x

fma.f64 supports subnormal numbers.

\n

fma.f32 is unimplemented for sm_1x targets.

\n
\n
\n

Saturation:

\n

fma.sat.f32 clamps the result to [0.0, 1.0]. NaN results are flushed to +0.0f.

\n

PTX ISA Notes

\n

fma.f64 introduced in PTX ISA version 1.4.

\n

fma.f32 introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

fma.f32 requires sm_20 or higher.

\n

fma.f64 requires sm_13 or higher.

\n

Examples

\n
    fma.rn.ftz.f32  w,x,y,z;\n@p  fma.rn.f64      d,a,b,c;\n
\n
\n
\n

Half Precision Floating Point Instructions: fma

\n\n\n

Fused multiply-add

\n

Syntax

\n
fma.rnd{.ftz}{.sat}.f16     d, a, b, c;\nfma.rnd{.ftz}{.sat}.f16x2   d, a, b, c;\nfma.rnd{.ftz}.relu.f16      d, a, b, c;\nfma.rnd{.ftz}.relu.f16x2    d, a, b, c;\nfma.rnd{.relu}.bf16         d, a, b, c;\nfma.rnd{.relu}.bf16x2       d, a, b, c;\nfma.rnd.oob.{relu}.type     d, a, b, c;\n\n.rnd = { .rn };\n
\n
\n

Description

\n

Performs a fused multiply-add with no loss of precision in the intermediate product and addition.

\n

For .f16x2 and .bf16x2 instruction type, forms input vectors by half word values from source\noperands. Half-word operands are then operated in parallel to produce .f16x2 or .bf16x2\nresult in destination.

\n

For .f16 instruction type, operands d, a, b and c have .f16 or .b16\ntype. For .f16x2 instruction type, operands d, a, b and c have .b32\ntype. For .bf16 instruction type, operands d, a, b and c have .b16 type. For\n.bf16x2 instruction type, operands d, a, b and c have .b32 type.

\n

Semantics

\n
if (type == f16 || type == bf16) {\n    d = a * b + c;\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    fC[0] = c[0:15];\n    fC[1] = c[16:31];\n    for (i = 0; i < 2; i++) {\n         d[i] = fA[i] * fB[i] + fC[i];\n    }\n}\n
\n
\n

Notes

\n

Rounding modifiers (default is .rn):

\n
\n
.rn

mantissa LSB rounds to nearest even

\n
\n
Subnormal numbers:

By default, subnormal numbers are supported.\nfma.ftz.{f16, f16x2} flushes subnormal inputs and results to sign-preserving zero.

\n
\n
Saturation modifier:

fma.sat.{f16, f16x2} clamps the result to [0.0, 1.0]. NaN results are flushed to +0.0f.\nfma.relu.{f16, f16x2, bf16, bf16x2} clamps the result to 0 if negative. NaN result is\nconverted to canonical NaN.

\n
\n
Out Of Bounds modifier:

fma.oob.{f16, f16x2, bf16, bf16x2} clamps the result to 0 if either of the operands\nis the OOB NaN value (defined under Tensors). The test for the special NaN value\nand resultant forcing of the result to +0.0 is performed independently for each of the\ntwo SIMD operations.

\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 4.2.

\n

fma.relu.{f16, f16x2} and fma{.relu}.{bf16, bf16x2} introduced in PTX ISA version 7.0.

\n

Support for modifier .oob introduced in PTX ISA version 8.1.

\n

Target ISA Notes

\n

Requires sm_53 or higher.

\n

fma.relu.{f16, f16x2} and fma{.relu}.{bf16, bf16x2} require sm_80 or higher.

\n

fma{.oob}.{f16, f16x2, bf16, bf16x2} requires sm_90 or higher.

\n

Examples

\n
// scalar f16 fused multiply-add\nfma.rn.f16         d0, a0, b0, c0;\nfma.rn.f16         d1, a1, b1, c1;\nfma.rn.relu.f16    d1, a1, b1, c1;\nfma.rn.oob.f16      d1, a1, b1, c1;\nfma.rn.oob.relu.f16 d1, a1, b1, c1;\n\n// scalar bf16 fused multiply-add\nfma.rn.bf16        d1, a1, b1, c1;\nfma.rn.relu.bf16   d1, a1, b1, c1;\nfma.rn.oob.bf16       d1, a1, b1, c1;\nfma.rn.oob.relu.bf16  d1, a1, b1, c1;\n\n// SIMD f16 fused multiply-add\ncvt.rn.f16.f32 h0, f0;\ncvt.rn.f16.f32 h1, f1;\ncvt.rn.f16.f32 h2, f2;\ncvt.rn.f16.f32 h3, f3;\nmov.b32  p1, {h0, h1}; // pack two f16 to 32bit f16x2\nmov.b32  p2, {h2, h3}; // pack two f16 to 32bit f16x2\nfma.rn.f16x2  p3, p1, p2, p2;   // SIMD f16x2 fused multiply-add\nfma.rn.relu.f16x2  p3, p1, p2, p2; // SIMD f16x2 fused multiply-add with relu saturation mode\nfma.rn.oob.f16x2  p3, p1, p2, p2; // SIMD f16x2 fused multiply-add with oob modifier\nfma.rn.oob.relu.f16x2 p3, p1, p2, p2; // SIMD f16x2 fused multiply-add with oob modifier and relu saturation mode\n\n// SIMD fp16 fused multiply-add\nld.global.b32   f0, [addr];     // load 32 bit which hold packed f16x2\nld.global.b32   f1, [addr + 4]; // load 32 bit which hold packed f16x2\nfma.rn.f16x2    f2, f0, f1, f1; // SIMD f16x2 fused multiply-add\n\n// SIMD bf16 fused multiply-add\nfma.rn.bf16x2       f2, f0, f1, f1; // SIMD bf16x2 fused multiply-add\nfma.rn.relu.bf16x2  f2, f0, f1, f1; // SIMD bf16x2 fused multiply-add with relu saturation mode\nfma.rn.oob.bf16x2  f2, f0, f1, f1; // SIMD bf16x2 fused multiply-add with oob modifier\nfma.rn.oob.relu.bf16x2  f2, f0, f1, f1; // SIMD bf16x2 fused multiply-add with oob modifier and relu saturation mode\n
\n
\n
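As a hedged mapping back to CUDA C++ (helper names ours): the standard math functions compile to the scalar forms above, and __hfma2 from cuda_fp16.h is believed to produce the packed f16x2 form:

#include <cuda_fp16.h>

__device__ float  fma_f32(float a, float b, float c)    { return fmaf(a, b, c); } // fma.rn.f32
__device__ double fma_f64(double a, double b, double c) { return fma(a, b, c);  } // fma.rn.f64
__device__ __half2 fma_f16x2(__half2 a, __half2 b, __half2 c) {
    return __hfma2(a, b, c);  // packed SIMD fused multiply-add
}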
", "tooltip": "=====Floating Point Instructions: fma\n\n\n\nFused multiply-add.\n\nSyntax\n\nfma.rnd{.ftz}{.sat}.f32 d, a, b, c;\n\nfma.rnd.f64 d, a, b, c;\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nPerforms a fused multiply-add with no loss of precision in the intermediate product and addition.\n\nSemantics\n\nd = a*b + c;\n\nNotes\n\nfma.f32 computes the product of a and b to infinite precision and then adds c to\n\nthis product, again in infinite precision. The r...\n\n=====Half Precision Floating Point Instructions: fma\n\n\n\nFused multiply-add\n\nSyntax\n\nfma.rnd{.ftz}{.sat}.f16 d, a, b, c;\n\nfma.rnd{.ftz}{.sat}.f16x2 d, a, b, c;\n\nfma.rnd{.ftz}.relu.f16 d, a, b, c;\n\nfma.rnd{.ftz}.relu.f16x2 d, a, b, c;\n\nfma.rnd{.relu}.bf16 d, a, b, c;\n\nfma.rnd{.relu}.bf16x2 d, a, b, c;\n\nfma.rnd.oob.{relu}.type d, a, b, c;\n\n.rnd = { .rn };\n\nDescription\n\nPerforms a fused multiply-add with no loss of precision in the int... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma" }; case "fns": return { "html": "For more information, visit fns(int) .

Integer Arithmetic Instructions: fns

\n\n\n

Find the n-th set bit

\n

Syntax

\n
fns.b32 d, mask, base, offset;\n
\n
\n

Description

\n

Given a 32-bit value mask and an integer value base (between 0 and 31), find the n-th (given\nby offset) set bit in mask from the base bit, and store the bit position in d. If not\nfound, store 0xffffffff in d.

\n

Operand mask has a 32-bit type. Operand base has .b32, .u32 or .s32\ntype. Operand offset has .s32 type. Destination d has type .b32.

\n

Operand base must be <= 31, otherwise behavior is undefined.

\n

Semantics

\n
d = 0xffffffff;\nif (offset == 0) {\n    if (mask[base] == 1) {\n        d = base;\n    }\n} else {\n    pos = base;\n    count = |offset| - 1;\n    inc = (offset > 0) ? 1 : -1;\n\n    while ((pos >= 0) && (pos < 32)) {\n        if (mask[pos] == 1) {\n            if (count == 0) {\n                d = pos;\n                break;\n            } else {\n                count = count - 1;\n            }\n        }\n        pos = pos + inc;\n    }\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 6.0.

\n

Target ISA Notes

\n

fns requires sm_30 or higher.

\n

Examples

\n
fns.b32 d, 0xaaaaaaaa, 3, 1;   // d = 3\nfns.b32 d, 0xaaaaaaaa, 3, -1;  // d = 3\nfns.b32 d, 0xaaaaaaaa, 2, 1;   // d = 3\nfns.b32 d, 0xaaaaaaaa, 2, -1;  // d = 1\n
\n
\n
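A minimal CUDA C++ wrapper sketch (name ours; assumes sm_30 or higher) matching the semantics above:

// Returns the position of the n-th set bit, or 0xffffffff if none is found.
__device__ unsigned int fns_b32(unsigned int mask, unsigned int base, int offset) {
    unsigned int d;
    asm("fns.b32 %0, %1, %2, %3;" : "=r"(d) : "r"(mask), "r"(base), "r"(offset));
    return d;
}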
", "tooltip": "Find the n-th set bit\n\nSyntax\n\nfns.b32 d, mask, base, offset;\n\nDescription\n\nGiven a 32-bit value mask and an integer value base (between 0 and 31), find the n-th (given\n\nby offset) set bit in mask from the base bit, and store the bit position in d. If not\n\nfound, store 0xffffffff in d.\n\nOperand mask has a 32-bit type. Operand base has .b32, .u32 or .s32\n\ntype. Operand offset has .s32 type. Destination d has type .b32.\n\nOperand base must be <= 31, otherwise behavior is undefined.\n\nSemantics\n\nd = 0xffffffff;\n\nif (offset == 0) {\n\n if (mask[base] == 1) {\n\n d = base;\n\n }\n\n} else {\n\n pos = base;\n\n count = |offset| - 1;\n\n inc = (offset > 0) ? 1 : -1;\n\n while ((pos >= 0) && (pos < 32)) {\n\n if (mask[pos] == 1) {\n\n if (count == 0) {\n\n d = pos;\n\n break;\n\n } else {\n\n count = count \u2013 1;\n\n }\n\n }\n\n pos = pos + inc;\n\n }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.0.\n\nTarget ISA Notes\n\nfns requires sm_30 or higher.\n\nExamples\n\nfns.b32 d, 0xaaaaaaaa, 3, 1; // d = 3\n\nfns.b32 d, 0xaaaaaaaa, 3, -1; // d = 3\n\nfns.b32 d, 0xaaaaaaaa, 2, 1; // d = 3\n\nfns.b32 d, 0xaaaaaaaa, 2, -1; // d = 1\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns" }; case "func": return { "html": "For more information, visit func .

Kernel and Function Directives: .func

\n\n\n

Function definition.

\n

Syntax

\n
.func {.attribute(attr-list)} fname {.noreturn} function-body\n.func {.attribute(attr-list)} fname (param-list) {.noreturn} function-body\n.func {.attribute(attr-list)} (ret-param) fname (param-list) function-body\n
\n
\n

Description

\n

Defines a function, including input and return parameters and optional function body.

\n

An optional .noreturn directive indicates that the function does not return to the caller\nfunction. .noreturn directive cannot be specified on functions which have return parameters. See\nthe description of .noreturn directive in Performance-Tuning Directives: .noreturn.

\n

An optional .attribute directive specifies additional information associated with the\nfunction. See the description of Variable and Function Attribute Directive: .attribute for allowed attributes.

\n

A .func definition with no body provides a function prototype.

\n

The parameter lists define locally-scoped variables in the function body. Parameters must be base\ntypes in either the register or parameter state space. Parameters in register state space may be\nreferenced directly within instructions in the function body. Parameters in .param space are\naccessed using ld.param and st.param instructions in the body. Parameter passing is\ncall-by-value.

\n

The last parameter in the parameter list may be a .param array of type .b8 with no size\nspecified. It is used to pass an arbitrary number of parameters to the function packed into a single\narray object.

\n

When calling a function with such an unsized last argument, the last argument may be omitted from\nthe call instruction if no parameter is passed through it. Accesses to this array parameter must\nbe within the bounds of the array. The result of an access is undefined if no array was passed, or\nif the access was outside the bounds of the actual array being passed.

\n

Semantics

\n

The PTX syntax hides all details of the underlying calling convention and ABI.

\n

The implementation of parameter passing is left to the optimizing translator, which may use a\ncombination of registers and stack locations to pass parameters.

\n

Release Notes

\n

For PTX ISA version 1.x code, parameters must be in the register state space, there is no stack, and\nrecursion is illegal.

\n

PTX ISA versions 2.0 and later with target sm_20 or higher allow parameters in the .param\nstate space, implement an ABI with a stack, and support recursion.

\n

PTX ISA versions 2.0 and later with target sm_20 or higher support at most one return value.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Support for unsized array parameter introduced in PTX ISA version 6.0.

\n

Support for .noreturn directive introduced in PTX ISA version 6.4.

\n

Support for .attribute directive introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Functions without unsized array parameter supported on all target architectures.

\n

Unsized array parameter requires sm_30 or higher.

\n

.noreturn directive requires sm_30 or higher.

\n

.attribute directive requires sm_90 or higher.

\n

Examples

\n
.func (.reg .b32 rval) foo (.reg .b32 N, .reg .f64 dbl)\n{\n.reg .b32 localVar;\n\n... use N, dbl;\nother code;\n\nmov.b32 rval,result;\nret;\n}\n\n...\ncall (fooval), foo, (val0, val1);  // return value in fooval\n...\n\n.func foo (.reg .b32 N, .reg .f64 dbl) .noreturn\n{\n.reg .b32 localVar;\n... use N, dbl;\nother code;\nmov.b32 rval, result;\nret;\n}\n...\ncall foo, (val0, val1);\n...\n\n.func (.param .u32 rval) bar(.param .u32 N, .param .align 4 .b8 numbers[])\n{\n    .reg .b32 input0, input1;\n    ld.param.b32   input0, [numbers + 0];\n    ld.param.b32   input1, [numbers + 4];\n    ...\n    other code;\n    ret;\n}\n...\n\n.param .u32 N;\n.param .align 4 .b8 numbers[8];\nst.param.u32    [N], 2;\nst.param.b32    [numbers + 0], 5;\nst.param.b32    [numbers + 4], 10;\ncall (rval), bar, (N, numbers);\n...\n
\n
\n
", "tooltip": "Function definition.\n\nSyntax\n\n.func {.attribute(attr-list)} fname {.noreturn} function-body\n\n.func {.attribute(attr-list)} fname (param-list) {.noreturn} function-body\n\n.func {.attribute(attr-list)} (ret-param) fname (param-list) function-body\n\nDescription\n\nDefines a function, including input and return parameters and optional function body.\n\nAn optional .noreturn directive indicates that the function does not return to the caller\n\nfunction. .noreturn directive cannot be specified on functions which have return parameters. See\n\nthe description of .noreturn directive in Performance-Tuning Directives: .noreturn.\n\nAn optional .attribute directive specifies additional information associated with the\n\nfunction. See the description of Variable and Function Attribute Directive: .attribute for allowed attributes.\n\nA .func definition with no body provides a function prototype.\n\nThe parameter lists define locally-scoped variables in the function body. Parameters must be base\n\ntypes in either the register or parameter state space. Parameters in register state space may be\n\nreferenced directly within instructions in the function body. Parameters in .param space are\n\naccessed using ld.param and st.param instructions in the body. Parameter passing is\n\ncall-by-value.\n\nThe last parameter in the parameter list may be a .param array of type .b8 with no size\n\nspecified. It is used to pass an arbitrary number of parameters to the function packed into a single\n\narray object.\n\nWhen calling a function with such an unsized last argument, the last argument may be omitted from\n\nthe call instruction if no parameter is passed through it. Accesses to this array parameter must\n\nbe within the bounds of the array. The result of an access is undefined if no array was passed, or\n\nif the access was outside the bounds of the actual array being passed.\n\nSemantics\n\nThe PTX syntax hides all details of the underlying calling convention and ABI.\n\nThe implementation of parameter passing is left to the optimizing translator, which may use a\n\ncombination of registers and stack locations to pass parameters.\n\nRelease Notes\n\nFor PTX ISA version 1.x code, parameters must be in the register state space, there is no stack, and\n\nrecursion is illegal.\n\nPTX ISA versions 2.0 and later with target sm_20 or higher allow parameters in the .param\n\nstate space, implements an ABI with stack, and supports recursion.\n\nPTX ISA versions 2.0 and later with target sm_20 or higher support at most one return value.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nSupport for unsized array parameter introduced in PTX ISA version 6.0.\n\nSupport for .noreturn directive introduced in PTX ISA version 6.4.\n\nSupport for .attribute directive introduced in PTX ISA version 8.0.\n\nTarget ISA Notes\n\nFunctions without unsized array parameter supported on all target architectures.\n\nUnsized array parameter requires sm_30 or higher.\n\n.noreturn directive requires sm_30 or higher.\n\n.attribute directive requires sm_90 or higher.\n\nExamples\n\n.func (.reg .b32 rval) foo (.reg .b32 N, .reg .f64 dbl)\n\n{\n\n.reg .b32 localVar;\n\n... use N, dbl;\n\nother code;\n\nmov.b32 rval,result;\n\nret;\n\n}\n\n...\n\ncall (fooval), foo, (val0, val1); // return value in fooval\n\n...\n\n.func foo (.reg .b32 N, .reg .f64 dbl) .noreturn\n\n{\n\n.reg .b32 localVar;\n\n... 
use N, dbl;\n\nother code;\n\nmov.b32 rval, result;\n\nret;\n\n}\n\n...\n\ncall foo, (val0, val1);\n\n...\n\n.func (.param .u32 rval) bar(.param .u32 N, .param .align 4 .b8 numbers[])\n\n{\n\n .reg .b32 input0, input1;\n\n ld.param.b32 input0, [numbers + 0];\n\n ld.param.b32 input1, [numbers + 4];\n\n ...\n\n other code;\n\n ret;\n\n}\n\n...\n\n.param .u32 N;\n\n.param .align 4 .b8 numbers[8];\n\nst.param.u32 [N], 2;\n\nst.param.b32 [numbers + 0], 5;\n\nst.param.b32 [numbers + 4], 10;\n\ncall (rval), bar, (N, numbers);\n\n...\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#kernel-and-function-directives-func" }; case "getctarank": return { "html": "For more information, visit getctarank .

Data Movement and Conversion Instructions: getctarank

\n\n\n

Generate the CTA rank of the address.

\n

Syntax

\n
getctarank{.space}.type d, a;\n\n// Get cta rank from source shared memory address in register a.\ngetctarank.shared::cluster.type d, a;\n\n// Get cta rank from shared memory variable.\ngetctarank.shared::cluster.type d, var;\n\n// Get cta rank from shared memory variable+offset.\ngetctarank.shared::cluster.type d, var + imm;\n\n// Get cta rank from generic address of shared memory variable in register a.\ngetctarank.type d, a;\n\n.space = { .shared::cluster }\n.type  = { .u32, .u64 }\n
\n
\n

Description

\n

Write the destination register d with the rank of the CTA which contains the address specified\nin operand a.

\n

Instruction type .type indicates the type of source operand a.

\n

When space is .shared::cluster, source a is either a shared memory variable or a register\ncontaining a valid shared memory address. When the optional qualifier .space is not specified,\na is a register containing a generic address pointing to shared memory. Destination d is\nalways a 32-bit register which holds the rank of the CTA.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
getctarank.shared::cluster.u32 d1, addr;\ngetctarank.shared::cluster.u64 d2, sh + 4;\ngetctarank.u64                 d3, src;\n
\n
\n
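A hedged CUDA C++ sketch of the generic-address form (helper name ours; assumes sm_90):

// Rank, within the cluster, of the CTA whose shared memory backs this pointer.
__device__ unsigned int owning_cta_rank(const void *generic_ptr) {
    unsigned int rank;
    unsigned long long a = reinterpret_cast<unsigned long long>(generic_ptr);
    asm("getctarank.u64 %0, %1;" : "=r"(rank) : "l"(a));
    return rank;
}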
", "tooltip": "Generate the CTA rank of the address.\n\nSyntax\n\ngetctarank{.space}.type d, a;\n\n// Get cta rank from source shared memory address in register a.\n\ngetctarank.shared::cluster.type d, a;\n\n// Get cta rank from shared memory variable.\n\ngetctarank.shared::cluster.type d, var;\n\n// Get cta rank from shared memory variable+offset.\n\ngetctarank.shared::cluster.type d, var + imm;\n\n// Get cta rank from generic address of shared memory variable in register a.\n\ngetctarank.type d, a;\n\n.space = { .shared::cluster }\n\n.type = { .u32, .u64 }\n\nDescription\n\nWrite the destination register d with the rank of the CTA which contains the address specified\n\nin operand a.\n\nInstruction type .type indicates the type of source operand a.\n\nWhen space is .shared::cluster, source a is either a shared memory variable or a register\n\ncontaining a valid shared memory address. When the optional qualifier .space is not specified,\n\na is a register containing a generic addresses pointing to shared memory. Destination d is\n\nalways a 32-bit register which holds the rank of the CTA.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\ngetctarank.shared::cluster.u32 d1, addr;\n\ngetctarank.shared::cluster.u64 d2, sh + 4;\n\ngetctarank.u64 d3, src;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank" }; case "globaltimer": return { "html": "For more information, visit globaltimer .

Special Registers: %globaltimer, %globaltimer_lo, %globaltimer_hi

\n\n\n
\n
%globaltimer

A predefined, 64-bit global nanosecond timer.

\n
\n
%globaltimer_lo

The lower 32-bits of %globaltimer.

\n
\n
%globaltimer_hi

The upper 32-bits of %globaltimer.

\n
\n
\n

Syntax (predefined)

\n
.sreg .u64 %globaltimer;\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n
\n
\n

Description

\n

Special registers intended for use by NVIDIA tools. The behavior is target-specific and may change\nor be removed in future GPUs. When JIT-compiled to other targets, the value of these registers is\nunspecified.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.1.

\n

Target ISA Notes

\n

Requires target sm_30 or higher.

\n

Examples

\n
mov.u64  r1,%globaltimer;\n
\n
\n
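An illustrative CUDA C++ timing sketch (helper name ours); given the tool-oriented, target-specific caveat above, treat the result as approximate:

__device__ unsigned long long global_timer_ns() {
    unsigned long long t;
    asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(t));
    return t;
}
// usage: t0 = global_timer_ns(); ...work...; elapsed = global_timer_ns() - t0;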
", "tooltip": "%globaltimerA predefined, 64-bit global nanosecond timer.\n\n%globaltimer_loThe lower 32-bits of %globaltimer.\n\n%globaltimer_hiThe upper 32-bits of %globaltimer.\n\nSyntax (predefined)\n\n.sreg .u64 %globaltimer;\n\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n\nDescription\n\nSpecial registers intended for use by NVIDIA tools. The behavior is target-specific and may change\n\nor be removed in future GPUs. When JIT-compiled to other targets, the value of these registers is\n\nunspecified.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.1.\n\nTarget ISA Notes\n\nRequires target sm_30 or higher.\n\nExamples\n\nmov.u64 r1,%globaltimer;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-globaltimer-globaltimer-lo-globaltimer-hi" }; case "globaltimer_hi": return { "html": "For more information, visit globaltimer_hi .

Special Registers: %globaltimer, %globaltimer_lo, %globaltimer_hi

\n\n\n
\n
%globaltimer

A predefined, 64-bit global nanosecond timer.

\n
\n
%globaltimer_lo

The lower 32-bits of %globaltimer.

\n
\n
%globaltimer_hi

The upper 32-bits of %globaltimer.

\n
\n
\n

Syntax (predefined)

\n
.sreg .u64 %globaltimer;\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n
\n
\n

Description

\n

Special registers intended for use by NVIDIA tools. The behavior is target-specific and may change\nor be removed in future GPUs. When JIT-compiled to other targets, the value of these registers is\nunspecified.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.1.

\n

Target ISA Notes

\n

Requires target sm_30 or higher.

\n

Examples

\n
mov.u64  r1,%globaltimer;\n
\n
\n
", "tooltip": "%globaltimerA predefined, 64-bit global nanosecond timer.\n\n%globaltimer_loThe lower 32-bits of %globaltimer.\n\n%globaltimer_hiThe upper 32-bits of %globaltimer.\n\nSyntax (predefined)\n\n.sreg .u64 %globaltimer;\n\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n\nDescription\n\nSpecial registers intended for use by NVIDIA tools. The behavior is target-specific and may change\n\nor be removed in future GPUs. When JIT-compiled to other targets, the value of these registers is\n\nunspecified.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.1.\n\nTarget ISA Notes\n\nRequires target sm_30 or higher.\n\nExamples\n\nmov.u64 r1,%globaltimer;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-globaltimer-globaltimer-lo-globaltimer-hi" }; case "globaltimer_lo": return { "html": "For more information, visit globaltimer_lo .

Special Registers: %globaltimer, %globaltimer_lo, %globaltimer_hi

\n\n\n
\n
%globaltimer

A predefined, 64-bit global nanosecond timer.

\n
\n
%globaltimer_lo

The lower 32-bits of %globaltimer.

\n
\n
%globaltimer_hi

The upper 32-bits of %globaltimer.

\n
\n
\n

Syntax (predefined)

\n
.sreg .u64 %globaltimer;\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n
\n
\n

Description

\n

Special registers intended for use by NVIDIA tools. The behavior is target-specific and may change\nor be removed in future GPUs. When JIT-compiled to other targets, the value of these registers is\nunspecified.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.1.

\n

Target ISA Notes

\n

Requires target sm_30 or higher.

\n

Examples

\n
mov.u64  r1,%globaltimer;\n
\n
\n
", "tooltip": "%globaltimerA predefined, 64-bit global nanosecond timer.\n\n%globaltimer_loThe lower 32-bits of %globaltimer.\n\n%globaltimer_hiThe upper 32-bits of %globaltimer.\n\nSyntax (predefined)\n\n.sreg .u64 %globaltimer;\n\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n\nDescription\n\nSpecial registers intended for use by NVIDIA tools. The behavior is target-specific and may change\n\nor be removed in future GPUs. When JIT-compiled to other targets, the value of these registers is\n\nunspecified.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.1.\n\nTarget ISA Notes\n\nRequires target sm_30 or higher.\n\nExamples\n\nmov.u64 r1,%globaltimer;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-globaltimer-globaltimer-lo-globaltimer-hi" }; case "griddepcontrol": return { "html": "For more information, visit griddepcontrol .

Parallel Synchronization and Communication Instructions: griddepcontrol

\n\n\n

Control execution of dependent grids.

\n

Syntax

\n
griddepcontrol.action;\n\n.action   = { .launch_dependents, .wait }\n
\n
\n

Description

\n

The griddepcontrol instruction allows the dependent grids and prerequisite grids as defined by\nthe runtime, to control execution in the following way:

\n

.launch_dependents modifier signals that specific dependents the runtime system designated to\nreact to this instruction can be scheduled as soon as all other CTAs in the grid issue the same\ninstruction or have completed. The dependent may launch before the completion of the current\ngrid. There is no guarantee that the dependent will launch before the completion of the current\ngrid. Repeated invocations of this instruction by threads in the current CTA will have no additional\nside effects past that of the first invocation.

\n

.wait modifier causes the executing thread to wait until all prerequisite grids in flight have\ncompleted and all the memory operations from the prerequisite grids are performed and made visible\nto the current grid.

\n
\n

Note

\n

If the prerequisite grid is using griddepcontrol.launch_dependents, then the dependent grid\nmust use griddepcontrol.wait to ensure correct functional execution.

\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
griddepcontrol.launch_dependents;\ngriddepcontrol.wait;\n
\n
\n
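A hedged CUDA C++ sketch of the two actions (wrapper names ours; assumes sm_90 and a programmatic-dependent-launch setup); the memory clobber keeps the compiler from reordering surrounding accesses across these points:

// Prerequisite grid: allow designated dependents to be scheduled early.
__device__ void signal_dependents() {
    asm volatile("griddepcontrol.launch_dependents;" ::: "memory");
}
// Dependent grid: wait until prerequisite grids' writes are visible.
__device__ void wait_for_prerequisites() {
    asm volatile("griddepcontrol.wait;" ::: "memory");
}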
", "tooltip": "Control execution of dependent grids.\n\nSyntax\n\ngriddepcontrol.action;\n\n.action = { .launch_dependents, .wait }\n\nDescription\n\nThe griddepcontrol instruction allows the dependent grids and prerequisite grids as defined by\n\nthe runtime, to control execution in the following way:\n\n.launch_dependents modifier signals that specific dependents the runtime system designated to\n\nreact to this instruction can be scheduled as soon as all other CTAs in the grid issue the same\n\ninstruction or have completed. The dependent may launch before the completion of the current\n\ngrid. There is no guarantee that the dependent will launch before the completion of the current\n\ngrid. Repeated invocations of this instruction by threads in the current CTA will have no additional\n\nside effects past that of the first invocation.\n\n.wait modifier causes the executing thread to wait until all prerequisite grids in flight have\n\ncompleted and all the memory operations from the prerequisite grids are performed and made visible\n\nto the current grid.\n\nNote\n\nIf the prerequisite grid is using griddepcontrol.launch_dependents, then the dependent grid\n\nmust use griddepcontrol.wait to ensure correct functional execution.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\ngriddepcontrol.launch_dependents;\n\ngriddepcontrol.wait;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol" }; case "gridid": return { "html": "For more information, visit gridid .

Special Registers: %gridid

\n\n\n

Grid identifier.

\n

Syntax (predefined)

\n
.sreg .u64 %gridid;\n
\n
\n

Description

\n

A predefined, read-only special register initialized with the per-grid temporal grid identifier. The\n%gridid is used by debuggers to distinguish CTAs and clusters within concurrent (small) grids.

\n

During execution, repeated launches of programs may occur, where each launch starts a\ngrid-of-CTAs. This variable provides the temporal grid launch number for this context.

\n

For sm_1x targets, %gridid is limited to the range [0..2^16-1]. For sm_20,\n%gridid is limited to the range [0..2^32-1]. sm_30 supports the entire 64-bit range.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0 as type .u16.

\n

Redefined as type .u32 in PTX ISA version 1.3.

\n

Redefined as type .u64 in PTX ISA version 3.0.

\n

For compatibility with legacy PTX code, 16-bit and 32-bit mov and cvt instructions may be\nused to read the lower 16-bits or 32-bits of each component of %gridid.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
mov.u64  %s, %gridid;  // 64-bit read of %gridid\nmov.u32  %r, %gridid;  // legacy code with 32-bit %gridid\n
\n
\n
", "tooltip": "Grid identifier.\n\nSyntax (predefined)\n\n.sreg .u64 %gridid;\n\nDescription\n\nA predefined, read-only special register initialized with the per-grid temporal grid identifier. The\n\n%gridid is used by debuggers to distinguish CTAs and clusters within concurrent (small) grids.\n\nDuring execution, repeated launches of programs may occur, where each launch starts a\n\ngrid-of-CTAs. This variable provides the temporal grid launch number for this context.\n\nFor sm_1x targets, %gridid is limited to the range [0..216-1]. For sm_20,\n\n%gridid is limited to the range [0..232-1]. sm_30 supports the entire 64-bit range.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0 as type .u16.\n\nRedefined as type .u32 in PTX ISA version 1.3.\n\nRedefined as type .u64 in PTX ISA version 3.0.\n\nFor compatibility with legacy PTX code, 16-bit and 32-bit mov and cvt instructions may be\n\nused to read the lower 16-bits or 32-bits of each component of %gridid.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u64 %s, %gridid; // 64-bit read of %gridid\n\nmov.u32 %r, %gridid; // legacy code with 32-bit %gridid\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-gridid" }; case "is_explicit_cluster": return { "html": "For more information, visit is_explicit_cluster .

Special Registers: %is_explicit_cluster

\n\n\n

Checks if user has explicitly specified cluster launch.

\n

Syntax (predefined)

\n
.sreg .pred %is_explicit_cluster;\n
\n
\n

Description

\n

A predefined, read-only special register initialized with the predicate value of whether the cluster\nlaunch is explicitly specified by user.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
.reg .pred p;\n\nmov.pred  p, %is_explicit_cluster;\n
\n
\n
", "tooltip": "Checks if user has explicitly specified cluster launch.\n\nSyntax (predefined)\n\n.sreg .pred %is_explicit_cluster;\n\nDescription\n\nA predefined, read-only special register initialized with the predicate value of whether the cluster\n\nlaunch is explicitly specified by user.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .pred p;\n\nmov.pred p, %is_explicit_cluster;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-is-explicit-cluster" }; case "isspacep": return { "html": "For more information, visit isspacep .

Data Movement and Conversion Instructions: isspacep

\n\n\n

Query whether a generic address falls within a specified state space window.

\n

Syntax

\n
isspacep.space  p, a;    // result is .pred\n\n.space = { .const, .global, .local, .shared{::cta, ::cluster}, .param };\n
\n
\n

Description

\n

Write predicate register p with 1 if generic address a falls within the specified state\nspace window and with 0 otherwise. Destination p has type .pred; the source address\noperand must be of type .u32 or .u64.

\n

isspacep.param returns 1 if the generic address falls within the window of Kernel Function\nParameters, otherwise returns 0.

\n

isspacep.global returns 1 for Kernel Function Parameters as .param window is contained within the .global\nwindow.

\n

If no sub-qualifier is specified with .shared state space, then ::cta is assumed by default.

\n
\n

Note

\n

isspacep.shared::cluster will return 1 for every shared memory address that is accessible to\nthe threads in the cluster, whereas isspacep.shared::cta will return 1 only if the address is\nof a variable declared in the executing CTA.

\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

isspacep.const introduced in PTX ISA version 3.1.

\n

isspacep.param introduced in PTX ISA version 7.7.

\n

Support for ::cta and ::cluster sub-qualifiers introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

isspacep requires sm_20 or higher.

\n

isspacep.param requires sm_70 or higher.

\n

Sub-qualifier ::cta requires sm_30 or higher.

\n

Sub-qualifier ::cluster requires sm_90 or higher.

\n

Examples

\n
isspacep.const           iscnst, cptr;\nisspacep.global          isglbl, gptr;\nisspacep.local           islcl,  lptr;\nisspacep.shared          isshrd, sptr;\nisspacep.param           isparam, pptr;\nisspacep.shared::cta     isshrdcta, sptr;\nisspacep.shared::cluster ishrdany, sptr;\n
\n
\n
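As a hedged CUDA C++ illustration (helper name ours; assumes sm_90 for ::cluster), the predicate result is materialized with selp, since predicates cannot be asm outputs; for the unqualified windows, the __isGlobal/__isShared/__isConstant/__isLocal intrinsics are believed to lower to this instruction.

__device__ bool in_cluster_shared_window(const void *ptr) {
    unsigned int flag;
    unsigned long long a = reinterpret_cast<unsigned long long>(ptr);
    asm("{\n\t"
        ".reg .pred p;\n\t"
        "isspacep.shared::cluster p, %1;\n\t"
        "selp.u32 %0, 1, 0, p;\n\t"
        "}"
        : "=r"(flag) : "l"(a));
    return flag != 0;
}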
", "tooltip": "Query whether a generic address falls within a specified state space window.\n\nSyntax\n\nisspacep.space p, a; // result is .pred\n\n.space = { const, .global, .local, .shared{::cta, ::cluster}, .param };\n\nDescription\n\nWrite predicate register p with 1 if generic address a falls within the specified state\n\nspace window and with 0 otherwise. Destination p has type .pred; the source address\n\noperand must be of type .u32 or .u64.\n\nisspacep.param returns 1 if the generic address falls within the window of Kernel Function\n\nParameters, otherwise returns 0.\n\nisspacep.global returns 1 for Kernel Function Parameters as .param window is contained within the .global\n\nwindow.\n\nIf no sub-qualifier is specified with .shared state space, then ::cta is assumed by default.\n\nNote\n\nispacep.shared::cluster will return 1 for every shared memory address that is accessible to\n\nthe threads in the cluster, whereas ispacep.shared::cta will return 1 only if the address is\n\nof a variable declared in the executing CTA.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nisspacep.const introduced in PTX ISA version 3.1.\n\nisspacep.param introduced in PTX ISA version 7.7.\n\nSupport for ::cta and ::cluster sub-qualifiers introduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nisspacep requires sm_20 or higher.\n\nisspacep.param requires sm_70 or higher.\n\nSub-qualifier ::cta requires sm_30 or higher.\n\nSub-qualifier ::cluster requires sm_90 or higher.\n\nExamples\n\nisspacep.const iscnst, cptr;\n\nisspacep.global isglbl, gptr;\n\nisspacep.local islcl, lptr;\n\nisspacep.shared isshrd, sptr;\n\nisspacep.param isparam, pptr;\n\nisspacep.shared::cta isshrdcta, sptr;\n\nisspacep.shared::cluster ishrdany sptr;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep" }; case "istypep": return { "html": "For more information, visit istypep .

Texture Instructions: istypep

\n\n\n

Query whether a register points to an opaque variable of a specified type.

\n

Syntax

\n
istypep.type   p, a;  // result is .pred\n\n.type = { .texref, .samplerref, .surfref };\n
\n
\n

Description

\n

Write predicate register p with 1 if register a points to an opaque variable of the\nspecified type, and with 0 otherwise. Destination p has type .pred; the source address\noperand must be of type .u64.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 4.0.

\n

Target ISA Notes

\n

istypep requires sm_30 or higher.

\n

Examples

\n
istypep.texref istex, tptr;\nistypep.samplerref issampler, sptr;\nistypep.surfref issurface, surfptr;\n
\n
\n
", "tooltip": "Query whether a register points to an opaque variable of a specified type.\n\nSyntax\n\nistypep.type p, a; // result is .pred\n\n.type = { .texref, .samplerref, .surfref };\n\nDescription\n\nWrite predicate register p with 1 if register a points to an opaque variable of the\n\nspecified type, and with 0 otherwise. Destination p has type .pred; the source address\n\noperand must be of type .u64.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 4.0.\n\nTarget ISA Notes\n\nistypep requires sm_30 or higher.\n\nExamples\n\nistypep.texref istex, tptr;\n\nistypep.samplerref issampler, sptr;\n\nistypep.surfref issurface, surfptr;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-istypep" }; case "laneid": return { "html": "For more information, visit laneid .

Special Registers: %laneid

\n\n\n

Lane Identifier.

\n

Syntax (predefined)

\n
.sreg .u32 %laneid;\n
\n
\n

Description

\n

A predefined, read-only special register that returns the thread\u2019s lane within the warp. The lane\nidentifier ranges from zero to WARP_SZ-1.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.3.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
mov.u32  %r, %laneid;\n
\n
\n
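A minimal CUDA C++ read (helper name ours); for the common 1-D block shape this equals threadIdx.x % 32:

__device__ unsigned int lane_id() {
    unsigned int lane;
    asm("mov.u32 %0, %%laneid;" : "=r"(lane));
    return lane;
}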
", "tooltip": "Lane Identifier.\n\nSyntax (predefined)\n\n.sreg .u32 %laneid;\n\nDescription\n\nA predefined, read-only special register that returns the thread\u2019s lane within the warp. The lane\n\nidentifier ranges from zero to WARP_SZ-1.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u32 %r, %laneid;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-laneid" }; case "lanemask_eq": return { "html": "For more information, visit lanemask_eq .

Special Registers: %lanemask_eq

\n\n\n

32-bit mask with bit set in position equal to the thread\u2019s lane number in the warp.

\n

Syntax (predefined)

\n
.sreg .u32 %lanemask_eq;\n
\n
\n

Description

\n

A predefined, read-only special register initialized with a 32-bit mask with a bit set in the\nposition equal to the thread\u2019s lane number in the warp.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

%lanemask_eq requires sm_20 or higher.

\n

Examples

\n
mov.u32     %r, %lanemask_eq;\n
\n
\n
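
Since exactly one bit is set, at the position of the thread\u2019s lane, %lanemask_eq is numerically equal to 1 << %laneid. A minimal sketch (not from the original examples; register names are hypothetical) synthesizing the same mask from %laneid:

\n

mov.u32  %lid, %laneid;\nmov.u32  %one, 1;\nshl.b32  %mask, %one, %lid;   // %mask == %lanemask_eq\n

\n

\n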
", "tooltip": "32-bit mask with bit set in position equal to the thread\u2019s lane number in the warp.\n\nSyntax (predefined)\n\n.sreg .u32 %lanemask_eq;\n\nDescription\n\nA predefined, read-only special register initialized with a 32-bit mask with a bit set in the\n\nposition equal to the thread\u2019s lane number in the warp.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%lanemask_eq requires sm_20 or higher.\n\nExamples\n\nmov.u32 %r, %lanemask_eq;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-eq" }; case "lanemask_ge": return { "html": "For more information, visit lanemask_ge .

Special Registers: %lanemask_ge

\n\n\n

32-bit mask with bits set in positions greater than or equal to the thread\u2019s lane number in the warp.

\n

Syntax (predefined)

\n
.sreg .u32 %lanemask_ge;\n
\n
\n

Description

\n

A predefined, read-only special register initialized with a 32-bit mask with bits set in positions\ngreater than or equal to the thread\u2019s lane number in the warp.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

%lanemask_ge requires sm_20 or higher.

\n

Examples

\n
mov.u32     %r, %lanemask_ge;\n
\n
\n
", "tooltip": "32-bit mask with bits set in positions greater than or equal to the thread\u2019s lane number in the warp.\n\nSyntax (predefined)\n\n.sreg .u32 %lanemask_ge;\n\nDescription\n\nA predefined, read-only special register initialized with a 32-bit mask with bits set in positions\n\ngreater than or equal to the thread\u2019s lane number in the warp.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%lanemask_ge requires sm_20 or higher.\n\nExamples\n\nmov.u32 %r, %lanemask_ge;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-ge" }; case "lanemask_gt": return { "html": "For more information, visit lanemask_gt .

Special Registers: %lanemask_gt

\n\n\n

32-bit mask with bits set in positions greater than the thread\u2019s lane number in the warp.

\n

Syntax (predefined)

\n
.sreg .u32 %lanemask_gt;\n
\n
\n

Description

\n

A predefined, read-only special register initialized with a 32-bit mask with bits set in positions\ngreater than the thread\u2019s lane number in the warp.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

%lanemask_gt requires sm_20 or higher.

\n

Examples

\n
mov.u32     %r, %lanemask_gt;\n
\n
\n
", "tooltip": "32-bit mask with bits set in positions greater than the thread\u2019s lane number in the warp.\n\nSyntax (predefined)\n\n.sreg .u32 %lanemask_gt;\n\nDescription\n\nA predefined, read-only special register initialized with a 32-bit mask with bits set in positions\n\ngreater than the thread\u2019s lane number in the warp.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%lanemask_gt requires sm_20 or higher.\n\nExamples\n\nmov.u32 %r, %lanemask_gt;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-gt" }; case "lanemask_le": return { "html": "For more information, visit lanemask_le .

Special Registers: %lanemask_le

\n\n\n

32-bit mask with bits set in positions less than or equal to the thread\u2019s lane number in the warp.

\n

Syntax (predefined)

\n
.sreg .u32 %lanemask_le;\n
\n
\n

Description

\n

A predefined, read-only special register initialized with a 32-bit mask with bits set in positions\nless than or equal to the thread\u2019s lane number in the warp.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

%lanemask_le requires sm_20 or higher.

\n

Examples

\n
mov.u32     %r, %lanemask_le\n
\n
\n
", "tooltip": "32-bit mask with bits set in positions less than or equal to the thread\u2019s lane number in the warp.\n\nSyntax (predefined)\n\n.sreg .u32 %lanemask_le;\n\nDescription\n\nA predefined, read-only special register initialized with a 32-bit mask with bits set in positions\n\nless than or equal to the thread\u2019s lane number in the warp.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%lanemask_le requires sm_20 or higher.\n\nExamples\n\nmov.u32 %r, %lanemask_le\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-le" }; case "lanemask_lt": return { "html": "For more information, visit lanemask_lt .

Special Registers: %lanemask_lt

\n\n\n

32-bit mask with bits set in positions less than the thread\u2019s lane number in the warp.

\n

Syntax (predefined)

\n
.sreg .u32 %lanemask_lt;\n
\n
\n

Description

\n

A predefined, read-only special register initialized with a 32-bit mask with bits set in positions\nless than the thread\u2019s lane number in the warp.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

%lanemask_lt requires sm_20 or higher.

\n

Examples

\n
mov.u32     %r, %lanemask_lt;\n
\n
\n
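
A common warp-level idiom, shown here as a sketch that is not part of the original examples (register names are hypothetical): combine %lanemask_lt with vote.sync and popc to compute a thread\u2019s rank among the active threads whose predicate is true, e.g. for stream compaction:

\n

vote.sync.ballot.b32  %ballot, %p, 0xffffffff;  // lanes where %p is true\nmov.u32               %lt, %lanemask_lt;\nand.b32               %below, %ballot, %lt;     // true lanes below this one\npopc.b32              %rank, %below;            // rank of this thread\n

\n

\n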
", "tooltip": "32-bit mask with bits set in positions less than the thread\u2019s lane number in the warp.\n\nSyntax (predefined)\n\n.sreg .u32 %lanemask_lt;\n\nDescription\n\nA predefined, read-only special register initialized with a 32-bit mask with bits set in positions\n\nless than the thread\u2019s lane number in the warp.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%lanemask_lt requires sm_20 or higher.\n\nExamples\n\nmov.u32 %r, %lanemask_lt;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-lt" }; case "ld": return { "html": "For more information, visit ld , ld.global.nc .

Data Movement and Conversion Instructions: ld

\n\n\n

Load a register variable from an addressable state space variable.

\n

Syntax

\n
ld{.weak}{.ss}{.cop}{.level::cache_hint}{.level::prefetch_size}{.vec}.type  d, [a]{.unified}{, cache-policy};\n\nld{.weak}{.ss}{.level::eviction_priority}{.level::cache_hint}{.level::prefetch_size}{.vec}.type  d, [a]{.unified}{, cache-policy};\n\nld.volatile{.ss}{.level::prefetch_size}{.vec}.type  d, [a];\n\nld.relaxed.scope{.ss}{.level::eviction_priority}{.level::cache_hint}{.level::prefetch_size}{.vec}.type  d, [a]{, cache-policy};\n\nld.acquire.scope{.ss}{.level::eviction_priority}{.level::cache_hint}{.level::prefetch_size}{.vec}.type  d, [a]{, cache-policy};\n\n.ss =                       { .const, .global, .local, .param, .shared{::cta, ::cluster} };\n.cop =                      { .ca, .cg, .cs, .lu, .cv };\n.level::eviction_priority = { .L1::evict_normal, .L1::evict_unchanged,\n                              .L1::evict_first, .L1::evict_last, .L1::no_allocate };\n.level::cache_hint =        { .L2::cache_hint };\n.level::prefetch_size =     { .L2::64B, .L2::128B, .L2::256B }\n.scope =                    { .cta, .cluster, .gpu, .sys };\n.vec =                      { .v2, .v4 };\n.type =                     { .b8, .b16, .b32, .b64,\n                              .u8, .u16, .u32, .u64,\n                              .s8, .s16, .s32, .s64,\n                              .f32, .f64 };\n
\n
\n

Description

\n

Load register variable d from the location specified by the source address operand a in\nspecified state space. If no state space is given, perform the load using Generic Addressing.

\n

If no sub-qualifier is specified with .shared state space, then ::cta is assumed by default.

\n

Supported addressing modes for operand a and alignment requirements are described in Addresses\nas Operands

\n

Instruction ld.param used for reading the value returned from a device function call cannot be\npredicated. See Parameter State Space and Function\nDeclarations and Definitions for descriptions\nof the proper use of ld.param.

\n

The .relaxed and .acquire qualifiers indicate memory synchronization as described in the\nMemory Consistency Model. The .scope qualifier\nindicates the set of threads with which an ld.relaxed or ld.acquire instruction can directly\nsynchronize [1]. The .weak qualifier indicates a memory instruction with no\nsynchronization. The effects of this instruction become visible to other threads only when\nsynchronization is established by other means.

\n

The .weak, .volatile, .relaxed and .acquire qualifiers are mutually exclusive. When\nnone of these is specified, the .weak qualifier is assumed by default.

\n

An ld.volatile operation is always performed and it will not be reordered with respect to other\nvolatile operations to the same memory location. volatile and non-volatile load operations\nto the same memory location may be reordered. ld.volatile has the same memory synchronization\nsemantics as ld.relaxed.sys.

\n

The qualifiers .volatile, .relaxed and .acquire may be used only with .global and\n.shared spaces and with generic addressing, where the address points to .global or\n.shared space. Cache operations are not permitted with these qualifiers.

\n

The optional qualifier .unified must be specified on operand a if a is the address of a\nvariable declared with .unified attribute as described in Variable and Function Attribute\nDirective: .attribute.

\n

The qualifier .level::eviction_priority specifies the eviction policy that will be used during\nmemory access.

\n

The .level::prefetch_size qualifier is a hint to fetch additional data of the specified size\ninto the respective cache level. The sub-qualifier prefetch_size can be set to 64B,\n128B, or 256B, allowing the prefetch size to be 64 bytes, 128 bytes, or 256 bytes\nrespectively.

\n

The qualifier .level::prefetch_size may only be used with .global state space and with\ngeneric addressing where the address points to .global state space. If the generic address does\nnot fall within the address window of the global memory, then the prefetching behavior is undefined.

\n

The .level::prefetch_size qualifier is treated as a performance hint only.

\n

When the optional argument cache-policy is specified, the qualifier .level::cache_hint is\nrequired. The 64-bit operand cache-policy specifies the cache eviction policy that may be used\nduring the memory access.

\n

The qualifiers .unified and .level::cache_hint are only supported for .global state\nspace and for generic addressing where the address points to the .global state space.

\n

cache-policy is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.

\n

[1] This synchronization is further extended to other threads through the transitive nature of\ncausality order, as described in the memory consistency model.

\n

Semantics

\n
d = a;             // named variable a\nd = *(&a+immOff)   // variable-plus-offset\nd = *a;            // register\nd = *(a+immOff);   // register-plus-offset\nd = *(immAddr);    // immediate address\n
\n
\n

Notes

\n

Destination d must be in the .reg state space.

\n

A destination register wider than the specified type may be used. The value loaded is sign-extended\nto the destination register width for signed integers, and is zero-extended to the destination\nregister width for unsigned and bit-size types. See\nTable 25\nfor a description of these relaxed type-checking rules.

\n

.f16 data may be loaded using ld.b16, and then converted to .f32 or .f64 using\ncvt, or can be used in half precision floating point instructions.

\n

.f16x2 data may be loaded using ld.b32 and then used in half precision floating point\ninstructions.

\n

PTX ISA Notes

\n

ld introduced in PTX ISA version 1.0. ld.volatile introduced in PTX ISA version 1.1.

\n

Generic addressing and cache operations introduced in PTX ISA version 2.0.

\n

Support for scope qualifier, .relaxed, .acquire, .weak qualifiers introduced in PTX ISA\nversion 6.0.

\n

Support for generic addressing of .const space added in PTX ISA version 3.1.

\n

Support for .level::eviction_priority, .level::prefetch_size and .level::cache_hint\nqualifiers introduced in PTX ISA version 7.4.

\n

Support for .cluster scope qualifier introduced in PTX ISA version 7.8.

\n

Support for ::cta and ::cluster sub-qualifiers introduced in PTX ISA version 7.8.

\n

Support for .unified qualifier introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

ld.f64 requires sm_13 or higher.

\n

Support for scope qualifier, .relaxed, .acquire, .weak qualifiers require sm_70 or\nhigher.

\n

Generic addressing requires sm_20 or higher.

\n

Cache operations require sm_20 or higher.

\n

Support for .level::eviction_priority qualifier requires sm_70 or higher.

\n

Support for .level::prefetch_size qualifier requires sm_75 or higher.

\n

Support for .L2::256B and .L2::cache_hint qualifiers requires sm_80 or higher.

\n

Support for .cluster scope qualifier requires sm_90 or higher.

\n

Sub-qualifier ::cta requires sm_30 or higher.

\n

Sub-qualifier ::cluster requires sm_90 or higher.

\n

Support for .unified qualifier requires sm_90 or higher.

\n

Examples

\n
ld.global.f32    d,[a];\nld.shared.v4.b32 Q,[p];\nld.const.s32     d,[p+4];\nld.local.b32     x,[p+-8]; // negative offset\nld.local.b64     x,[240];  // immediate address\n\nld.global.b16    %r,[fs];  // load .f16 data into 32-bit reg\ncvt.f32.f16      %r,%r;    // up-convert f16 data to f32\n\nld.global.b32    %r0, [fs];     // load .f16x2 data in 32-bit reg\nld.global.b32    %r1, [fs + 4]; // load .f16x2 data in 32-bit reg\nadd.rn.f16x2     %d0, %r0, %r1; // addition of f16x2 data\nld.global.relaxed.gpu.u32 %r0, [gbl];\nld.shared.acquire.gpu.u32 %r1, [sh];\nld.global.relaxed.cluster.u32 %r2, [gbl];\nld.shared::cta.acquire.gpu.u32 %r2, [sh + 4];\nld.shared::cluster.u32 %r3, [sh + 8];\n\nld.global.f32    d,[ugbl].unified;\nld.b32           %r0, [%r1].unified;\n\nld.global.L1::evict_last.u32  d, [p];\n\nld.global.L2::64B.b32   %r0, [gbl]; // Prefetch 64B to L2\nld.L2::128B.f64         %r1, [gbl]; // Prefetch 128B to L2\nld.global.L2::256B.f64  %r2, [gbl]; // Prefetch 256B to L2\n\ncreatepolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 cache-policy, 1;\nld.global.L2::cache_hint.b64  x, [p], cache-policy;\n
\n
\n
\n

Data Movement and Conversion Instructions: ld.global.nc

\n\n\n

Load a register variable from global state space via non-coherent cache.

\n

Syntax

\n
ld.global{.cop}.nc{.level::cache_hint}.type                 d, [a]{, cache-policy};\nld.global{.cop}.nc{.level::cache_hint}.vec.type             d, [a]{, cache-policy};\n\nld.global.nc{.level::eviction_priority}{.level::cache_hint}.type      d, [a]{, cache-policy};\nld.global.nc{.level::eviction_priority}{.level::cache_hint}.vec.type  d, [a]{, cache-policy};\n\n.cop  =                     { .ca, .cg, .cs };     // cache operation\n.level::eviction_priority = { .L1::evict_normal, .L1::evict_unchanged,\n                              .L1::evict_first, .L1::evict_last, .L1::no_allocate};\n.level::cache_hint =        { .L2::cache_hint };\n.vec  =                     { .v2, .v4 };\n.type =                     { .b8, .b16, .b32, .b64,\n                              .u8, .u16, .u32, .u64,\n                              .s8, .s16, .s32, .s64,\n                              .f32, .f64 };\n
\n
\n

Description

\n

Load register variable d from the location specified by the source address operand a in the\nglobal state space, and optionally cache in non-coherent read-only cache.

\n
\n

Note

\n

On some architectures, the texture cache is larger, has higher bandwidth, and longer latency than\nthe global memory cache. For applications with sufficient parallelism to cover the longer\nlatency, ld.global.nc should offer better performance than ld.global on such\narchitectures.

\n
\n

Supported addressing modes for operand a and alignment requirements are described in Addresses\nas Operands

\n

The qualifier .level::eviction_priority specifies the eviction policy that will be used during\nmemory access.

\n

When the optional argument cache-policy is specified, the qualifier .level::cache_hint is\nrequired. The 64-bit operand cache-policy specifies the cache eviction policy that may be used\nduring the memory access.

\n

cache-policy is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.

\n

Semantics

\n
d = a;             // named variable a\nd = *(&a+immOff)   // variable-plus-offset\nd = *a;            // register\nd = *(a+immOff);   // register-plus-offset\nd = *(immAddr);    // immediate address\n
\n
\n

Notes

\n

Destination d must be in the .reg state space.

\n

A destination register wider than the specified type may be used. The value loaded is sign-extended\nto the destination register width for signed integers, and is zero-extended to the destination\nregister width for unsigned and bit-size types.

\n

.f16 data may be loaded using ld.b16, and then converted to .f32 or .f64 using cvt.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.1.

\n

Support for .level::eviction_priority and .level::cache_hint qualifiers introduced in PTX\nISA version 7.4.

\n

Target ISA Notes

\n

Requires sm_32 or higher.

\n

Support for .level::eviction_priority qualifier requires sm_70 or higher.

\n

Support for .level::cache_hint qualifier requires sm_80 or higher.

\n

Examples

\n
ld.global.nc.f32           d, [a];\nld.global.nc.L1::evict_last.u32 d, [a];\n\ncreatepolicy.fractional.L2::evict_last.b64 cache-policy, 0.5;\nld.global.nc.L2::cache_hint.f32  d, [a], cache-policy;\n
\n
\n
", "tooltip": "=====Data Movement and Conversion Instructions: ld\n\n\n\nLoad a register variable from an addressable state space variable.\n\nSyntax\n\nld{.weak}{.ss}{.cop}{.level::cache_hint}{.level::prefetch_size}{.vec}.type d, [a]{.unified}{, cache-policy};\n\nld{.weak}{.ss}{.level::eviction_priority}{.level::cache_hint}{.level::prefetch_size}{.vec}.type d, [a]{.unified}{, cache-policy};\n\nld.volatile{.ss}{.level::prefetch_size}{.vec}.type d, [a];\n\nld.relaxed.scope{.ss}{.le...\n\n=====Data Movement and Conversion Instructions: ld.global.nc\n\n\n\nLoad a register variable from global state space via non-coherent cache.\n\nSyntax\n\nld.global{.cop}.nc{.level::cache_hint}.type d, [a]{, cache-policy};\n\nld.global{.cop}.nc{.level::cache_hint}.vec.type d, [a]{, cache-policy};\n\nld.global.nc{.level::eviction_priority}{.level::cache_hint}.type d, [a]{, cache-policy};\n\nld.global.nc{.level::eviction_priority}{.level::cache_hint... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld" }; case "ldu": return { "html": "For more information, visit ldu .

Data Movement and Conversion Instructions: ldu

\n\n\n

Load read-only data from an address that is common across threads in the warp.

\n

Syntax

\n
ldu{.ss}.type      d, [a];       // load from address\nldu{.ss}.vec.type  d, [a];       // vec load from address\n\n.ss   = { .global };             // state space\n.vec  = { .v2, .v4 };\n.type = { .b8, .b16, .b32, .b64,\n           .u8, .u16, .u32, .u64,\n           .s8, .s16, .s32, .s64,\n                      .f32, .f64 };\n
\n
\n

Description

\n

Load read-only data into register variable d from the location specified by the source address\noperand a in the global state space, where the address is guaranteed to be the same across all\nthreads in the warp. If no state space is given, perform the load using Generic Addressing.

\n

Supported addressing modes for operand a and alignment requirements are described in Addresses\nas Operands

\n

Semantics

\n
d = a;             // named variable a\nd = *(&a+immOff)   // variable-plus-offset\nd = *a;            // register\nd = *(a+immOff);   // register-plus-offset\nd = *(immAddr);    // immediate address\n
\n
\n

Notes

\n

Destination d must be in the .reg state space.

\n

A destination register wider than the specified type may be used. The value loaded is sign-extended\nto the destination register width for signed integers, and is zero-extended to the destination\nregister width for unsigned and bit-size types. See\nTable 25\nfor a description of these relaxed type-checking rules.

\n

.f16 data may be loaded using ldu.b16, and then converted to .f32 or .f64 using\ncvt, or can be used in half precision floating point instructions.

\n

.f16x2 data may be loaded using ldu.b32 and then used in half precision floating point\ninstructions.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

ldu.f64 requires sm_13 or higher.

\n

Examples

\n
ldu.global.f32    d,[a];\nldu.global.b32    d,[p+4];\nldu.global.v4.f32 Q,[p];\n
\n
\n
", "tooltip": "Load read-only data from an address that is common across threads in the warp.\n\nSyntax\n\nldu{.ss}.type d, [a]; // load from address\n\nldu{.ss}.vec.type d, [a]; // vec load from address\n\n.ss = { .global }; // state space\n\n.vec = { .v2, .v4 };\n\n.type = { .b8, .b16, .b32, .b64,\n\n .u8, .u16, .u32, .u64,\n\n .s8, .s16, .s32, .s64,\n\n .f32, .f64 };\n\nDescription\n\nLoad read-only data into register variable d from the location specified by the source address\n\noperand a in the global state space, where the address is guaranteed to be the same across all\n\nthreads in the warp. If no state space is given, perform the load using Generic Addressing.\n\nSupported addressing modes for operand a and alignment requirements are described in Addresses\n\nas Operands\n\nSemantics\n\nd = a; // named variable a\n\nd = *(&a+immOff) // variable-plus-offset\n\nd = *a; // register\n\nd = *(a+immOff); // register-plus-offset\n\nd = *(immAddr); // immediate address\n\nNotes\n\nDestination d must be in the .reg state space.\n\nA destination register wider than the specified type may be used. The value loaded is sign-extended\n\nto the destination register width for signed integers, and is zero-extended to the destination\n\nregister width for unsigned and bit-size types. See\n\nTable 25\n\nfor a description of these relaxed type-checking rules.\n\n.f16 data may be loaded using ldu.b16, and then converted to .f32 or .f64 using\n\ncvtor can be used in half precision floating point instructions.\n\n.f16x2 data may be loaded using ldu.b32 and then used in half precision floating point\n\ninstructions.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nldu.f64 requires sm_13 or higher.\n\nExamples\n\nldu.global.f32 d,[a];\n\nldu.global.b32 d,[p+4];\n\nldu.global.v4.f32 Q,[p];\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu" }; case "lg2": return { "html": "For more information, visit lg2(fp) .

Floating Point Instructions: lg2

\n\n\n

Find the base-2 logarithm of a value.

\n

Syntax

\n
lg2.approx{.ftz}.f32  d, a;\n
\n
\n

Description

\n

Determine the log2 of a.

\n

Semantics

\n
d = log(a) / log(2);\n
\n
\n

Notes

\n

lg2.approx.f32 implements a fast approximation to log2(a).

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Input

Result

-Inf

NaN

-subnormal

-Inf

-0.0

-Inf

+0.0

-Inf

+subnormal

-Inf

+Inf

+Inf

NaN

NaN

\n

The maximum absolute error is 2^-22.6 for mantissa.

\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

lg2.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
sm_1x

Subnormal inputs and results are flushed to sign-preserving zero.

\n
\n
\n

PTX ISA Notes

\n

lg2.f32 introduced in PTX ISA version 1.0. Explicit modifiers .approx and .ftz\nintroduced in PTX ISA version 1.4.

\n

For PTX ISA version 1.4 and later, the .approx modifier is required.

\n

For PTX ISA versions 1.0 through 1.3, lg2.f32 defaults to lg2.approx.ftz.f32.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
lg2.approx.ftz.f32  la, a;\n
\n
\n
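
Since logb(x) = log2(x) * logb(2), lg2 can be used to derive other logarithms. A minimal sketch, not from the original documentation (the single-precision constant 0f3F317218 is ln(2) ~ 0.6931472), computing a fast approximate natural logarithm:

\n

lg2.approx.f32  %f1, %f0;              // log2(x)\nmul.f32         %f2, %f1, 0f3F317218;  // * ln(2), giving ln(x) approximately\n

\n

\n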
", "tooltip": "Find the base-2 logarithm of a value.\n\nSyntax\n\nlg2.approx{.ftz}.f32 d, a;\n\nDescription\n\nDetermine the log2 of a.\n\nSemantics\n\nd = log(a) / log(2);\n\nNotes\n\nlg2.approx.f32 implements a fast approximation to log2(a).\n\n\n\nInput\n\nResult\n\n-Inf\n\nNaN\n\n-subnormal\n\n-Inf\n\n-0.0\n\n-Inf\n\n+0.0\n\n-Inf\n\n+subnormal\n\n-Inf\n\n+Inf\n\n+Inf\n\nNaN\n\nNaN\n\nThe maximum absolute error is 2-22.6 for mantissa.\n\nSubnormal numbers:\n\nsm_20+By default, subnormal numbers are supported.\n\nlg2.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nsm_1xSubnormal inputs and results to sign-preserving zero.\n\nPTX ISA Notes\n\nlg2.f32 introduced in PTX ISA version 1.0. Explicit modifiers .approx and .ftz\n\nintroduced in PTX ISA version 1.4.\n\nFor PTX ISA version 1.4 and later, the .approx modifier is required.\n\nFor PTX ISA versions 1.0 through 1.3, lg2.f32 defaults to lg2.approx.ftz.f32.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nlg2.approx.ftz.f32 la, a;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2" }; case "loc": return { "html": "For more information, visit loc .

Debugging Directives: .loc

\n\n\n

Source file location.

\n

Syntax

\n
.loc file_index line_number column_position\n.loc file_index line_number column_position,function_name label {+ immediate }, inlined_at file_index2 line_number2 column_position2\n
\n
\n

Description

\n

Declares the source file location (source file, line number, and column position) to be associated\nwith lexically subsequent PTX instructions. .loc refers to file_index which is defined by a\n.file directive.

\n

To indicate PTX instructions that are generated from a function that was inlined, the additional\nattribute .inlined_at can be specified as part of the .loc directive. The .inlined_at\nattribute specifies the source location at which the specified function is inlined. file_index2,\nline_number2, and column_position2 specify the location at which the function is inlined. The source\nlocation specified as part of the .inlined_at attribute must lexically precede the source location in the\n.loc directive.

\n

The function_name attribute specifies an offset in the DWARF section named\n.debug_str. Offset is specified as label expression or label + immediate expression\nwhere label is defined in .debug_str section. DWARF section .debug_str contains ASCII\nnull-terminated strings that specify the name of the function that is inlined.

\n

Note that a PTX instruction may have a single associated source location, determined by the nearest\nlexically preceding .loc directive, or no associated source location if there is no preceding .loc\ndirective. Labels in PTX inherit the location of the closest lexically following instruction. A\nlabel with no following PTX instruction has no associated source location.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

function_name and inlined_at attributes are introduced in PTX ISA version 7.2.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
    .loc 2 4237 0\nL1:                        // line 4237, col 0 of file #2,\n                           // inherited from mov\n    mov.u32  %r1,%r2;      // line 4237, col 0 of file #2\n    add.u32  %r2,%r1,%r3;  // line 4237, col 0 of file #2\n...\nL2:                        // line 4239, col 5 of file #2,\n                           // inherited from sub\n    .loc 2 4239 5\n    sub.u32  %r2,%r1,%r3;  // line 4239, col 5 of file #2\n    .loc 1 21 3\n    .loc 1 9 3, function_name info_string0, inlined_at 1 21 3\n    ld.global.u32   %r1, [gg]; // Function at line 9\n    setp.lt.s32 %p1, %r1, 8;   // inlined at line 21\n    .loc 1 27 3\n    .loc 1 10 5, function_name info_string1, inlined_at 1 27 3\n    .loc 1 15 3, function_name .debug_str+16, inlined_at 1 10 5\n    setp.ne.s32 %p2, %r1, 18;\n    @%p2 bra    BB2_3;\n\n    .section .debug_str {\n    info_string0:\n     .b8 95  // _\n     .b8 90  // z\n     .b8 51  // 3\n     .b8 102 // f\n     .b8 111 // o\n     .b8 111 // o\n     .b8 118 // v\n     .b8 0\n\n    info_string1:\n     .b8 95  // _\n     .b8 90  // z\n     .b8 51  // 3\n     .b8 98  // b\n     .b8 97  // a\n     .b8 114 // r\n     .b8 118 // v\n     .b8 0\n     .b8 95  // _\n     .b8 90  // z\n     .b8 51  // 3\n     .b8 99  // c\n     .b8 97  // a\n     .b8 114 // r\n     .b8 118 // v\n     .b8 0\n    }\n
\n
\n
", "tooltip": "Source file location.\n\nSyntax\n\n.loc file_index line_number column_position\n\n.loc file_index line_number column_position,function_name label {+ immediate }, inlined_at file_index2 line_number2 column_position2\n\nDescription\n\nDeclares the source file location (source file, line number, and column position) to be associated\n\nwith lexically subsequent PTX instructions. .loc refers to file_index which is defined by a\n\n.file directive.\n\nTo indicate PTX instructions that are generated from a function that got inlined, additional\n\nattribute .inlined_at can be specified as part of the .loc directive. .inlined_at\n\nattribute specifies source location at which the specified function is inlined. file_index2,\n\nline_number2, and column_position2 specify the location at which function is inlined. Source\n\nlocation specified as part of .inlined_at directive must lexically precede as source location in\n\n.loc directive.\n\nThe function_name attribute specifies an offset in the DWARF section named\n\n.debug_str. Offset is specified as label expression or label + immediate expression\n\nwhere label is defined in .debug_str section. DWARF section .debug_str contains ASCII\n\nnull-terminated strings that specify the name of the function that is inlined.\n\nNote that a PTX instruction may have a single associated source location, determined by the nearest\n\nlexically preceding .loc directive, or no associated source location if there is no preceding .loc\n\ndirective. Labels in PTX inherit the location of the closest lexically following instruction. A\n\nlabel with no following PTX instruction has no associated source location.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nfunction_name and inlined_at attributes are introduced in PTX ISA version 7.2.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n .loc 2 4237 0\n\nL1: // line 4237, col 0 of file #2,\n\n // inherited from mov\n\n mov.u32 %r1,%r2; // line 4237, col 0 of file #2\n\n add.u32 %r2,%r1,%r3; // line 4237, col 0 of file #2\n\n...\n\nL2: // line 4239, col 5 of file #2,\n\n // inherited from sub\n\n .loc 2 4239 5\n\n sub.u32 %r2,%r1,%r3; // line 4239, col 5 of file #2\n\n .loc 1 21 3\n\n .loc 1 9 3, function_name info_string0, inlined_at 1 21 3\n\n ld.global.u32 %r1, [gg]; // Function at line 9\n\n setp.lt.s32 %p1, %r1, 8; // inlined at line 21\n\n .loc 1 27 3\n\n .loc 1 10 5, function_name info_string1, inlined_at 1 27 3\n\n .loc 1 15 3, function_name .debug_str+16, inlined_at 1 10 5\n\n setp.ne.s32 %p2, %r1, 18;\n\n @%p2 bra BB2_3;\n\n .section .debug_str {\n\n info_string0:\n\n .b8 95 // _\n\n .b8 90 // z\n\n .b8 51 // 3\n\n .b8 102 // f\n\n .b8 111 // o\n\n .b8 111 // o\n\n .b8 118 // v\n\n .b8 0\n\n info_string1:\n\n .b8 95 // _\n\n .b8 90 // z\n\n .b8 51 // 3\n\n .b8 98 // b\n\n .b8 97 // a\n\n .b8 114 // r\n\n .b8 118 // v\n\n .b8 0\n\n .b8 95 // _\n\n .b8 90 // z\n\n .b8 51 // 3\n\n .b8 99 // c\n\n .b8 97 // a\n\n .b8 114 // r\n\n .b8 118 // v\n\n .b8 0\n\n }\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-loc" }; case "lop3": return { "html": "For more information, visit lop3 .

Logic and Shift Instructions: lop3

\n\n\n

Arbitrary logical operation on 3 inputs.

\n

Syntax

\n
lop3.b32 d, a, b, c, immLut;\n
\n
\n

Description

\n

Compute bitwise logical operation on inputs a, b, c and store the result in destination\nd.

\n

The logical operation is defined by a look-up table which, for 3 inputs, can be represented as an\n8-bit value specified by operand immLut as described below. immLut is an integer constant\nthat can take values from 0 to 255, thereby allowing up to 256 distinct logical operations on inputs\na, b, c.

\n

For a logical operation F(a, b, c) the value of immLut can be computed by applying the same\noperation to three predefined constant values as follows:

\n
ta = 0xF0;\ntb = 0xCC;\ntc = 0xAA;\n\nimmLut = F(ta, tb, tc);\n
\n
\n

Examples:

\n
If F = (a & b & c);\nimmLut = 0xF0 & 0xCC & 0xAA = 0x80\n\nIf F = (a | b | c);\nimmLut = 0xF0 | 0xCC | 0xAA = 0xFE\n\nIf F = (a & b & ~c);\nimmLut = 0xF0 & 0xCC & (~0xAA) = 0x40\n\nIf F = ((a & b | c) ^ a);\nimmLut = (0xF0 & 0xCC | 0xAA) ^ 0xF0 = 0x1A\n
\n
\n

The following table illustrates computation of immLut for various logical operations:

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

ta

tb

tc

Oper 0 (False)

Oper 1 (ta & tb & tc)

Oper 2 (ta & tb & ~tc)

\u2026

Oper 254 (ta | tb | tc)

Oper 255 (True)

0

0

0

0

0

0

\u2026

0

1

0

0

1

0

0

0

1

1

0

1

0

0

0

0

1

1

0

1

1

0

0

0

1

1

1

0

0

0

0

0

1

1

1

0

1

0

0

0

1

1

1

1

0

0

0

1

1

1

1

1

1

0

1

0

1

1

immLut

0x0

0x80

0x40

\u2026

0xFE

0xFF

\n

Semantics

\n
F = GetFunctionFromTable(immLut); // returns the function corresponding to immLut value\nd = F(a, b, c);\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 4.3.

\n

Target ISA Notes

\n

Requires sm_50 or higher.

\n

Examples

\n
lop3.b32  d, a, b, c, 0x40;\n
\n
\n
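
As one more worked example (a sketch following the recipe above; operand names as in the syntax): for F = (a ^ b) & c, immLut = (0xF0 ^ 0xCC) & 0xAA = 0x3C & 0xAA = 0x28:

\n

lop3.b32  d, a, b, c, 0x28;   // d = (a ^ b) & c\n

\n

\n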
", "tooltip": "Arbitrary logical operation on 3 inputs.\n\nSyntax\n\nlop3.b32 d, a, b, c, immLut;\n\nDescription\n\nCompute bitwise logical operation on inputs a, b, c and store the result in destination\n\nd.\n\nThe logical operation is defined by a look-up table which, for 3 inputs, can be represented as an\n\n8-bit value specified by operand immLut as described below. immLut is an integer constant\n\nthat can take values from 0 to 255, thereby allowing up to 256 distinct logical operations on inputs\n\na, b, c.\n\nFor a logical operation F(a, b, c) the value of immLut can be computed by applying the same\n\noperation to three predefined constant values as follows:\n\nta = 0xF0;\n\ntb = 0xCC;\n\ntc = 0xAA;\n\nimmLut = F(ta, tb, tc);\n\nExamples:\n\nIf F = (a & b & c);\n\nimmLut = 0xF0 & 0xCC & 0xAA = 0x80\n\nIf F = (a | b | c);\n\nimmLut = 0xF0 | 0xCC | 0xAA = 0xFE\n\nIf F = (a & b & ~c);\n\nimmLut = 0xF0 & 0xCC & (~0xAA) = 0x40\n\nIf F = ((a & b | c) ^ a);\n\nimmLut = (0xF0 & 0xCC | 0xAA) ^ 0xF0 = 0x1A\n\nThe following table illustrates computation of immLut for various logical operations:\n\n\n\n\n\n\n\nta\n\ntb\n\ntc\n\nOper 0 (False)\n\nOper 1 (ta & tb & tc)\n\nOper 2 (ta & tb & ~tc)\n\n\u2026\n\nOper 254 (ta | tb | tc)\n\nOper 255 (True)\n\n0\n\n0\n\n0\n\n0\n\n0\n\n0\n\n\u2026\n\n0\n\n1\n\n0\n\n0\n\n1\n\n0\n\n0\n\n0\n\n1\n\n1\n\n0\n\n1\n\n0\n\n0\n\n0\n\n0\n\n1\n\n1\n\n0\n\n1\n\n1\n\n0\n\n0\n\n0\n\n1\n\n1\n\n1\n\n0\n\n0\n\n0\n\n0\n\n0\n\n1\n\n1\n\n1\n\n0\n\n1\n\n0\n\n0\n\n0\n\n1\n\n1\n\n1\n\n1\n\n0\n\n0\n\n0\n\n1\n\n1\n\n1\n\n1\n\n1\n\n1\n\n0\n\n1\n\n0\n\n1\n\n1\n\nimmLut\n\n0x0\n\n0x80\n\n0x40\n\n\u2026\n\n0xFE\n\n0xFF\n\nSemantics\n\nF = GetFunctionFromTable(immLut); // returns the function corresponding to immLut value\n\nd = F(a, b, c);\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 4.3.\n\nTarget ISA Notes\n\nRequires sm_50 or higher.\n\nExamples\n\nlop3.b32 d, a, b, c, 0x40;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3" }; case "mad": return { "html": "For more information, visit mad(fp) , mad(int) , mad.cc .

Floating Point Instructions: mad

\n\n\n

Multiply two values and add a third value.

\n

Syntax

\n
mad{.ftz}{.sat}.f32      d, a, b, c;    // .target sm_1x\nmad.rnd{.ftz}{.sat}.f32  d, a, b, c;    // .target sm_20\nmad.rnd.f64              d, a, b, c;    // .target sm_13 and higher\n\n.rnd = { .rn, .rz, .rm, .rp };\n
\n
\n

Description

\n

Multiplies two values and adds a third, and then writes the resulting value into a destination\nregister.

\n

Semantics

\n
d = a*b + c;\n
\n
\n

Notes

\n

For .target sm_20 and higher:

\n
    \n
  • mad.f32 computes the product of a and b to infinite precision and then adds c to\nthis product, again in infinite precision. The resulting value is then rounded to single precision\nusing the rounding mode specified by .rnd.

  • \n
  • mad.f64 computes the product of a and b to infinite precision and then adds c to\nthis product, again in infinite precision. The resulting value is then rounded to double precision\nusing the rounding mode specified by .rnd.

  • \n
  • mad.{f32,f64} is the same as fma.{f32,f64}.

  • \n
\n

For .target sm_1x:

\n
    \n
  • mad.f32 computes the product of a and b at double precision, and then the mantissa is\ntruncated to 23 bits, but the exponent is preserved. Note that this is different from computing\nthe product with mul, where the mantissa can be rounded and the exponent will be clamped. The\nexception for mad.f32 is when c = +/-0.0: mad.f32 is then identical to the result computed\nusing separate mul and add instructions. When JIT-compiled for SM 2.0 devices, mad.f32 is\nimplemented as a fused multiply-add (i.e., fma.rn.ftz.f32); in this case, mad.f32 can\nproduce slightly different numeric results and backward compatibility is not guaranteed.

  • \n
  • mad.f64 computes the product of a and b to infinite precision and then adds c to\nthis product, again in infinite precision. The resulting value is then rounded to double precision\nusing the rounding mode specified by .rnd. Unlike mad.f32, the treatment of subnormal\ninputs and output follows IEEE 754 standard.

  • \n
  • mad.f64 is the same as fma.f64.

  • \n
\n

Rounding modifiers (no default):

\n
\n
.rn

mantissa LSB rounds to nearest even

\n
\n
.rz

mantissa LSB rounds towards zero

\n
\n
.rm

mantissa LSB rounds towards negative infinity

\n
\n
.rp

mantissa LSB rounds towards positive infinity

\n
\n
\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

mad.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
sm_1x

mad.f64 supports subnormal numbers.

\n

mad.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
\n

Saturation modifier:

\n

mad.sat.f32 clamps the result to [0.0, 1.0]. NaN results are flushed to +0.0f.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

In PTX ISA versions 1.4 and later, a rounding modifier is required for mad.f64.

\n

Legacy mad.f64 instructions having no rounding modifier will map to mad.rn.f64.

\n

In PTX ISA versions 2.0 and later, a rounding modifier is required for mad.f32 for sm_20 and higher targets.

\n

Errata

\n

mad.f32 requires a rounding modifier for sm_20 and higher targets. However, for PTX ISA\nversion 3.0 and earlier, ptxas does not enforce this requirement and mad.f32 silently defaults\nto mad.rn.f32. For PTX ISA version 3.1, ptxas generates a warning and defaults to\nmad.rn.f32, and in subsequent releases ptxas will enforce the requirement for PTX ISA version\n3.2 and later.

\n

Target ISA Notes

\n

mad.f32 supported on all target architectures.

\n

mad.f64 requires sm_13 or higher.

\n

Rounding modifiers have the following target requirements:

\n
    \n
  • .rn,.rz,.rm,.rp for mad.f64, requires sm_13 or higher.

  • \n
  • .rn,.rz,.rm,.rp for mad.f32, requires sm_20 or higher.

  • \n
\n

Examples

\n
@p  mad.f32  d,a,b,c;\n
\n
\n
\n

Integer Arithmetic Instructions: mad

\n\n\n

Multiply two values, optionally extract the high or low half of the intermediate result, and add a third value.

\n

Syntax

\n
mad.mode.type  d, a, b, c;\nmad.hi.sat.s32 d, a, b, c;\n\n.mode = { .hi, .lo, .wide };\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64 };\n
\n
\n

Description

\n

Multiplies two values, optionally extracts the high or low half of the intermediate result, and adds\na third value. Writes the result into a destination register.

\n

Semantics

\n
t = a * b;\nn = bitwidth of type;\nd = t + c;           // for .wide\nd = t<2n-1..n> + c;  // for .hi variant\nd = t<n-1..0> + c;   // for .lo variant\n
\n
\n

Notes

\n

The type of the operation represents the types of the a and b operands. If .hi or .lo is\nspecified, then d and c are the same size as a and b, and either the upper or lower\nhalf of the result is written to the destination register. If .wide is specified, then d and\nc are twice as wide as a and b to receive the result of the multiplication.

\n

The .wide suffix is supported only for 16-bit and 32-bit integer types.

\n

Saturation modifier:

\n
\n
.sat

limits result to MININT..MAXINT (no overflow) for the size of the operation.

\n

Applies only to .s32 type in .hi mode.

\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
@p  mad.lo.s32 d,a,b,c;\n    mad.lo.s32 r,p,q,r;\n
\n
\n
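
A short sketch of the .wide form (not from the original examples; register names are hypothetical): a 16x16-bit multiply whose full 32-bit product is accumulated into a 32-bit register:

\n

mad.wide.u16  %r1, %h0, %h1, %r1;   // 32-bit result: %r1 += %h0 * %h1\n

\n

\n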
\n

Extended-Precision Arithmetic Instructions: mad.cc

\n\n\n

Multiply two values, extract high or low half of result, and add a third value with carry-out.

\n

Syntax

\n
mad{.hi,.lo}.cc.type  d, a, b, c;\n\n.type = { .u32, .s32, .u64, .s64 };\n
\n
\n

Description

\n

Multiplies two values, extracts either the high or low part of the result, and adds a third\nvalue. Writes the result to the destination register and the carry-out from the addition into the\ncondition code register.

\n

Semantics

\n
t = a * b;\nd = t<63..32> + c;    // for .hi variant\nd = t<31..0> + c;     // for .lo variant\n
\n
\n

carry-out from addition is written to CC.CF

\n

Notes

\n

Generally used in combination with madc and addc to implement extended-precision multi-word\nmultiplication. See madc for an example.

\n

PTX ISA Notes

\n

32-bit mad.cc introduced in PTX ISA version 3.0.

\n

64-bit mad.cc introduced in PTX ISA version 4.3.

\n

Target ISA Notes

\n

Requires target sm_20 or higher.

\n

Examples

\n
@p  mad.lo.cc.u32 d,a,b,c;\n    mad.lo.cc.u32 r,p,q,r;\n
\n
\n
", "tooltip": "=====Floating Point Instructions: mad\n\n\n\nMultiply two values and add a third value.\n\nSyntax\n\nmad{.ftz}{.sat}.f32 d, a, b, c; // .target sm_1x\n\nmad.rnd{.ftz}{.sat}.f32 d, a, b, c; // .target sm_20\n\nmad.rnd.f64 d, a, b, c; // .target sm_13 and higher\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nMultiplies two values and adds a third, and then writes the resulting value into a destination\n\nregister.\n\nSemantics\n\nd = a*b + ...\n\n=====Integer Arithmetic Instructions: mad\n\n\n\nMultiply two values, optionally extract the high or low half of the intermediate result, and add a third value.\n\nSyntax\n\nmad.mode.type d, a, b, c;\n\nmad.hi.sat.s32 d, a, b, c;\n\n.mode = { .hi, .lo, .wide };\n\n.type = { .u16, .u32, .u64,\n\n .s16, .s32, .s64 };\n\nDescription\n\nMultiplies two values, optionally extracts the high or low half of the intermediate result, and adds\n\na third value. Writes the r...\n\n=====Extended-Precision Arithmetic Instructions: mad.cc\n\n\n\nMultiply two values, extract high or low half of result, and add a third value with carry-out.\n\nSyntax\n\nmad{.hi,.lo}.cc.type d, a, b, c;\n\n.type = { .u32, .s32, .u64, .s64 };\n\nDescription\n\nMultiplies two values, extracts either the high or low part of the result, and adds a third\n\nvalue. Writes the result to the destination register and the carry-out from the addition into the\n\ncondition code register.\n\nS... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad" }; case "mad24": return { "html": "For more information, visit mad24(int) .

Integer Arithmetic Instructions: mad24

\n\n\n

Multiply two 24-bit integer values and add a third value.

\n

Syntax

\n
mad24.mode.type  d, a, b, c;\nmad24.hi.sat.s32 d, a, b, c;\n\n.mode = { .hi, .lo };\n.type = { .u32, .s32 };\n
\n
\n

Description

\n

Compute the product of two 24-bit integer values held in 32-bit source registers, and add a third,\n32-bit value to either the high or low 32 bits of the 48-bit result. Return either the high or low\n32 bits of the 48-bit result.

\n

Semantics

\n
t = a * b;\nd = t<47..16> + c;   // for .hi variant\nd = t<31..0> + c;    // for .lo variant\n
\n
\n

Notes

\n

Integer multiplication yields a result that is twice the size of the input operands, i.e., 48 bits.

\n

mad24.hi performs a 24x24-bit multiply and adds the high 32 bits of the 48-bit result to a third\nvalue.

\n

mad24.lo performs a 24x24-bit multiply and adds the low 32 bits of the 48-bit result to a third\nvalue.

\n

All operands are of the same type and size.

\n

Saturation modifier:

\n
\n
.sat

limits result of 32-bit signed addition to MININT..MAXINT (no overflow). Applies only to\n.s32 type in .hi mode.

\n
\n
\n

mad24.hi may be less efficient on machines without hardware support for 24-bit multiply.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
mad24.lo.s32 d,a,b,c;   // low 32-bits of 24x24-bit signed multiply.\n
\n
\n
", "tooltip": "Multiply two 24-bit integer values and add a third value.\n\nSyntax\n\nmad24.mode.type d, a, b, c;\n\nmad24.hi.sat.s32 d, a, b, c;\n\n.mode = { .hi, .lo };\n\n.type = { .u32, .s32 };\n\nDescription\n\nCompute the product of two 24-bit integer values held in 32-bit source registers, and add a third,\n\n32-bit value to either the high or low 32-bits of the 48-bit result. Return either the high or low\n\n32-bits of the 48-bit result.\n\nSemantics\n\nt = a * b;\n\nd = t<47..16> + c; // for .hi variant\n\nd = t<31..0> + c; // for .lo variant\n\nNotes\n\nInteger multiplication yields a result that is twice the size of the input operands, i.e., 48-bits.\n\nmad24.hi performs a 24x24-bit multiply and adds the high 32 bits of the 48-bit result to a third\n\nvalue.\n\nmad24.lo performs a 24x24-bit multiply and adds the low 32 bits of the 48-bit result to a third\n\nvalue.\n\nAll operands are of the same type and size.\n\nSaturation modifier:\n\n.satlimits result of 32-bit signed addition to MININT..MAXINT (no overflow). Applies only to\n\n.s32 type in .hi mode.\n\nmad24.hi may be less efficient on machines without hardware support for 24-bit multiply.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmad24.lo.s32 d,a,b,c; // low 32-bits of 24x24-bit signed multiply.\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mad24" }; case "madc": return { "html": "For more information, visit madc .

Extended-Precision Arithmetic Instructions: madc

\n\n\n

Multiply two values, extract high or low half of result, and add a third value with carry-in and\noptional carry-out.

\n

Syntax

\n
madc{.hi,.lo}{.cc}.type  d, a, b, c;\n\n.type = { .u32, .s32, .u64, .s64 };\n
\n
\n

Description

\n

Multiplies two values, extracts either the high or low part of the result, and adds a third value\nalong with carry-in. Writes the result to the destination register and optionally writes the\ncarry-out from the addition into the condition code register.

\n

Semantics

\n
t = a * b;\nd = t<63..32> + c + CC.CF;     // for .hi variant\nd = t<31..0> + c + CC.CF;      // for .lo variant\n
\n
\n

if .cc specified, carry-out from addition is written to CC.CF

\n

Notes

\n

Generally used in combination with mad.cc and addc to implement extended-precision\nmulti-word multiplication. See example below.

\n

PTX ISA Notes

\n

32-bit madc introduced in PTX ISA version 3.0.

\n

64-bit madc introduced in PTX ISA version 4.3.

\n

Target ISA Notes

\n

Requires target sm_20 or higher.

\n

Examples

\n
// extended-precision multiply:  [r3,r2,r1,r0] = [r5,r4] * [r7,r6]\nmul.lo.u32     r0,r4,r6;      // r0=(r4*r6).[31:0], no carry-out\nmul.hi.u32     r1,r4,r6;      // r1=(r4*r6).[63:32], no carry-out\nmad.lo.cc.u32  r1,r5,r6,r1;   // r1+=(r5*r6).[31:0], may carry-out\nmadc.hi.u32    r2,r5,r6,0;    // r2 =(r5*r6).[63:32]+carry-in,\n                              // no carry-out\nmad.lo.cc.u32   r1,r4,r7,r1;  // r1+=(r4*r7).[31:0], may carry-out\nmadc.hi.cc.u32  r2,r4,r7,r2;  // r2+=(r4*r7).[63:32]+carry-in,\n                              // may carry-out\naddc.u32        r3,0,0;       // r3 = carry-in, no carry-out\nmad.lo.cc.u32   r2,r5,r7,r2;  // r2+=(r5*r7).[31:0], may carry-out\nmadc.hi.u32     r3,r5,r7,r3;  // r3+=(r5*r7).[63:32]+carry-in\n
\n
\n
", "tooltip": "Multiply two values, extract high or low half of result, and add a third value with carry-in and\n\noptional carry-out.\n\nSyntax\n\nmadc{.hi,.lo}{.cc}.type d, a, b, c;\n\n.type = { .u32, .s32, .u64, .s64 };\n\nDescription\n\nMultiplies two values, extracts either the high or low part of the result, and adds a third value\n\nalong with carry-in. Writes the result to the destination register and optionally writes the\n\ncarry-out from the addition into the condition code register.\n\nSemantics\n\nt = a * b;\n\nd = t<63..32> + c + CC.CF; // for .hi variant\n\nd = t<31..0> + c + CC.CF; // for .lo variant\n\nif .cc specified, carry-out from addition is written to CC.CF\n\nNotes\n\nGenerally used in combination with mad.cc and addc to implement extended-precision\n\nmulti-word multiplication. See example below.\n\nPTX ISA Notes\n\n32-bit madc introduced in PTX ISA version 3.0.\n\n64-bit madc introduced in PTX ISA version 4.3.\n\nTarget ISA Notes\n\nRequires target sm_20 or higher.\n\nExamples\n\n// extended-precision multiply: [r3,r2,r1,r0] = [r5,r4] * [r7,r6]\n\nmul.lo.u32 r0,r4,r6; // r0=(r4*r6).[31:0], no carry-out\n\nmul.hi.u32 r1,r4,r6; // r1=(r4*r6).[63:32], no carry-out\n\nmad.lo.cc.u32 r1,r5,r6,r1; // r1+=(r5*r6).[31:0], may carry-out\n\nmadc.hi.u32 r2,r5,r6,0; // r2 =(r5*r6).[63:32]+carry-in,\n\n // no carry-out\n\nmad.lo.cc.u32 r1,r4,r7,r1; // r1+=(r4*r7).[31:0], may carry-out\n\nmadc.hi.cc.u32 r2,r4,r7,r2; // r2+=(r4*r7).[63:32]+carry-in,\n\n // may carry-out\n\naddc.u32 r3,0,0; // r3 = carry-in, no carry-out\n\nmad.lo.cc.u32 r2,r5,r7,r2; // r2+=(r5*r7).[31:0], may carry-out\n\nmadc.hi.u32 r3,r5,r7,r3; // r3+=(r5*r7).[63:32]+carry-in\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc" }; case "mapa": return { "html": "For more information, visit mapa .

Data Movement and Conversion Instructions: mapa

\n\n\n

Map the address of the shared variable in the target CTA.

\n

Syntax

\n
mapa{.space}.type          d, a, b;\n\n// Maps shared memory address in register a into CTA b.\nmapa.shared::cluster.type  d, a, b;\n\n// Maps shared memory variable into CTA b.\nmapa.shared::cluster.type  d, sh, b;\n\n// Maps shared memory variable into CTA b.\nmapa.shared::cluster.type  d, sh + imm, b;\n\n// Maps generic address in register a into CTA b.\nmapa.type                  d, a, b;\n\n.space = { .shared::cluster }\n.type  = { .u32, .u64 }\n
\n
\n

Description

\n

Get the address in the CTA specified by operand b that corresponds to the address specified by\noperand a.

\n

Instruction type .type indicates the type of the destination operand d and the source\noperand a.

\n

When space is .shared::cluster, source a is either a shared memory variable or a register\ncontaining a valid shared memory address and register d contains a shared memory address. When\nthe optional qualifier .space is not specified, both a and d are registers containing\ngeneric addresses pointing to shared memory.

\n

b is a 32-bit integer operand representing the rank of the target CTA.

\n

Destination register d will hold an address in CTA b corresponding to operand a.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
mapa.shared::cluster.u64 d1, %reg1, cta;\nmapa.shared::cluster.u32 d2, sh, 3;\nmapa.u64                 d3, %reg2, cta;\n
\n
\n
", "tooltip": "Map the address of the shared variable in the target CTA.\n\nSyntax\n\nmapa{.space}.type d, a, b;\n\n// Maps shared memory address in register a into CTA b.\n\nmapa.shared::cluster.type d, a, b;\n\n// Maps shared memory variable into CTA b.\n\nmaps.shared::cluster.type d, sh, b;\n\n// Maps shared memory variable into CTA b.\n\nmaps.shared::cluster.type d, sh + imm, b;\n\n// Maps generic address in register a into CTA b.\n\nmapa.type d, a, b;\n\n.space = { .shared::cluster }\n\n.type = { .u32, .u64 }\n\nDescription\n\nGet address in the CTA specified by operand b which corresponds to the address specified by\n\noperand a.\n\nInstruction type .type indicates the type of the destination operand d and the source\n\noperand a.\n\nWhen space is .shared::cluster, source a is either a shared memory variable or a register\n\ncontaining a valid shared memory address and register d contains a shared memory address. When\n\nthe optional qualifier .space is not specified, both a and d are registers containing\n\ngeneric addresses pointing to shared memory.\n\nb is a 32-bit integer operand representing the rank of the target CTA.\n\nDestination register d will hold an address in CTA b corresponding to operand a.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\nmapa.shared::cluster.u64 d1, %reg1, cta;\n\nmapa.shared::cluster.u32 d2, sh, 3;\n\nmapa.u64 d3, %reg2, cta;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa" }; case "match": return { "html": "For more information, visit match.sync .

Parallel Synchronization and Communication Instructions: match.sync

\n\n\n

Broadcast and compare a value across threads in warp.

\n

Syntax

\n
match.any.sync.type  d, a, membermask;\nmatch.all.sync.type  d[|p], a, membermask;\n\n.type = { .b32, .b64 };\n
\n
\n

Description

\n

match.sync will cause the executing thread to wait until all non-exited threads from membermask\nhave executed match.sync with the same qualifiers and the same membermask value before resuming\nexecution.

\n

Operand membermask specifies a 32-bit integer which is a mask indicating the threads participating\nin this instruction, where each bit position corresponds to a thread\u2019s laneid.

\n

match.sync performs broadcast and compare of operand a across all non-exited threads in\nmembermask and sets destination d and optional predicate p based on mode.

\n

Operand a has instruction type and d has .b32 type.

\n

Destination d is a 32-bit mask where bit position in mask corresponds to thread\u2019s laneid.

\n

The matching operation modes are:

\n
\n
.all

d is set to the mask corresponding to the non-exited threads in membermask if all non-exited\nthreads in membermask have the same value of operand a; otherwise d is set\nto 0. Optionally, predicate p is set to true if all non-exited threads in membermask have the\nsame value of operand a; otherwise p is set to false. The sink symbol \u2018_\u2019 may be used in\nplace of any one of the destination operands.

\n
\n
.any

d is set to the mask of non-exited threads in membermask that have the same value of operand\na.

\n
\n
\n

The behavior of match.sync is undefined if the executing thread is not in the membermask.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 6.0.

\n

Target ISA Notes

\n

Requires sm_70 or higher.

\n

Release Notes

\n

Note that match.sync applies to threads in a single warp, not across an entire CTA.

\n

Examples

\n
match.any.sync.b32    d, a, 0xffffffff;\nmatch.all.sync.b64    d|p, a, mask;\n
\n
\n
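
A sketch of a common use (not from the original examples; register names and the branch target are hypothetical): testing whether a value is uniform across the full warp before taking a specialized path, using the sink symbol for the unused mask destination:

\n

match.all.sync.b32  _|%p, %val, 0xffffffff; // %p: all active lanes agree on %val\n@%p bra  UNIFORM_PATH;                      // hypothetical uniform fast path\n

\n

\n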
", "tooltip": "Broadcast and compare a value across threads in warp.\n\nSyntax\n\nmatch.any.sync.type d, a, membermask;\n\nmatch.all.sync.type d[|p], a, membermask;\n\n.type = { .b32, .b64 };\n\nDescription\n\nmatch.sync will cause executing thread to wait until all non-exited threads from membermask\n\nhave executed match.sync with the same qualifiers and same membermask value before resuming\n\nexecution.\n\nOperand membermask specifies a 32-bit integer which is a mask indicating threads participating\n\nin this instruction where the bit position corresponds to thread\u2019s laneid.\n\nmatch.sync performs broadcast and compare of operand a across all non-exited threads in\n\nmembermask and sets destination d and optional predicate p based on mode.\n\nOperand a has instruction type and d has .b32 type.\n\nDestination d is a 32-bit mask where bit position in mask corresponds to thread\u2019s laneid.\n\nThe matching operation modes are:\n\n.alld is set to mask corresponding to non-exited threads in membermask if all non-exited\n\nthreads in membermask have same value of operand a; otherwise d is set\n\nto 0. Optionally predicate p is set to true if all non-exited threads in membermask have\n\nsame value of operand a; otherwise p is set to false. The sink symbol \u2018_\u2019 may be used in\n\nplace of any one of the destination operands.\n\n.anyd is set to mask of non-exited threads in membermask that have same value of operand\n\na.\n\nThe behavior of match.sync is undefined if the executing thread is not in the membermask.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.0.\n\nTarget ISA Notes\n\nRequires sm_70 or higher.\n\nRelease Notes\n\nNote that match.sync applies to threads in a single warp, not across an entire CTA.\n\nExamples\n\nmatch.any.sync.b32 d, a, 0xffffffff;\n\nmatch.all.sync.b64 d|p, a, mask;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync" }; case "max": return { "html": "For more information, visit max(fp) , max(fp16) , max(int) .

Floating Point Instructions: max

\n\n\n

Find the maximum of two values.

\n

Syntax

\n
max{.ftz}{.NaN}{.xorsign.abs}.f32  d, a, b;\nmax.f64                            d, a, b;\n
\n
\n

Description

\n

Store the maximum of a and b in d.

\n

If .NaN modifier is specified, the result is canonical NaN if either of the inputs is\nNaN.

\n

If .abs modifier is specified, the magnitude of destination operand d is the maximum of\nabsolute values of both the input arguments.

\n

If .xorsign modifier is specified, the sign bit of destination d is equal to the XOR of the\nsign bits of both the inputs.

\n

Modifiers .abs and .xorsign must be specified together and .xorsign considers the sign\nbit of both inputs before applying the .abs operation.

\n

If the result of max is NaN then the .xorsign and .abs modifiers will be ignored.

\n
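
For instance (illustrative values), max.xorsign.abs.f32 with a = -3.0 and b = 2.0 computes xorsign = 1 ^ 0 = 1, takes the maximum of |a| = 3.0 and |b| = 2.0, and stores -3.0 in d after the XORed sign bit is applied.
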

Semantics

\n
if (.xorsign) {\n    xorsign = getSignBit(a) ^ getSignBit(b);\n    if (.abs) {\n        a = |a|;\n        b = |b|;\n    }\n}\nif (isNaN(a) && isNaN(b))                 d = NaN;\nelse if (.NaN && (isNaN(a) || isNaN(b)))  d = NaN;\nelse if (isNaN(a))                        d = b;\nelse if (isNaN(b))                        d = a;\nelse                                      d = (a > b) ? a : b;\nif (.xorsign && !isNaN(d)) {\n    setSignBit(d, xorsign);\n}\n
\n
\n

Notes

\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

max.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
sm_1x

max.f64 supports subnormal numbers.

\n

max.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
\n

If values of both inputs are 0.0, then +0.0 > -0.0.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

max.NaN introduced in PTX ISA version 7.0.

\n

max.xorsign.abs introduced in PTX ISA version 7.2.

\n

Target ISA Notes

\n

max.f32 supported on all target architectures.

\n

max.f64 requires sm_13 or higher.

\n

max.NaN requires sm_80 or higher.

\n

max.xorsign.abs requires sm_86 or higher.

\n

Examples

\n
max.ftz.f32  f0,f1,f2;\nmax.f64      a,b,c;\n// fp32 max with .NaN\nmax.NaN.f32  f0,f1,f2;\n// fp32 max with .xorsign.abs\nmax.xorsign.abs.f32 Rd, Ra, Rb;\n
\n
\n
\n

Half Precision Floating Point Instructions: max

\n\n\n

Find the maximum of two values.

\n

Syntax

\n
max{.ftz}{.NaN}{.xorsign.abs}.f16      d, a, b;\nmax{.ftz}{.NaN}{.xorsign.abs}.f16x2    d, a, b;\nmax{.NaN}{.xorsign.abs}.bf16           d, a, b;\nmax{.NaN}{.xorsign.abs}.bf16x2         d, a, b;\n
\n
\n

Description

\n

Store the maximum of a and b in d.

\n

For .f16x2 and .bf16x2 instruction types, input vectors are formed with half-word values\nfrom source operands. Half-word operands are then processed in parallel to store .f16x2 or\n.bf16x2 result in destination.

\n

For .f16 instruction type, operands d and a have .f16 or .b16 type. For\n.f16x2 instruction type, operands d and a have .f16x2 or .b32 type. For\n.bf16 instruction type, operands d and a have .b16 type. For .bf16x2 instruction\ntype, operands d and a have .b32 type.

\n

If .NaN modifier is specified, the result is canonical NaN if either of the inputs is\nNaN.

\n

If .abs modifier is specified, the magnitude of destination operand d is the maximum of\nabsolute values of both the input arguments.

\n

If .xorsign modifier is specified, the sign bit of destination d is equal to the XOR of the\nsign bits of both the inputs.

\n

Modifiers .abs and .xorsign must be specified together and .xorsign considers the sign\nbit of both inputs before applying the .abs operation.

\n

If the result of max is NaN then the .xorsign and .abs modifiers will be ignored.

\n

Semantics

\n
if (type == f16 || type == bf16) {\n    if (.xorsign) {\n        xorsign = getSignBit(a) ^ getSignBit(b);\n        if (.abs) {\n            a = |a|;\n            b = |b|;\n        }\n    }\n    if (isNaN(a) && isNaN(b))                 d = NaN;\n    else if (.NaN && (isNaN(a) || isNaN(b)))  d = NaN;\n    else if (isNaN(a))                        d = b;\n    else if (isNaN(b))                        d = a;\n    else                                      d = (a > b) ? a : b;\n    if (.xorsign && !isNaN(d)) {\n         setSignBit(d, xorsign);\n    }\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    for (i = 0; i < 2; i++) {\n        if (.xorsign) {\n            xorsign = getSignBit(fA[i]) ^ getSignBit(fB[i]);\n            if (.abs) {\n                fA[i] = |fA[i]|;\n                fB[i] = |fB[i]|;\n            }\n        }\n        if (isNaN(fA[i]) && isNaN(fB[i]))                 d[i] = NaN;\n        else if (.NaN && (isNaN(fA[i]) || isNaN(fB[i])))  d[i] = NaN;\n        else if (isNaN(fA[i]))                            d[i] = fB[i];\n        else if (isNaN(fB[i]))                            d[i] = fA[i];\n        else                                              d[i] = (fA[i] > fB[i]) ? fA[i] : fB[i];\n        if (.xorsign && !isNaN(d[i])) {\n            setSignBit(d[i], xorsign);\n        }\n    }\n}\n
\n
\n

Notes

\n
\n
Subnormal numbers:

By default, subnormal numbers are supported.\nmax.ftz.{f16, f16x2} flushes subnormal inputs and results to sign-preserving zero.

\n
\n
\n

If values of both inputs are 0.0, then +0.0 > -0.0.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

max.xorsign.abs introduced in PTX ISA version 7.2.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

max.xorsign.abs support requires sm_86 or higher.

\n

Examples

\n
max.ftz.f16       h0,h1,h2;\nmax.f16x2         b0,b1,b2;\n// SIMD fp16 max with NaN\nmax.NaN.f16x2     b0,b1,b2;\n// scalar f16 max with xorsign.abs\nmax.xorsign.abs.f16 Rd, Ra, Rb;\nmax.bf16          h0, h1, h2;\n// SIMD bf16 max with NaN\nmax.NaN.bf16x2    b0, b1, b2;\n// SIMD bf16 max with xorsign.abs\nmax.xorsign.abs.bf16x2 Rd, Ra, Rb;\n
\n
\n
\n

Integer Arithmetic Instructions: max

\n\n\n

Find the maximum of two values.

\n

Syntax

\n
max.atype         d, a, b;\nmax{.relu}.btype  d, a, b;\n\n.atype = { .u16, .u32, .u64,\n           .u16x2, .s16, .s64 };\n.btype = { .s16x2, .s32 };\n
\n
\n

Description

\n

Store the maximum of a and b in d.

\n

For .u16x2, .s16x2 instruction types, forms input vectors from half-word values of the source\noperands. Half-word operands are then processed in parallel to produce the .u16x2, .s16x2 result\nin the destination.

\n

Operands d, a and b have the same type as the instruction type. For instruction types\n.u16x2, .s16x2, operands d, a and b have type .b32.

\n

Semantics

\n
if (type == u16x2 || type == s16x2) {\n    iA[0] = a[0:15];\n    iA[1] = a[16:31];\n    iB[0] = b[0:15];\n    iB[1] = b[16:31];\n    for (i = 0; i < 2; i++) {\n         d[i] = (iA[i] > iB[i]) ? iA[i] : iB[i];\n    }\n} else {\n    d = (a > b) ? a : b; // Integer (signed and unsigned)\n}\n
\n
\n

Notes

\n

Signed and unsigned comparisons differ: operands are compared according to the signedness of the\ninstruction type.

\n
\n
Saturation modifier:

max.relu.{s16x2, s32} clamps the result to 0 if negative.

\n
\n
\n
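
For example, max.relu.s32 applied to -5 and -2 selects -2 and then clamps the result to 0, whereas plain max.s32 would return -2.
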

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

max.u16x2, max{.relu}.s16x2 and max.relu.s32 introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

max.u16x2, max{.relu}.s16x2 and max.relu.s32 require sm_90 or higher.

\n

Examples

\n
max.u32  d,a,b;\nmax.s32  q,q,0;\nmax.relu.s16x2 t,t,u;\n
\n
\n
", "tooltip": "=====Floating Point Instructions: max\n\n\n\nFind the maximum of two values.\n\nSyntax\n\nmax{.ftz}{.NaN}{.xorsign.abs}.f32 d, a, b;\n\nmax.f64 d, a, b;\n\nDescription\n\nStore the maximum of a and b in d.\n\nIf .NaN modifier is specified, the result is canonical NaN if either of the inputs is\n\nNaN.\n\nIf .abs modifier is specified, the magnitude of destination operand d is the maximum of\n\nabsolute values of both the input arguments.\n\nIf...\n\n=====Half Precision Floating Point Instructions: max\n\n\n\nFind the maximum of two values.\n\nSyntax\n\nmax{.ftz}{.NaN}{.xorsign.abs}.f16 d, a, b;\n\nmax{.ftz}{.NaN}{.xorsign.abs}.f16x2 d, a, b;\n\nmax{.NaN}{.xorsign.abs}.bf16 d, a, b;\n\nmax{.NaN}{.xorsign.abs}.bf16x2 d, a, b;\n\nDescription\n\nStore the maximum of a and b in d.\n\nFor .f16x2 and .bf16x2 instruction types, input vectors are formed with half-word values\n\nfrom source operands. Half-word o...\n\n=====Integer Arithmetic Instructions: max\n\n\n\nFind the maximum of two values.\n\nSyntax\n\nmax.atype d, a, b;\n\nmax{.relu}.btype d, a, b;\n\n.atype = { .u16, .u32, .u64,\n\n .u16x2, .s16, .s64 };\n\n.btype = { .s16x2, .s32 };\n\nDescription\n\nStore the maximum of a and b in d.\n\nFor .u16x2, .s16x2 instruction types, forms input vectors by half word values from source\n\noperands. Half-word operands are then processed in parallel to produce .u16x2, .s... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max" }; case "maxclusterrank": return { "html": "For more information, visit maxclusterrank .

Cluster Dimension Directives: .maxclusterrank

\n\n\n

Declare the maximum number of CTAs that can be part of the cluster.

\n

Syntax

\n
.maxclusterrank n\n
\n
\n

Description

\n

Declare the maximum number of thread blocks (CTAs) allowed to be part of the cluster.

\n

Semantics

\n

The product of the number of CTAs in each cluster dimension specified in any invocation of the kernel is\nrequired to be less than or equal to that specified in this directive. Otherwise the invocation will result\nin a runtime error or kernel launch failure.

\n

The .maxclusterrank directive cannot be used in conjunction with the .reqnctapercluster directive.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
.entry foo .maxclusterrank 8         { . . . }\n
\n
\n
", "tooltip": "Declare the maximum number of CTAs that can be part of the cluster.\n\nSyntax\n\n.maxclusterrank n\n\nDescription\n\nDeclare the maximum number of thread blocks (CTAs) allowed to be part of the cluster.\n\nSemantics\n\nProduct of the number of CTAs in each cluster dimension specified in any invocation of the kernel is\n\nrequired to be less or equal to that specified in this directive. Otherwise invocation will result\n\nin a runtime error or kernel launch failure.\n\nThe .maxclusterrank directive cannot be used in conjunction with the .reqnctapercluster directive.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.entry foo ..maxclusterrank 8 { . . . }\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-maxclusterrank" }; case "maxnreg": return { "html": "For more information, visit maxnreg .

Performance-Tuning Directives: .maxnreg

\n\n\n

Maximum number of registers that can be allocated per thread.

\n

Syntax

\n
.maxnreg n\n
\n
\n

Description

\n

Declare the maximum number of registers per thread in a CTA.

\n

Semantics

\n

The compiler guarantees that this limit will not be exceeded. The actual number of registers used\nmay be less; for example, the backend may be able to compile to fewer registers, or the maximum\nnumber of registers may be further constrained by .maxntid and .maxctapersm.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.3.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
.entry foo .maxnreg 16 { ... }  // max regs per thread = 16\n
\n
\n
", "tooltip": "Maximum number of registers that can be allocated per thread.\n\nSyntax\n\n.maxnreg n\n\nDescription\n\nDeclare the maximum number of registers per thread in a CTA.\n\nSemantics\n\nThe compiler guarantees that this limit will not be exceeded. The actual number of registers used\n\nmay be less; for example, the backend may be able to compile to fewer registers, or the maximum\n\nnumber of registers may be further constrained by .maxntid and .maxctapersm.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.entry foo .maxnreg 16 { ... } // max regs per thread = 16\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-maxnreg" }; case "maxntid": return { "html": "For more information, visit maxntid .

Performance-Tuning Directives: .maxntid

\n\n\n

Maximum number of threads in the thread block (CTA).

\n

Syntax

\n
.maxntid nx\n.maxntid nx, ny\n.maxntid nx, ny, nz\n
\n
\n

Description

\n

Declare the maximum number of threads in the thread block (CTA). This maximum is specified by giving\nthe maximum extent of each dimension of the 1D, 2D, or 3D CTA. The maximum number of threads is the\nproduct of the maximum extent in each dimension.

\n

Semantics

\n

The maximum number of threads in the thread block, computed as the product of the maximum extent\nspecified for each dimension, is guaranteed not to be exceeded in any invocation of the kernel in\nwhich this directive appears. Exceeding the maximum number of threads results in a runtime error or\nkernel launch failure.

\n

Note that this directive guarantees that the total number of threads does not exceed the maximum,\nbut does not guarantee that the limit in any particular dimension is not exceeded.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.3.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
.entry foo .maxntid 256       { ... }  // max threads = 256\n.entry bar .maxntid 16,16,4   { ... }  // max threads = 1024\n
\n
\n
", "tooltip": "Maximum number of threads in the thread block (CTA).\n\nSyntax\n\n.maxntid nx\n\n.maxntid nx, ny\n\n.maxntid nx, ny, nz\n\nDescription\n\nDeclare the maximum number of threads in the thread block (CTA). This maximum is specified by giving\n\nthe maximum extent of each dimension of the 1D, 2D, or 3D CTA.\u00a0 The maximum number of threads is the\n\nproduct of the maximum extent in each dimension.\n\nSemantics\n\nThe maximum number of threads in the thread block, computed as the product of the maximum extent\n\nspecified for each dimension, is guaranteed not to be exceeded in any invocation of the kernel in\n\nwhich this directive appears. Exceeding the maximum number of threads results in a runtime error or\n\nkernel launch failure.\n\nNote that this directive guarantees that the total number of threads does not exceed the maximum,\n\nbut does not guarantee that the limit in any particular dimension is not exceeded.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.entry foo .maxntid 256 { ... } // max threads = 256\n\n.entry bar .maxntid 16,16,4 { ... } // max threads = 1024\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-maxntid" }; case "mbarrier": return { "html": "For more information, visit mbarrier , mbarrier.arrive , mbarrier.arrive_drop , mbarrier.complete_tx , mbarrier.expect_tx , mbarrier.init , mbarrier.inval , mbarrier.pending_count , mbarrier.test_wait/mbarrier.try_wait .

Parallel Synchronization and Communication Instructions: mbarrier

\n\n\n
An mbarrier object is a barrier created in shared memory that supports:

    \n
  • Synchronizing any subset of threads within a CTA

  • \n
  • One-way synchronization of threads across CTAs of a cluster. As noted in mbarrier support with\nshared memory, threads can\nperform only arrive operations but not *_wait on an mbarrier located in shared::cluster\nspace.

  • \n
  • Waiting for completion of asynchronous memory operations initiated by a thread and making them\nvisible to other threads.

  • \n
\n

An mbarrier object is an opaque object in memory which can be initialized and invalidated using:

\n
    \n
  • mbarrier.init

  • \n
  • mbarrier.inval

  • \n
\n

Operations supported on mbarrier objects are:

\n
    \n
  • mbarrier.expect_tx

  • \n
  • mbarrier.complete_tx

  • \n
  • mbarrier.arrive

  • \n
  • mbarrier.arrive_drop

  • \n
  • mbarrier.test_wait

  • \n
  • mbarrier.try_wait

  • \n
  • mbarrier.pending_count

  • \n
  • cp.async.mbarrier.arrive

  • \n
\n

Performing any mbarrier operation except mbarrier.init on an uninitialized mbarrier object\nresults in undefined behavior.

\n

Unlike bar{.cta}/barrier{.cta} instructions which can access a limited number of barriers\nper CTA, mbarrier objects are user-defined and are limited only by the total shared memory size\navailable.

\n

mbarrier operations enable threads to perform useful work after the arrival at the mbarrier and\nbefore waiting for the mbarrier to complete.

\n
\n
9.7.12.15.1. Size and alignment of mbarrier object
\n

An mbarrier object is an opaque object with the following type and alignment requirements:

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Type

Alignment (bytes)

Memory space

.b64

8

.shared

\n
\n
\n
9.7.12.15.2. Contents of the mbarrier object
\n

An opaque mbarrier object keeps track of the following information:

\n
    \n
  • Current phase of the mbarrier object

  • \n
  • Count of pending arrivals for the current phase of the mbarrier object

  • \n
  • Count of expected arrivals for the next phase of the mbarrier object

  • \n
  • Count of pending asynchronous memory operations (or transactions) tracked by the current phase of\nthe mbarrier object. This is also referred to as tx-count.

  • \n
\n

An mbarrier object progresses through a sequence of phases where each phase is defined by threads\nperforming an expected number of arrive-on\noperations.

\n

The valid range of each of the counts is as shown below:

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Count name

Minimum value

Maximum value

Expected arrival count

1

2^20 - 1

Pending arrival count

0

2^20 - 1

tx-count

-(2^20 - 1)

2^20 - 1

\n
\n
\n
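
That is, at most 1,048,575 (2^20 - 1) arrivals or outstanding transaction units can be tracked in a single phase; tx-count may also take negative values, down to -(2^20 - 1).
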
9.7.12.15.3. Lifecycle of the mbarrier object
\n

The mbarrier object must be initialized prior to use.

\n

An mbarrier object is used to synchronize threads and asynchronous memory operations.

\n

An mbarrier object may be used to perform a sequence of such synchronizations.

\n

An mbarrier object must be invalidated to repurpose its memory.

\n
\n
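
A hedged end-to-end sketch of this lifecycle (the barrier name shBar, thread count N, and register names are illustrative placeholders; see the per-instruction sections below for exact requirements):

\n
.shared .b64 shBar;\n.reg .b64 state;\n.reg .pred done;\n\nmbarrier.init.shared::cta.b64      [shBar], N;   // initialize before first use\n// ... one synchronization phase ...\nmbarrier.arrive.shared::cta.b64    state, [shBar];\nwaitLoop:\nmbarrier.test_wait.shared::cta.b64 done, [shBar], state;\n@!done bra waitLoop;\n// ... once the mbarrier is no longer needed ...\nmbarrier.inval.shared::cta.b64     [shBar];      // invalidate before repurposing the memory\n
\n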
\n
9.7.12.15.4. Phase of the mbarrier object
\n

The phase of an mbarrier object is the number of times the mbarrier object has been used to\nsynchronize threads and cp.async\noperations. In each phase {0, 1, 2, \u2026}, threads perform in program order:

\n
    \n
  • arrive-on\noperations to complete the current phase and

  • \n
  • test_wait / try_wait operations to check for the completion of the current phase.

  • \n
\n

An mbarrier object is automatically reinitialized upon completion of the current phase for\nimmediate use in the next phase. At any point in time, the current phase is incomplete and all prior phases are complete.

\n

For each phase of the mbarrier object, at least one test_wait or try_wait operation must be\nperformed which returns True for waitComplete before an arrive-on operation\nin the subsequent phase.

\n
\n
\n
9.7.12.15.5. Tracking asynchronous operations by the mbarrier object
\n

Starting with the Hopper architecture (sm_9x), the mbarrier object supports a new count, called\ntx-count, which is used for tracking the completion of asynchronous memory operations or\ntransactions. tx-count tracks the number of asynchronous transactions, in units specified by the\nasynchronous memory operation, that are outstanding and not yet complete.

\n

The tx-count of an mbarrier object must be set to the total amount of asynchronous memory\noperations, in units as specified by the asynchronous operations, to be tracked by the current\nphase. Upon completion of each of the asynchronous operations, the complete-tx\noperation will be performed on the mbarrier object and thus progress the mbarrier towards the\ncompletion of the current phase.

\n
\n
9.7.12.15.5.1. expect-tx operation
\n

The expect-tx operation, with an expectCount argument, increases the tx-count of an\nmbarrier object by the value specified by expectCount. This makes the current phase of the\nmbarrier object expect and track the completion of additional asynchronous transactions.

\n
\n
\n
9.7.12.15.5.2. complete-tx operation
\n

The complete-tx operation, with a completeCount argument, on an mbarrier object consists of the following:

\n
\n
mbarrier signaling

Signals the completion of asynchronous transactions that were tracked by the current phase. As a\nresult of this, tx-count is decremented by completeCount.

\n
\n
mbarrier potentially completing the current phase

If the current phase has been completed then the mbarrier transitions to the next phase. Refer to\nPhase Completion of the mbarrier object\nfor details on phase completion requirements and phase transition process.

\n
\n
\n
\n
\n
\n
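
A hedged sketch of the two operations together (the barrier name shBar and the 4096 transaction units are illustrative placeholders; in practice the complete-tx operation is typically performed by the asynchronous operation itself upon completion):

\n
mbarrier.expect_tx.shared::cta.b64   [shBar], 4096;  // expect-tx: tx-count += 4096\n// ... 4096 units of tracked asynchronous transfers complete ...\nmbarrier.complete_tx.shared::cta.b64 [shBar], 4096;  // complete-tx: tx-count -= 4096\n
\n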
9.7.12.15.6. Phase Completion of the mbarrier object
\n

The requirements for completion of the current phase are described below. Upon completion of the\ncurrent phase, the phase transitions to the subsequent phase as described below.

\n
\n
Current phase completion requirements

An mbarrier object completes the current phase when all of the following conditions are met:

\n
    \n
  • The count of the pending arrivals has reached zero.

  • \n
  • The tx-count has reached zero.

  • \n
\n
\n
Phase transition

When an mbarrier object completes the current phase, the following actions are performed\natomically:

\n
    \n
  • The mbarrier object transitions to the next phase.

  • \n
  • The pending arrival count is reinitialized to the expected arrival count.

  • \n
\n
\n
\n
\n
\n
9.7.12.15.7. Arrive-on operation on mbarrier object
\n

An arrive-on operation, with an optional count argument, on an mbarrier object consists of the\nfollowing two steps:

\n
    \n
  • mbarrier signalling:

    \n

    Signals the arrival of the executing thread OR completion of the cp.async instruction which\nsignals the arrive-on operation initiated by the executing thread on the mbarrier object. As a\nresult of this, the pending arrival count is decremented by count. If the count argument is\nnot specified, then it defaults to 1.

    \n
  • \n
  • mbarrier potentially completing the current phase:

    \n

    If the current phase has been completed then the mbarrier transitions to the next phase. Refer to\nPhase Completion of the mbarrier object\nfor details on phase completion requirements and phase transition process.

    \n
  • \n
\n
\n
\n
9.7.12.15.8. mbarrier support with shared memory
\n

The following table summarizes the support of various mbarrier operations on mbarrier objects\nlocated at different shared memory locations:

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

mbarrier operations

.shared::cta

.shared::cluster

mbarrier.arrive

Supported

Supported, cannot return result

mbarrier.expect_tx

Supported

Supported

mbarrier.complete_tx

Supported

Supported

Other mbarrier operations

Supported

Not supported

\n
\n
\n
9.7.12.15.9. Parallel Synchronization and Communication Instructions: mbarrier.init
\n

mbarrier.init

\n

Initialize the mbarrier object.

\n

Syntax

\n
mbarrier.init{.shared{::cta}}.b64 [addr], count;\n
\n
\n

Description

\n

mbarrier.init initializes the mbarrier object at the location specified by the address operand\naddr with the unsigned 32-bit integer count. The value of operand count must be in the range\nas specified in Contents of the mbarrier object.

\n

Initialization of the mbarrier object involves:

\n
    \n
  • Initializing the current phase to 0.

  • \n
  • Initializing the expected arrival count to count.

  • \n
  • Initializing the pending arrival count to count.

  • \n
  • Initializing the tx-count to 0.

  • \n
\n

If no state space is specified then Generic Addressing is\nused. If the address specified by addr does not fall within the address window of\n.shared::cta state space then the behavior is undefined.

\n

Supported addressing modes for operand addr are as described in Addresses as Operands. Alignment for operand addr is as described in the Size\nand alignment of mbarrier object.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

Support for sub-qualifier ::cta on .shared introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Examples

\n
.shared .b64 shMem, shMem2;\n.reg    .b64 addr;\n.reg    .b32 %r1;\n\ncvta.shared.u64          addr, shMem2;\nmbarrier.init.b64        [addr],   %r1;\nbar.cta.sync             0;\n// ... other mbarrier operations on addr\n\nmbarrier.init.shared::cta.b64 [shMem], 12;\nbar.sync                 0;\n// ... other mbarrier operations on shMem\n
\n
\n
\n
\n
9.7.12.15.10. Parallel Synchronization and Communication Instructions: mbarrier.inval
\n

mbarrier.inval

\n

Invalidates the mbarrier object.

\n

Syntax

\n
mbarrier.inval{.shared{::cta}}.b64 [addr];\n
\n
\n

Description

\n

mbarrier.inval invalidates the mbarrier object at the location specified by the address\noperand addr.

\n

An mbarrier object must be invalidated before using its memory location for any other purpose.

\n

Performing any mbarrier operation except mbarrier.init on an invalidated mbarrier object\nresults in undefined behavior.

\n

If no state space is specified then Generic Addressing is\nused. If the address specified by addr does not fall within the address window of\n.shared::cta state space then the behavior is undefined.

\n

Supported addressing modes for operand addr are as described in Addresses as Operands. Alignment for operand addr is as described in the Size\nand alignment of mbarrier object.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

Support for sub-qualifier ::cta on .shared introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Examples

\n
.shared .b64 shmem;\n.reg    .b64 addr;\n.reg    .b32 %r1;\n.reg    .pred t0;\n\n// Example 1 :\nbar.sync                      0;\n@t0 mbarrier.init.b64     [addr], %r1;\n// ... other mbarrier operations on addr\nbar.sync                      0;\n@t0 mbarrier.inval.b64    [addr];\n\n\n// Example 2 :\nbar.cta.sync                  0;\nmbarrier.init.shared.b64           [shmem], 12;\n// ... other mbarrier operations on shmem\nbar.cta.sync                  0;\n@t0 mbarrier.inval.shared.b64      [shmem];\n\n// shmem can be reused here for unrelated use :\nbar.cta.sync                  0;\nst.shared.b64                      [shmem], ...;\n\n// shmem can be re-initialized as mbarrier object :\nbar.cta.sync                  0;\n@t0 mbarrier.init.shared.b64       [shmem], 24;\n// ... other mbarrier operations on shmem\nbar.cta.sync                  0;\n@t0 mbarrier.inval.shared::cta.b64 [shmem];\n
\n
\n
\n
\n
9.7.12.15.11. Parallel Synchronization and Communication Instructions: mbarrier.expect_tx
\n

mbarrier.expect_tx

\n

Performs the expect-tx operation on the mbarrier object.

\n

Syntax

\n
mbarrier.expect_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem   = { .relaxed }\n.scope = { .cta, .cluster }\n.space = { .shared{::cta}, .shared::cluster }\n
\n
\n

Description

\n

A thread executing mbarrier.expect_tx performs an expect-tx\noperation on the mbarrier object at the location specified by the address operand addr. The\n32-bit unsigned integer operand txCount specifies the expectCount argument to the\nexpect-tx operation.

\n

If no state space is specified then Generic Addressing is\nused. If the address specified by addr does not fall within the address window of\n.shared::cta or .shared::cluster state space then the behavior is undefined.

\n

Supported addressing modes for operand addr are as described in Addresses as Operands. Alignment for operand addr is as described in the Size\nand alignment of mbarrier object.

\n

This operation does not provide any memory ordering semantics and thus is a relaxed operation.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
mbarrier.expect_tx.b64                       [addr], 32;\nmbarrier.expect_tx.relaxed.cta.shared.b64    [mbarObj1], 512;\nmbarrier.expect_tx.relaxed.cta.shared.b64    [mbarObj2], 512;\n
\n
\n
\n
\n
9.7.12.15.12. Parallel Synchronization and Communication Instructions: mbarrier.complete_tx
\n

mbarrier.complete_tx

\n

Performs the complete-tx\noperation on the mbarrier object.

\n

Syntax

\n
mbarrier.complete_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem   = { .relaxed }\n.scope = { .cta, .cluster }\n.space = { .shared{::cta}, .shared::cluster }\n
\n
\n

Description

\n

A thread executing mbarrier.complete_tx performs a complete-tx\noperation on the mbarrier object at the location specified by the address operand addr. The\n32-bit unsigned integer operand txCount specifies the completeCount argument to the\ncomplete-tx operation.

\n

mbarrier.complete_tx does not involve any asynchronous memory operations and only simulates the\ncompletion of an asynchronous memory operation and its side effect of signaling to the mbarrier\nobject.

\n

If no state space is specified then Generic Addressing is\nused. If the address specified by addr does not fall within the address window of\n.shared::cta or .shared::cluster state space then the behavior is undefined.

\n

Supported addressing modes for operand addr are as described in Addresses as Operands. Alignment for operand addr is as described in the Size\nand alignment of mbarrier object.

\n

This operation does not provide any memory ordering semantics and thus is a relaxed operation.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
mbarrier.complete_tx.b64             [addr],     32;\nmbarrier.complete_tx.shared.b64      [mbarObj1], 512;\nmbarrier.complete_tx.relaxed.cta.b64 [addr2],    32;\n
\n
\n
\n
\n
9.7.12.15.13. Parallel Synchronization and Communication Instructions: mbarrier.arrive
\n

mbarrier.arrive

\n

Performs arrive-on operation on the\nmbarrier object.

\n

Syntax

\n
mbarrier.arrive{.sem}{.scope}{.shared{::cta}}.b64           state, [addr]{, count};\nmbarrier.arrive{.sem}{.scope}{.shared::cluster}.b64         _, [addr] {,count}\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], txCount;\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared::cluster}.b64   _, [addr], txCount;\nmbarrier.arrive.noComplete{.sem}{.cta}{.shared{::cta}}.b64  state, [addr], count;\n\n.sem   = { .release }\n.scope = { .cta, .cluster }\n
\n
\n

Description

\n

A thread executing mbarrier.arrive performs an arrive-on operation\non the mbarrier object at the location specified by the address operand addr. The 32-bit\nunsigned integer operand count specifies the count argument to the arrive-on\noperation.

\n

If no state space is specified then Generic Addressing is\nused. If the address specified by addr does not fall within the address window of\n.shared::cta state space then the behavior is undefined.

\n

Supported addressing modes for operand addr are as described in Addresses as Operands. Alignment for operand addr is as described in the Size\nand alignment of mbarrier object.

\n

The optional qualifier .expect_tx specifies that an expect-tx\noperation is performed prior to the arrive-on\noperation. The 32-bit unsigned integer operand txCount specifies the expectCount argument to\nthe expect-tx operation. When both qualifiers .arrive and .expect_tx are specified, then\nthe count argument of the arrive-on operation is assumed to be 1.

\n

An mbarrier.arrive operation with the .noComplete qualifier must not cause the mbarrier to\ncomplete its current phase, otherwise the behavior is undefined.

\n

The value of the operand count must be in the range as specified in Contents of the mbarrier\nobject.

\n

Note: for sm_8x, when the argument count is specified, the modifier .noComplete is\nrequired.

\n

mbarrier.arrive operation on an mbarrier object located in .shared::cta returns an opaque\n64-bit register capturing the phase of the mbarrier object prior to the arrive-on operation in the\ndestination operand state. Contents of the state operand are implementation\nspecific. Optionally, sink symbol '_' can be used for the state argument.

\n

mbarrier.arrive operation on an mbarrier object located in .shared::cluster but not in\n.shared::cta cannot return a value. Sink symbol \u2018_\u2019 is mandatory for the destination operand for\nsuch cases.

\n

The optional .sem qualifier specifies a memory synchronizing effect as described in the Memory\nConsistency Model. If the .sem qualifier is absent,\n.release is assumed by default.

\n

The optional .scope qualifier indicates the set of threads that directly observe the memory\nsynchronizing effect of this operation, as described in the Memory Consistency Model. If the .scope qualifier is not specified then it\ndefaults to .cta. In contrast, the .shared::<scope> indicates the state space where the\nmbarrier resides.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

Support for sink symbol \u2018_\u2019 as the destination operand is introduced in PTX ISA version 7.1.

\n

Support for sub-qualifier ::cta on .shared introduced in PTX ISA version 7.8.

\n

Support for count argument without the modifier .noComplete introduced in PTX ISA version\n7.8.

\n

Support for sub-qualifier ::cluster introduced in PTX ISA version 8.0.

\n

Support for qualifier .expect_tx is introduced in PTX ISA version 8.0.

\n

Support for .scope and .sem qualifiers introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Support for count argument without the modifier .noComplete requires sm_90 or higher.

\n

Qualifier .expect_tx requires sm_90 or higher.

\n

Sub-qualifier ::cluster requires sm_90 or higher.

\n

Support for .cluster scope requires sm_90 or higher.

\n

Examples

\n
.reg .b32 cnt, remoteAddr32, remoteCTAId, addr32;\n.reg .b64 %r<3>, addr, remoteAddr64;\n.shared .b64 shMem, shMem2;\n\ncvta.shared.u64            addr, shMem2;\nmov.b32                    addr32, shMem2;\nmapa.shared::cluster.u32   remoteAddr32, addr32, remoteCTAId;\nmapa.u64                   remoteAddr64, addr,   remoteCTAId;\n\ncvta.shared.u64          addr, shMem2;\n\nmbarrier.arrive.shared.b64                       %r0, [shMem];\nmbarrier.arrive.shared::cta.b64                  %r0, [shMem2];\nmbarrier.arrive.release.cta.shared::cluster.b64  _, [remoteAddr32];\nmbarrier.arrive.release.cluster.b64              _, [remoteAddr64], cnt;\nmbarrier.arrive.expect_tx.release.cluster.b64    _, [remoteAddr64], tx_count;\nmbarrier.arrive.noComplete.b64                   %r1, [addr], 2;\nmbarrier.arrive.b64                              %r2, [addr], cnt;\n
\n
\n
\n
\n
9.7.12.15.14. Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop
\n

mbarrier.arrive_drop

\n

Decrements the expected count of the mbarrier object and performs arrive-on operation.

\n

Syntax

\n
mbarrier.arrive_drop{.sem}{.scope}{.shared{::cta}}.b64 state,           [addr]{, count};\nmbarrier.arrive_drop{.sem}{.scope}{.shared::cluster}.b64           _,   [addr] {,count};\nmbarrier.arrive_drop.expect_tx{.shared{::cta}}{.sem}{.scope}.b64 state, [addr], tx_count;\nmbarrier.arrive_drop.expect_tx{.shared::cluster}{.sem}{.scope}.b64   _, [addr], tx_count;\nmbarrier.arrive_drop.noComplete{.sem}{.cta}{.shared{::cta}}.b64 state,  [addr], count;\n\n.sem   = { .release }\n.scope = { .cta, .cluster }\n
\n
\n

Description

\n

A thread executing mbarrier.arrive_drop on the mbarrier object at the location specified by\nthe address operand addr performs the following steps:

\n
    \n
  • Decrements the expected arrival count of the mbarrier object by the value specified by the\n32-bit integer operand count. If count operand is not specified, it defaults to 1.

  • \n
  • Performs an arrive-on operation on the\nmbarrier object. The operand count specifies the count argument to the arrive-on\noperation.

  • \n
\n

The decrement in the expected arrival count of the mbarrier object applies to all the\nsubsequent phases of the mbarrier object.

\n

If no state space is specified then Generic Addressing is\nused. If the address specified by addr does not fall within the address window of\n.shared::cta or .shared::cluster state space then the behavior is undefined.

\n

Supported addressing modes for operand addr are as described in Addresses as Operands. Alignment for operand addr is as described in the Size\nand alignment of mbarrier object.

\n

The optional qualifier .expect_tx specifies that an expect-tx\noperation is performed prior to the arrive-on\noperation. The 32-bit unsigned integer operand txCount specifies the expectCount argument to\nthe expect-tx operation. When both qualifiers .arrive and .expect_tx are specified, then\nthe count argument of the arrive-on operation is assumed to be 1.

\n

mbarrier.arrive_drop operation forms the release pattern as described in the Memory\nConsistency Model and synchronizes with the acquire patterns.

\n

The optional .scope qualifier indicates the set of threads that an mbarrier.arrive_drop\ninstruction can directly synchronize. If the .scope qualifier is not specified then it defaults\nto .cta. In contrast, the .shared::<scope> indicates the state space where the mbarrier\nresides.

\n

An mbarrier.arrive_drop with the .noComplete qualifier must not complete the mbarrier,\notherwise the behavior is undefined.

\n

The value of the operand count must be in the range as specified in Contents of the mbarrier\nobject.

\n

Note: for sm_8x, when the argument count is specified, the modifier .noComplete is\nrequired.

\n

A thread that wants to either exit or opt out of participating in the arrive-on operation can use\nmbarrier.arrive_drop to drop itself from the mbarrier.

\n

mbarrier.arrive_drop operation on an mbarrier object located in .shared::cta returns an\nopaque 64-bit register capturing the phase of the mbarrier object prior to the arrive-on\noperation\nin the destination operand state. Contents of the returned state are implementation\nspecific. Optionally, sink symbol '_' can be used for the state argument.

\n

mbarrier.arrive_drop operation on an mbarrier object located in .shared::cluster but not\nin .shared::cta cannot return a value. Sink symbol \u2018_\u2019 is mandatory for the destination operand\nfor such cases.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

Support for sub-qualifier ::cta on .shared introduced in PTX ISA version 7.8.

\n

Support for count argument without the modifier .noComplete introduced in PTX ISA version\n7.8.

\n

Support for qualifier .expect_tx is introduced in PTX ISA version 8.0.

\n

Support for sub-qualifier ::cluster introduced in PTX ISA version 8.0.

\n

Support for .scope and .sem qualifiers introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Support for count argument without the modifier .noComplete requires sm_90 or higher.

\n

Qualifier .expect_tx requires sm_90 or higher.

\n

Sub-qualifier ::cluster requires sm_90 or higher.

\n

Support for .cluster scope requires sm_90 or higher.

\n

Examples

\n
.reg .b32 cnt;\n.reg .b64 %r1;\n.shared .b64 shMem;\n\n// Example 1\n@p mbarrier.arrive_drop.shared.b64 _, [shMem];\n@p exit;\n@p2 mbarrier.arrive_drop.noComplete.shared.b64 _, [shMem], %a;\n@p2 exit;\n..\n@!p mbarrier.arrive.shared.b64   %r1, [shMem];\n@!p mbarrier.test_wait.shared.b64  q, [shMem], %r1;\n\n// Example 2\nmbarrier.arrive_drop.shared::cluster.b64 _, [addr];\nmbarrier.arrive_drop.shared::cta.release.cluster.b64     _, [addr], cnt;\n\n// Example 3\nmbarrier.arrive_drop.expect_tx.shared::cta.release.cta.b64 state, [addr], tx_count;\n
\n
\n
\n
\n
9.7.12.15.15. Parallel Synchronization and Communication Instructions: cp.async.mbarrier.arrive
\n

cp.async.mbarrier.arrive

\n

Makes the mbarrier object track all prior cp.async operations initiated by the\nexecuting thread.

\n

Syntax

\n
cp.async.mbarrier.arrive{.noinc}{.shared{::cta}}.b64 [addr];\n
\n
\n

Description

\n

Causes an arrive-on operation to be\ntriggered by the system on the mbarrier object upon the completion of all prior cp.async operations initiated by the\nexecuting thread. The mbarrier object is at the location specified by the operand addr. The\narrive-on operation is\nasynchronous to execution of cp.async.mbarrier.arrive.

\n

When .noinc modifier is not specified, the pending count of the mbarrier object is incremented\nby 1 prior to the asynchronous arrive-on operation. This\nresults in a zero-net change for the pending count from the asynchronous arrive-on operation\nduring the current phase. The pending count of the mbarrier object after the increment should not\nexceed the limit as mentioned in Contents of the mbarrier object. Otherwise,\nthe behavior is undefined.

\n

When the .noinc modifier is specified, the increment to the pending count of the mbarrier\nobject is not performed. Hence the decrement of the pending count done by the asynchronous\narrive-on operation must be\naccounted for in the initialization of the mbarrier object.

\n

If no state space is specified then Generic Addressing is\nused. If the address specified by addr does not fall within the address window of\n.shared::cta state space then the behavior is undefined.

\n

Supported addressing modes for operand addr are as described in Addresses as Operands. Alignment for operand addr is as described in the Size\nand alignment of mbarrier object.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

Support for sub-qualifier ::cta on .shared introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Examples

\n
// Example 1: no .noinc\nmbarrier.init.shared.b64 [shMem], threadCount;\n....\ncp.async.ca.shared.global [shard1], [gbl1], 4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n....\n// Absence of .noinc accounts for arrive-on from completion of prior cp.async operations.\n// So mbarrier.init must only account for arrive-on from mbarrier.arrive.\ncp.async.mbarrier.arrive.shared.b64 [shMem];\n....\nmbarrier.arrive.shared.b64 state, [shMem];\n\nwaitLoop:\nmbarrier.test_wait.shared.b64 p, [shMem], state;\n@!p bra waitLoop;\n\n\n\n// Example 2: with .noinc\n\n// Tracks arrive-on from mbarrier.arrive and cp.async.mbarrier.arrive.\n\n// All threads participating in the mbarrier perform cp.async\nmov.b32 copyOperationCnt, threadCount;\n\n// 3 arrive-on operations will be triggered per-thread\nmul.lo.u32 copyArrivalCnt, copyOperationCnt, 3;\n\nadd.u32 totalCount, threadCount, copyArrivalCnt;\n\nmbarrier.init.shared.b64 [shMem], totalCount;\n....\ncp.async.ca.shared.global [shard1], [gbl1], 4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n...\n// Presence of .noinc requires mbarrier initialization to have accounted for arrive-on from cp.async\ncp.async.mbarrier.arrive.noinc.shared.b64 [shMem]; // 1st instance\n....\ncp.async.ca.shared.global [shard3], [gbl3], 4;\ncp.async.ca.shared.global [shard4], [gbl4], 16;\ncp.async.mbarrier.arrive.noinc.shared::cta.b64 [shMem]; // 2nd instance\n....\ncp.async.ca.shared.global [shard5], [gbl5], 4;\ncp.async.cg.shared.global [shard6], [gbl6], 16;\ncp.async.mbarrier.arrive.noinc.shared.b64 [shMem]; // 3rd and last instance\n....\nmbarrier.arrive.shared.b64 state, [shMem];\n\nwaitLoop:\nmbarrier.test_wait.shared.b64 p, [shMem], state;\n@!p bra waitLoop;\n
\n
\n
\n
\n
9.7.12.15.16. Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait
\n

mbarrier.test_wait/mbarrier.try_wait

\n

Checks whether the mbarrier object has completed the phase.

\n

Syntax

\n
mbarrier.test_wait{.sem}{.scope}{.shared{::cta}}.b64        waitComplete, [addr], state;\nmbarrier.test_wait.parity{.sem}{.scope}{.shared{::cta}}.b64 waitComplete, [addr], phaseParity;\n\nmbarrier.try_wait{.sem}{.scope}{.shared{::cta}}.b64         waitComplete, [addr], state\n                                                               {, suspendTimeHint};\n\nmbarrier.try_wait{.sem}{.scope}.parity{.shared{::cta}}.b64  waitComplete, [addr], phaseParity\n                                                               {, suspendTimeHint};\n\n.sem   = { .acquire }\n.scope = { .cta, .cluster }\n
\n
\n

Description

\n

The test_wait and try_wait operations test for the completion of the current or the immediately\npreceding phase of an mbarrier object at the location specified by the operand addr.

\n

mbarrier.test_wait is a non-blocking instruction which tests for the completion of the phase.

\n

mbarrier.try_wait is a potentially blocking instruction which tests for the completion of the\nphase. If the phase is not complete, the executing thread may be suspended. A suspended thread resumes\nexecution when the specified phase completes, or before the phase completes once a\nsystem-dependent time limit expires. The optional 32-bit unsigned integer operand suspendTimeHint\nspecifies a time limit, in nanoseconds, that may be used instead of the\nsystem-dependent limit.

\n

mbarrier.test_wait and mbarrier.try_wait test for completion of the phase:

\n
    \n
  • Specified by the operand state, which was returned by an mbarrier.arrive instruction on\nthe same mbarrier object during the current or the immediately preceding phase. Or

  • \n
  • Indicated by the operand phaseParity, which is the integer parity of either the current phase\nor the immediately preceding phase of the mbarrier object.

  • \n
\n

The .parity variant of the instructions test for the completion of the phase indicated by the\noperand phaseParity, which is the integer parity of either the current phase or the immediately\npreceding phase of the mbarrier object. An even phase has integer parity 0 and an odd phase has\ninteger parity of 1. So the valid values of phaseParity operand are 0 and 1.

\n
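
For example, a thread testing completion of phase 4 passes phaseParity = 0, while a thread testing phase 5 passes phaseParity = 1 (Example 2 below computes this with and.b32 parArg, i, 1).
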

Note: the use of the .parity variants of the instructions requires tracking the phase of an\nmbarrier object throughout its lifetime.

\n

The test_wait and try_wait operations are valid only for:

\n
    \n
  • the current incomplete phase, for which waitComplete returns False.

  • \n
  • the immediately preceding phase, for which waitComplete returns True.

  • \n
\n

If no state space is specified then Generic Addressing is\nused. If the address specified by addr does not fall within the address window of\n.shared::cta state space then the behavior is undefined.

\n

Supported addressing modes for operand addr are as described in Addresses as Operands. Alignment for operand addr is as described in the Size\nand alignment of mbarrier object.

\n

When mbarrier.test_wait and mbarrier.try_wait operations return True, they form the\nacquire pattern as described in the Memory Consistency Model.

\n

The optional .scope qualifier indicates the set of threads that the mbarrier.test_wait and\nmbarrier.try_wait instructions can directly synchronize. If the .scope qualifier is not\nspecified then it defaults to .cta. In contrast, the .shared::<scope> indicates the state\nspace where the mbarrier resides.

\n

The following ordering of memory operations holds for the executing thread when\nmbarrier.test_wait or mbarrier.try_wait returns True:

\n
    \n
All memory accesses (except async operations) requested prior, in program\norder, to mbarrier.arrive during the completed phase by the participating threads of the CTA\nare performed and are visible to the executing thread.

  2. \n
  3. All cp.async operations\nrequested prior, in program order, to cp.async.mbarrier.arrive during the completed phase by\nthe participating threads of the CTA are performed and made visible to the executing thread.

  4. \n
  5. All cp.async.bulk asynchronous operations using the same mbarrier object requested prior,\nin program order, to mbarrier.arrive during the completed phase by the participating threads\nof the CTA are performed and made visible to the executing thread.

  6. \n
All memory accesses requested after the mbarrier.test_wait or mbarrier.try_wait, in\nprogram order, are not performed and are not visible to memory accesses performed prior to\nmbarrier.arrive, in program order, by other threads participating in the mbarrier.

  8. \n
  9. There is no ordering and visibility guarantee for memory accesses requested by the thread after\nmbarrier.arrive and prior to mbarrier.test_wait, in program order.

  10. \n
\n

PTX ISA Notes

\n

mbarrier.test_wait introduced in PTX ISA version 7.0.

\n

Modifier .parity is introduced in PTX ISA version 7.1.

\n

mbarrier.try_wait introduced in PTX ISA version 7.8.

\n

Support for sub-qualifier ::cta on .shared introduced in PTX ISA version 7.8.

\n

Support for .scope and .sem qualifiers introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

mbarrier.test_wait requires sm_80 or higher.

\n

mbarrier.try_wait requires sm_90 or higher.

\n

Support for .cluster scope requires sm_90 or higher.

\n

Examples

\n
// Example 1a, thread synchronization with test_wait:\n\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nmbarrier.arrive.shared.b64  %r1, [shMem]; // N threads executing mbarrier.arrive\n\n// computation not requiring mbarrier synchronization...\n\nwaitLoop:\nmbarrier.test_wait.shared.b64    complete, [shMem], %r1;\n@!complete nanosleep.u32 20;\n@!complete bra waitLoop;\n\n// Example 1b, thread synchronization with try_wait :\n\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nmbarrier.arrive.shared.b64  %r1, [shMem]; // N threads executing mbarrier.arrive\n\n// computation not requiring mbarrier synchronization...\n\nwaitLoop:\nmbarrier.try_wait.shared.b64    complete, [shMem], %r1;\n@!complete bra waitLoop;\n\n\n// Example 2, thread synchronization using phase parity :\n\n.reg .b32 i, parArg;\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmov.b32 i, 0;\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nloopStart :                           // One phase per loop iteration\n    ...\n    mbarrier.arrive.shared.b64  %r1, [shMem]; // N threads\n    ...\n    and.b32 parArg, i, 1;\n    waitLoop:\n    mbarrier.test_wait.parity.shared.b64  complete, [shMem], parArg;\n    @!complete nanosleep.u32 20;\n    @!complete bra waitLoop;\n    ...\n    add.u32 i, i, 1;\n    setp.lt.u32 p, i, IterMax;\n@p bra loopStart;\n\n\n// Example 3, Asynchronous copy completion waiting :\n\n.reg .b64 state;\n.shared .b64 shMem2;\n.shared .b64 shard1, shard2;\n.global .b64 gbl1, gbl2;\n\nmbarrier.init.shared.b64 [shMem2], threadCount;\n...\ncp.async.ca.shared.global [shard1], [gbl1], 4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n\n// Absence of .noinc accounts for arrive-on from prior cp.async operation\ncp.async.mbarrier.arrive.shared.b64 [shMem2];\n...\nmbarrier.arrive.shared.b64 state, [shMem2];\n\nwaitLoop:\nmbarrier.test_wait.shared::cta.b64 p, [shMem2], state;\n@!p bra waitLoop;\n\n// Example 4, Synchronizing the CTA0 threads with cluster threads\n.reg .b64 %r1, addr, remAddr;\n.shared .b64 shMem;\n\ncvta.shared.u64          addr, shMem;\nmapa.u64                 remAddr, addr, 0;     // CTA0\u2019s shMem instance\n\n// One thread from CTA0 executing the below initialization operation\n@p0 mbarrier.init.shared::cta.b64 [shMem], N;  // N = no of cluster threads\n\nbarrier.cluster.arrive;\nbarrier.cluster.wait;\n\n// Entire cluster executing the below arrive operation\nmbarrier.arrive.release.cluster.b64              _, [remAddr];\n\n// computation not requiring mbarrier synchronization ...\n\n// Only CTA0 threads executing the below wait operation\nwaitLoop:\nmbarrier.try_wait.parity.acquire.cluster.shared::cta.b64  complete, [shMem], 0;\n@!complete bra waitLoop;\n
\n
\n
\n
\n
9.7.12.15.17. Parallel Synchronization and Communication Instructions: mbarrier.pending_count
\n

mbarrier.pending_count

\n

Query the pending arrival count from the opaque mbarrier state.

\n

Syntax

\n
mbarrier.pending_count.b64 count, state;\n
\n
\n

Description

\n

The pending count can be queried from the opaque mbarrier state using mbarrier.pending_count.

\n

The state operand is a 64-bit register that must be the result of a prior\nmbarrier.arrive.noComplete or mbarrier.arrive_drop.noComplete instruction. Otherwise, the\nbehavior is undefined.

\n

The destination register count is a 32-bit unsigned integer representing the pending count of\nthe mbarrier object prior to the arrive-on operation from\nwhich the state register was obtained.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Examples

\n
.reg .b32 %r1;\n.reg .b64 state;\n.shared .b64 shMem;\n\nmbarrier.arrive.noComplete.b64 state, [shMem], 1;\nmbarrier.pending_count.b64 %r1, state;\n
\n
\n
\n
\n

Parallel Synchronization and Communication Instructions: mbarrier.arrive

\n\n\n

Performs arrive-on operation on the\nmbarrier object.

\n

Syntax

\n
mbarrier.arrive{.sem}{.scope}{.shared{::cta}}.b64           state, [addr]{, count};\nmbarrier.arrive{.sem}{.scope}{.shared::cluster}.b64         _, [addr] {,count}\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], txCount;\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared::cluster}.b64   _, [addr], txCount;\nmbarrier.arrive.noComplete{.sem}{.cta}{.shared{::cta}}.b64  state, [addr], count;\n\n.sem   = { .release }\n.scope = { .cta, .cluster }\n
\n
\n

Description

\n

A thread executing mbarrier.arrive performs an arrive-on operation\non the mbarrier object at the location specified by the address operand addr. The 32-bit\nunsigned integer operand count specifies the count argument to the arrive-on\noperation.

\n

If no state space is specified then Generic Addressing is\nused. If the address specified by addr does not fall within the address window of\n.shared::cta state space then the behavior is undefined.

\n

Supported addressing modes for operand addr are as described in Addresses as Operands. Alignment for operand addr is as described in the Size\nand alignment of mbarrier object.

\n

The optional qualifier .expect_tx specifies that an expect-tx\noperation is performed prior to the arrive-on\noperation. The 32-bit unsigned integer operand txCount specifies the expectCount argument to\nthe expect-tx operation. When both qualifiers .arrive and .expect_tx are specified, then\nthe count argument of the arrive-on operation is assumed to be 1.

\n

An mbarrier.arrive operation with the .noComplete qualifier must not cause the mbarrier to\ncomplete its current phase, otherwise the behavior is undefined.

\n

The value of the operand count must be in the range as specified in Contents of the mbarrier\nobject.

\n

Note: for sm_8x, when the argument count is specified, the modifier .noComplete is\nrequired.

\n

mbarrier.arrive operation on an mbarrier object located in .shared::cta returns an opaque\n64-bit register capturing the phase of the mbarrier object prior to the arrive-on operation in the\ndestination operand state. Contents of the state operand are implementation\nspecific. Optionally, sink symbol '_' can be used for the state argument.

\n

mbarrier.arrive operation on an mbarrier object located in .shared::cluster but not in\n.shared::cta cannot return a value. Sink symbol \u2018_\u2019 is mandatory for the destination operand for\nsuch cases.

\n

The optional .sem qualifier specifies a memory synchronizing effect as described in the Memory\nConsistency Model. If the .sem qualifier is absent,\n.release is assumed by default.

\n

The optional .scope qualifier indicates the set of threads that directly observe the memory\nsynchronizing effect of this operation, as described in the Memory Consistency Model. If the .scope qualifier is not specified then it\ndefaults to .cta. In contrast, the .shared::<scope> indicates the state space where the\nmbarrier resides.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

Support for sink symbol \u2018_\u2019 as the destination operand is introduced in PTX ISA version 7.1.

\n

Support for sub-qualifier ::cta on .shared introduced in PTX ISA version 7.8.

\n

Support for count argument without the modifier .noComplete introduced in PTX ISA version\n7.8.

\n

Support for sub-qualifier ::cluster introduced in PTX ISA version 8.0.

\n

Support for qualifier .expect_tx is introduced in PTX ISA version 8.0.

\n

Support for .scope and .sem qualifiers introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Support for count argument without the modifier .noComplete requires sm_90 or higher.

\n

Qualifier .expect_tx requires sm_90 or higher.

\n

Sub-qualifier ::cluster requires sm_90 or higher.

\n

Support for .cluster scope requires sm_90 or higher.

\n

Examples

\n
.reg .b32 cnt, remoteAddr32, remoteCTAId, addr32;\n.reg .b64 %r<3>, addr, remoteAddr64;\n.shared .b64 shMem, shMem2;\n\ncvta.shared.u64            addr, shMem2;\nmov.b32                    addr32, shMem2;\nmapa.shared::cluster.u32   remoteAddr32, addr32, remoteCTAId;\nmapa.u64                   remoteAddr64, addr,   remoteCTAId;\n\ncvta.shared.u64          addr, shMem2;\n\nmbarrier.arrive.shared.b64                       %r0, [shMem];\nmbarrier.arrive.shared::cta.b64                  %r0, [shMem2];\nmbarrier.arrive.release.cta.shared::cluster.b64  _, [remoteAddr32];\nmbarrier.arrive.release.cluster.b64              _, [remoteAddr64], cnt;\nmbarrier.arrive.expect_tx.release.cluster.b64    _, [remoteAddr64], tx_count;\nmbarrier.arrive.noComplete.b64                   %r1, [addr], 2;\nmbarrier.arrive.b64                              %r2, [addr], cnt;\n
\n
\n
\n

Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop

\n\n\n

Decrements the expected count of the mbarrier object and performs arrive-on operation.

\n

Syntax

\n
mbarrier.arrive_drop{.sem}{.scope}{.shared{::cta}}.b64           state, [addr]{, count};\nmbarrier.arrive_drop{.sem}{.scope}{.shared::cluster}.b64             _, [addr] {,count};\nmbarrier.arrive_drop.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], tx_count;\nmbarrier.arrive_drop.expect_tx{.sem}{.scope}{.shared::cluster}.b64   _, [addr], tx_count;\nmbarrier.arrive_drop.noComplete{.sem}{.cta}{.shared{::cta}}.b64  state, [addr], count;\n\n.sem   = { .release }\n.scope = { .cta, .cluster }\n
\n
\n

Description

\n

A thread executing mbarrier.arrive_drop on the mbarrier object at the location specified by\nthe address operand addr performs the following steps:

\n
    \n
  • Decrements the expected arrival count of the mbarrier object by the value specified by the\n32-bit integer operand count. If count operand is not specified, it defaults to 1.

  • \n
  • Performs an arrive-on operation on the\nmbarrier object. The operand count specifies the count argument to the arrive-on\noperation.

  • \n
\n

The decrement of the expected arrival count of the mbarrier object remains in effect for all\nsubsequent phases of the mbarrier object.

\n
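
A minimal hedged sketch (mbar is a hypothetical mbarrier variable): if the expected arrival count was N before the drop, the current phase and every later phase expect N-1 arrivals.

\n
// arrive and permanently drop this thread from the mbarrier\nmbarrier.arrive_drop.shared::cta.b64 _, [mbar], 1;\n
\n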

If no state space is specified then Generic Addressing is\nused. If the address specified by addr does not fall within the address window of\n.shared::cta or .shared::cluster state space then the behavior is undefined.

\n

Supported addressing modes for operand addr are as described in Addresses as Operands. Alignment for operand addr is as described in the Size\nand alignment of mbarrier object.

\n

The optional qualifier .expect_tx specifies that an expect-tx\noperation is performed prior to the arrive-on\noperation. The 32-bit unsigned integer operand tx_count specifies the expectCount argument to\nthe expect-tx operation. When the qualifier .expect_tx is specified together with\nmbarrier.arrive_drop, the count argument of the arrive-on operation is assumed to be 1.

\n

mbarrier.arrive_drop operation forms the release pattern as described in the Memory\nConsistency Model and synchronizes with the acquire patterns.

\n

The optional .scope qualifier indicates the set of threads that an mbarrier.arrive_drop\ninstruction can directly synchronize. If the .scope qualifier is not specified then it defaults\nto .cta. In contrast, the .shared::<scope> indicates the state space where the mbarrier\nresides.

\n

A mbarrier.arrive_drop with .noComplete qualifier must not complete the mbarrier,\notherwise the behavior is undefined.

\n

The value of the operand count must be in the range as specified in Contents of the mbarrier\nobject.

\n

Note: for sm_8x, when the argument count is specified, the modifier .noComplete is\nrequired.

\n

A thread that wants to either exit or opt out of participating in the arrive-on operation can use\nmbarrier.arrive_drop to drop itself from the mbarrier.

\n

mbarrier.arrive_drop operation on an mbarrier object located in .shared::cta returns an\nopaque 64-bit register capturing the phase of the mbarrier object prior to the arrive-on\noperation\nin the destination operand state. Contents of the returned state are implementation\nspecific. Optionally, sink symbol '_' can be used for the state argument.

\n

mbarrier.arrive_drop operation on an mbarrier object located in .shared::cluster but not\nin .shared::cta cannot return a value. Sink symbol \u2018_\u2019 is mandatory for the destination operand\nfor such cases.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

Support for sub-qualifier ::cta on .shared introduced in PTX ISA version 7.8.

\n

Support for count argument without the modifier .noComplete introduced in PTX ISA version\n7.8.

\n

Support for qualifier .expect_tx is introduced in PTX ISA version 8.0.

\n

Support for sub-qualifier ::cluster introduced in PTX ISA version 8.0.

\n

Support for .scope and .sem qualifiers introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Support for count argument without the modifier .noComplete requires sm_90 or higher.

\n

Qualifier .expect_tx requires sm_90 or higher.

\n

Sub-qualifier ::cluster requires sm_90 or higher.

\n

Support for .cluster scope requires sm_90 or higher.

\n

Examples

\n
.reg .b32 cnt;\n.reg .b64 %r1;\n.shared .b64 shMem;\n\n// Example 1\n@p mbarrier.arrive_drop.shared.b64 _, [shMem];\n@p exit;\n@p2 mbarrier.arrive_drop.noComplete.shared.b64 _, [shMem], %a;\n@p2 exit;\n..\n@!p mbarrier.arrive.shared.b64   %r1, [shMem];\n@!p mbarrier.test_wait.shared.b64  q, [shMem], %r1;\n\n// Example 2\nmbarrier.arrive_drop.shared::cluster.b64 _, [addr];\nmbarrier.arrive_drop.shared::cta.release.cluster.b64     _, [addr], cnt;\n\n// Example 3\nmbarrier.arrive_drop.expect_tx.shared::cta.release.cta.b64 state, [addr], tx_count;\n
\n
\n
\n

Parallel Synchronization and Communication Instructions: mbarrier.complete_tx

\n\n\n

Performs a complete-tx\noperation on the mbarrier object.

\n

Syntax

\n
mbarrier.complete_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem   = { .relaxed }\n.scope = { .cta, .cluster }\n.space = { .shared{::cta}, .shared::cluster }\n
\n
\n

Description

\n

A thread executing mbarrier.complete_tx performs a complete-tx\noperation on the mbarrier object at the location specified by the address operand addr. The\n32-bit unsigned integer operand txCount specifies the completeCount argument to the\ncomplete-tx operation.

\n

mbarrier.complete_tx does not involve any asynchronous memory operations and only simulates the\ncompletion of an asynchronous memory operation and its side effect of signaling to the mbarrier\nobject.

\n
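
As a hedged illustration (mbar hypothetical), a thread can emulate the signaling side effect of a 64-byte asynchronous transaction without actually performing one:

\n
// simulate completion of 64 bytes of asynchronous transactions\nmbarrier.complete_tx.relaxed.cta.shared::cta.b64 [mbar], 64;\n
\n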

If no state space is specified then Generic Addressing is\nused. If the address specified by addr does not fall within the address window of\n.shared::cta or .shared::cluster state space then the behavior is undefined.

\n

Supported addressing modes for operand addr are as described in Addresses as Operands. Alignment for operand addr is as described in the Size\nand alignment of mbarrier object.

\n

This operation does not provide any memory ordering semantics and thus is a relaxed operation.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
mbarrier.complete_tx.b64             [addr],     32;\nmbarrier.complete_tx.shared.b64      [mbarObj1], 512;\nmbarrier.complete_tx.relaxed.cta.b64 [addr2],    32;\n
\n
\n
\n

Parallel Synchronization and Communication Instructions: mbarrier.expect_tx

\n\n\n

Performs an expect-tx operation on the mbarrier object.

\n

Syntax

\n
mbarrier.expect_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem   = { .relaxed }\n.scope = { .cta, .cluster }\n.space = { .shared{::cta}, .shared::cluster }\n
\n
\n

Description

\n

A thread executing mbarrier.expect_tx performs an expect-tx\noperation on the mbarrier object at the location specified by the address operand addr. The\n32-bit unsigned integer operand txCount specifies the expectCount argument to the\nexpect-tx operation.

\n
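
A minimal hedged sketch (mbar hypothetical): the current phase is made to wait for 256 additional bytes of transactions, to be balanced later by complete-tx operations such as those performed by asynchronous copies or an explicit mbarrier.complete_tx.

\n
// current phase now expects 256 more bytes of transactions\nmbarrier.expect_tx.relaxed.cta.shared::cta.b64 [mbar], 256;\n
\n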

If no state space is specified then Generic Addressing is\nused. If the address specified by addr does not fall within the address window of\n.shared::cta or .shared::cluster state space then the behavior is undefined.

\n

Supported addressing modes for operand addr are as described in Addresses as Operands. Alignment for operand addr is as described in the Size\nand alignment of mbarrier object.

\n

This operation does not provide any memory ordering semantics and thus is a relaxed operation.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
mbarrier.expect_tx.b64                       [addr], 32;\nmbarrier.expect_tx.relaxed.cta.shared.b64    [mbarObj1], 512;\nmbarrier.expect_tx.relaxed.cta.shared.b64    [mbarObj2], 512;\n
\n
\n
\n

Parallel Synchronization and Communication Instructions: mbarrier.init

\n\n\n

Initialize the mbarrier object.

\n

Syntax

\n
mbarrier.init{.shared{::cta}}.b64 [addr], count;\n
\n
\n

Description

\n

mbarrier.init initializes the mbarrier object at the location specified by the address operand\naddr with the unsigned 32-bit integer count. The value of operand count must be in the range\nas specified in Contents of the mbarrier object.

\n

Initialization of the mbarrier object involves:

\n
    \n
  • Initializing the current phase to 0.

  • \n
  • Initializing the expected arrival count to count.

  • \n
  • Initializing the pending arrival count to count.

  • \n
  • Initializing the tx-count to 0.

  • \n
\n
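
As a hedged sketch (mbar hypothetical): after initialization in .shared::cta, the fence.mbarrier_init variant described under membar/fence can be used to make the initialization visible at cluster scope.

\n
.shared .b64 mbar;\n\nmbarrier.init.shared::cta.b64        [mbar], 128;\nfence.mbarrier_init.release.cluster; // make the init visible cluster-wide\n
\n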

If no state space is specified then Generic Addressing is\nused. If the address specified by addr does not fall within the address window of\n.shared::cta state space then the behavior is undefined.

\n

Supported addressing modes for operand addr are as described in Addresses as Operands. Alignment for operand addr is as described in the Size\nand alignment of mbarrier object.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

Support for sub-qualifier ::cta on .shared introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Examples

\n
.shared .b64 shMem, shMem2;\n.reg    .b64 addr;\n.reg    .b32 %r1;\n\ncvta.shared.u64          addr, shMem2;\nmbarrier.init.b64        [addr],   %r1;\nbar.cta.sync             0;\n// ... other mbarrier operations on addr\n\nmbarrier.init.shared::cta.b64 [shMem], 12;\nbar.sync                 0;\n// ... other mbarrier operations on shMem\n
\n
\n
\n

Parallel Synchronization and Communication Instructions: mbarrier.inval

\n\n\n

Invalidates the mbarrier object.

\n

Syntax

\n
mbarrier.inval{.shared{::cta}}.b64 [addr];\n
\n
\n

Description

\n

mbarrier.inval invalidates the mbarrier object at the location specified by the address\noperand addr.

\n

An mbarrier object must be invalidated before using its memory location for any other purpose.

\n

Performing any mbarrier operation except mbarrier.init on an invalidated mbarrier object\nresults in undefined behavior.

\n

If no state space is specified then Generic Addressing is\nused. If the address specified by addr does not fall within the address window of\n.shared::cta state space then the behavior is undefined.

\n

Supported addressing modes for operand addr are as described in Addresses as Operands. Alignment for operand addr is as described in the Size\nand alignment of mbarrier object.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

Support for sub-qualifier ::cta on .shared introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Examples

\n
.shared .b64 shmem;\n.reg    .b64 addr;\n.reg    .b32 %r1;\n.reg    .pred t0;\n\n// Example 1 :\nbar.sync                      0;\n@t0 mbarrier.init.b64     [addr], %r1;\n// ... other mbarrier operations on addr\nbar.sync                      0;\n@t0 mbarrier.inval.b64    [addr];\n\n\n// Example 2 :\nbar.cta.sync                  0;\nmbarrier.init.shared.b64           [shmem], 12;\n// ... other mbarrier operations on shmem\nbar.cta.sync                  0;\n@t0 mbarrier.inval.shared.b64      [shmem];\n\n// shmem can be reused here for unrelated use :\nbar.cta.sync                  0;\nst.shared.b64                      [shmem], ...;\n\n// shmem can be re-initialized as mbarrier object :\nbar.cta.sync                  0;\n@t0 mbarrier.init.shared.b64       [shmem], 24;\n// ... other mbarrier operations on shmem\nbar.cta.sync                  0;\n@t0 mbarrier.inval.shared::cta.b64 [shmem];\n
\n
\n
\n

Parallel Synchronization and Communication Instructions: mbarrier.pending_count

\n\n\n

Query the pending arrival count from the opaque mbarrier state.

\n

Syntax

\n
mbarrier.pending_count.b64 count, state;\n
\n
\n

Description

\n

The pending count can be queried from the opaque mbarrier state using mbarrier.pending_count.

\n

The state operand is a 64-bit register that must be the result of a prior\nmbarrier.arrive.noComplete or mbarrier.arrive_drop.noComplete instruction. Otherwise, the\nbehavior is undefined.

\n

The destination register count is a 32-bit unsigned integer representing the pending count of\nthe mbarrier object prior to the arrive-on operation from\nwhich the state register was obtained.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Examples

\n
.reg .b32 %r1;\n.reg .b64 state;\n.shared .b64 shMem;\n\nmbarrier.arrive.noComplete.b64 state, [shMem], 1;\nmbarrier.pending_count.b64 %r1, state;\n
\n
\n
\n

Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait

\n\n\n

Checks whether the mbarrier object has completed the phase.

\n

Syntax

\n
mbarrier.test_wait{.sem}{.scope}{.shared{::cta}}.b64        waitComplete, [addr], state;\nmbarrier.test_wait.parity{.sem}{.scope}{.shared{::cta}}.b64 waitComplete, [addr], phaseParity;\n\nmbarrier.try_wait{.sem}{.scope}{.shared{::cta}}.b64         waitComplete, [addr], state\n                                                               {, suspendTimeHint};\n\nmbarrier.try_wait.parity{.sem}{.scope}{.shared{::cta}}.b64  waitComplete, [addr], phaseParity\n                                                               {, suspendTimeHint};\n\n.sem   = { .acquire }\n.scope = { .cta, .cluster }\n
\n
\n

Description

\n

The test_wait and try_wait operations test for the completion of the current or the immediately\npreceding phase of an mbarrier object at the location specified by the operand addr.

\n

mbarrier.test_wait is a non-blocking instruction which tests for the completion of the phase.

\n

mbarrier.try_wait is a potentially blocking instruction which tests for the completion of the\nphase. If the phase is not complete, the executing thread may be suspended. A suspended thread\nresumes execution when the specified phase completes, or earlier, after a system-dependent time\nlimit. The optional 32-bit unsigned integer operand suspendTimeHint specifies a time limit, in\nnanoseconds, that may be used instead of the system-dependent limit.

\n
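
A minimal polling sketch, assuming a hypothetical mbarrier mbar and a state value st returned by a prior mbarrier.arrive; the hint bounds each suspension to roughly 1000 nanoseconds.

\n
.shared .b64 mbar;\n.reg .b64 st;     // state from a prior mbarrier.arrive\n.reg .pred done;\n\nwaitLoop:\nmbarrier.try_wait.shared::cta.b64 done, [mbar], st, 1000; // 1000ns hint\n@!done bra waitLoop;\n
\n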

mbarrier.test_wait and mbarrier.try_wait test for completion of the phase:

\n
    \n
  • Specified by the operand state, which was returned by an mbarrier.arrive instruction on\nthe same mbarrier object during the current or the immediately preceding phase. Or

  • \n
  • Indicated by the operand phaseParity, which is the integer parity of either the current phase\nor the immediately preceding phase of the mbarrier object.

  • \n
\n

The .parity variant of the instructions test for the completion of the phase indicated by the\noperand phaseParity, which is the integer parity of either the current phase or the immediately\npreceding phase of the mbarrier object. An even phase has integer parity 0 and an odd phase has\ninteger parity 1, so the valid values of the phaseParity operand are 0 and 1.

\n

Note: the use of the .parity variants of the instructions requires tracking the phase of an\nmbarrier object throughout its lifetime.

\n

The test_wait and try_wait operations are valid only for:

\n
    \n
  • the current incomplete phase, for which waitComplete returns False.

  • \n
  • the immediately preceding phase, for which waitComplete returns True.

  • \n
\n

If no state space is specified then Generic Addressing is\nused. If the address specified by addr does not fall within the address window of\n.shared::cta state space then the behavior is undefined.

\n

Supported addressing modes for operand addr are as described in Addresses as Operands. Alignment for operand addr is as described in the Size\nand alignment of mbarrier object.

\n

When mbarrier.test_wait and mbarrier.try_wait operations return True, they form the\nacquire pattern as described in the Memory Consistency Model.

\n

The optional .scope qualifier indicates the set of threads that the mbarrier.test_wait and\nmbarrier.try_wait instructions can directly synchronize. If the .scope qualifier is not\nspecified then it defaults to .cta. In contrast, the .shared::<scope> indicates the state\nspace where the mbarrier resides.

\n

The following ordering of memory operations holds for the executing thread when\nmbarrier.test_wait or mbarrier.try_wait returns True:

\n
    \n
  1. All memory accesses (except async operations ) requested prior, in program\norder, to mbarrier.arrive during the completed phase by the participating threads of the CTA\nare performed and are visible to the executing thread.

  2. \n
  3. All cp.async operations\nrequested prior, in program order, to cp.async.mbarrier.arrive during the completed phase by\nthe participating threads of the CTA are performed and made visible to the executing thread.

  4. \n
  5. All cp.async.bulk asynchronous operations using the same mbarrier object requested prior,\nin program order, to mbarrier.arrive during the completed phase by the participating threads\nof the CTA are performed and made visible to the executing thread.

  6. \n
  7. All memory accesses requested after the mbarrier.test_wait or mbarrier.try_wait, in\nprogram order, are not performed and not visible to memory accesses performed prior to\nmbarrier.arrive, in program order, by other threads participating in the mbarrier.

  8. \n
  9. There is no ordering and visibility guarantee for memory accesses requested by the thread after\nmbarrier.arrive and prior to mbarrier.test_wait, in program order.

  10. \n
\n

PTX ISA Notes

\n

mbarrier.test_wait introduced in PTX ISA version 7.0.

\n

Modifier .parity is introduced in PTX ISA version 7.1.

\n

mbarrier.try_wait introduced in PTX ISA version 7.8.

\n

Support for sub-qualifier ::cta on .shared introduced in PTX ISA version 7.8.

\n

Support for .scope and .sem qualifiers introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

mbarrier.test_wait requires sm_80 or higher.

\n

mbarrier.try_wait requires sm_90 or higher.

\n

Support for .cluster scope requires sm_90 or higher.

\n

Examples

\n
// Example 1a, thread synchronization with test_wait:\n\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nmbarrier.arrive.shared.b64  %r1, [shMem]; // N threads executing mbarrier.arrive\n\n// computation not requiring mbarrier synchronization...\n\nwaitLoop:\nmbarrier.test_wait.shared.b64    complete, [shMem], %r1;\n@!complete nanosleep.u32 20;\n@!complete bra waitLoop;\n\n// Example 1b, thread synchronization with try_wait:\n\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nmbarrier.arrive.shared.b64  %r1, [shMem]; // N threads executing mbarrier.arrive\n\n// computation not requiring mbarrier synchronization...\n\nwaitLoop:\nmbarrier.try_wait.shared.b64    complete, [shMem], %r1;\n@!complete bra waitLoop;\n\n\n// Example 2, thread synchronization using phase parity:\n\n.reg .b32 i, parArg;\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmov.b32 i, 0;\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nloopStart:                            // One phase per loop iteration\n    ...\n    mbarrier.arrive.shared.b64  %r1, [shMem]; // N threads\n    ...\n    and.b32 parArg, i, 1;\n    waitLoop:\n    mbarrier.test_wait.parity.shared.b64  complete, [shMem], parArg;\n    @!complete nanosleep.u32 20;\n    @!complete bra waitLoop;\n    ...\n    add.u32 i, i, 1;\n    setp.lt.u32 p, i, IterMax;\n@p bra loopStart;\n\n\n// Example 3, Asynchronous copy completion waiting:\n\n.reg .b64 state;\n.shared .b64 shMem2;\n.shared .b64 shard1, shard2;\n.global .b64 gbl1, gbl2;\n\nmbarrier.init.shared.b64 [shMem2], threadCount;\n...\ncp.async.ca.shared.global [shard1], [gbl1], 4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n\n// Absence of .noinc accounts for arrive-on from prior cp.async operation\ncp.async.mbarrier.arrive.shared.b64 [shMem2];\n...\nmbarrier.arrive.shared.b64 state, [shMem2];\n\nwaitLoop:\nmbarrier.test_wait.shared::cta.b64 p, [shMem2], state;\n@!p bra waitLoop;\n\n// Example 4, Synchronizing the CTA0 threads with cluster threads\n.reg .b64 %r1, addr, remAddr;\n.shared .b64 shMem;\n\ncvta.shared.u64          addr, shMem;\nmapa.u64                 remAddr, addr, 0;     // CTA0\u2019s shMem instance\n\n// One thread from CTA0 executing the below initialization operation\n@p0 mbarrier.init.shared::cta.b64 [shMem], N;  // N = no of cluster threads\n\nbarrier.cluster.arrive;\nbarrier.cluster.wait;\n\n// Entire cluster executing the below arrive operation\nmbarrier.arrive.release.cluster.b64              _, [remAddr];\n\n// computation not requiring mbarrier synchronization ...\n\n// Only CTA0 threads executing the below wait operation\nwaitLoop:\nmbarrier.try_wait.parity.acquire.cluster.shared::cta.b64  complete, [shMem], 0;\n@!complete bra waitLoop;\n
\n
\n
", "tooltip": "=====Parallel Synchronization and Communication Instructions: mbarrier\n\n\n\nSynchronizing any subset of threads within a CTA\n\nOne-way synchronization of threads across CTAs of a cluster. As noted in mbarrier support with\n\nshared memory, threads can\n\nperform only arrive operations but not *_wait on an mbarrier located in shared::cluster\n\nspace.\n\nWaiting for completion of asynchronous memory operations initiated by a thread and making them\n\nvisible to other threads.\n\nAn mbarrier o...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.arrive\n\n\n\nPerforms arrive-on operation on the\n\nmbarrier object.\n\nSyntax\n\nmbarrier.arrive{.sem}{.scope}{.shared{::cta}}.b64 state, [addr]{, count};\n\nmbarrier.arrive{.sem}{.scope}{.shared::cluster}.b64 _, [addr] {,count}\n\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], txCount;\n\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared::cluster}.b64 _, [addr], txCount;\n\nmbarrier....\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop\n\n\n\nDecrements the expected count of the mbarrier object and performs arrive-on operation.\n\nSyntax\n\nmbarrier.arrive_drop{.sem}{.scope}{.shared{::cta}}.b64 state, [addr]{, count};\n\nmbarrier.arrive_drop{.sem}{.scope}{.shared::cluster}.b64 _, [addr] {,count};\n\nmbarrier.arrive_drop.expect_tx{.shared{::cta}}{.sem}{.scope}.b64 state, [addr], tx_count;\n\nmbarrier.arrive_drop.expect_tx{.shared...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.complete_tx\n\n\n\nPerfoms complete-tx\n\noperation on the mbarrier object.\n\nSyntax\n\nmbarrier.complete_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem = { .relaxed }\n\n.scope = { .cta, .cluster }\n\n.space = { .shared{::cta}, .shared::cluster }\n\nDescription\n\nA thread executing mbarrier.complete_tx performs a complete-tx\n\noperation on the mbarrier object at the location specified by the address operand addr. The\n\n32-bit unsig...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.expect_tx\n\n\n\nPerfoms expect-tx operation on the mbarrier object.\n\nSyntax\n\nmbarrier.expect_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem = { .relaxed }\n\n.scope = { .cta, .cluster }\n\n.space = { .shared{::cta}, .shared::cluster }\n\nDescription\n\nA thread executing mbarrier.expect_tx performs an expect-tx\n\noperation on the mbarrier object at the location specified by the address operand addr. The\n\n32-bit unsigned int...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.init\n\n\n\nInitialize the mbarrier object.\n\nSyntax\n\nmbarrier.init{.shared{::cta}}.b64 [addr], count;\n\nDescription\n\nmbarrier.init initializes the mbarrier object at the location specified by the address operand\n\naddr with the unsigned 32-bit integer count. 
The value of operand count must be in the range\n\nas specified in Contents of the mbarrier object.\n\nInitialization of the mbarrier object involves :\n\nInitializing t...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.inval\n\n\n\nInvalidates the mbarrier object.\n\nSyntax\n\nmbarrier.inval{.shared{::cta}}.b64 [addr];\n\nDescription\n\nmbarrier.inval invalidates the mbarrier object at the location specified by the address\n\noperand addr.\n\nAn mbarrier object must be invalidated before using its memory location for any other purpose.\n\nPerforming any mbarrier operation except mbarrier.init on an invalidated mbarrier object\n\nresults in undefine...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.pending_count\n\n\n\nQuery the pending arrival count from the opaque mbarrier state.\n\nSyntax\n\nmbarrier.pending_count.b64 count, state;\n\nDescription\n\nThe pending count can be queried from the opaque mbarrier state using mbarrier.pending_count.\n\nThe state operand is a 64-bit register that must be the result of a prior\n\nmbarrier.arrive.noComplete or mbarrier.arrive_drop.noComplete instruction. Otherwise, the\n\nbehavior is undefi...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait\n\n\n\nChecks whether the mbarrie ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier" }; case "membar/fence": return { "html": "For more information, visit membar/fence .

Parallel Synchronization and Communication Instructions: membar/fence

\n\n\n

Enforce an ordering of memory operations.

\n

Syntax

\n
fence{.sem}.scope;\nfence.op_restrict.release.cluster;\nfence.proxy.proxykind;\nmembar.level;\nmembar.proxy.proxykind;\n\n.sem       = { .sc, .acq_rel };\n.scope     = { .cta, .cluster, .gpu, .sys };\n.level     = { .cta, .gl, .sys };\n.proxykind = { .alias, .async, .async.global, .async.shared::{cta, cluster} };\n.op_restrict = { .mbarrier_init };\n
\n
\n

Description

\n

The membar instruction guarantees that prior memory accesses requested by this thread (ld,\nst, atom and red instructions) are performed at the specified level, before later\nmemory operations requested by this thread following the membar instruction. The level\nqualifier specifies the set of threads that may observe the ordering effect of this operation.

\n

A memory read (e.g., by ld or atom) has been performed when the value read has been\ntransmitted from memory and cannot be modified by another thread at the indicated level. A memory\nwrite (e.g., by st, red or atom) has been performed when the value written has become\nvisible to other threads at the specified level, that is, when the previous value can no longer be\nread.

\n

The fence instruction establishes an ordering between memory accesses requested by this thread\n(ld, st, atom and red instructions) as described in the Memory Consistency Model. The scope qualifier specifies the set of threads that may\nobserve the ordering effect of this operation.

\n

fence.acq_rel is a light-weight fence that is sufficient for memory synchronization in most\nprograms. Instances of fence.acq_rel synchronize when combined with additional memory operations\nas described in acquire and release patterns in the Memory Consistency Model. If the optional .sem qualifier is absent, .acq_rel\nis assumed by default.

\n
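
A hedged sketch of the usual flag-passing pattern (data, flag and the registers are hypothetical): the two fences form the release and acquire halves described in the Memory Consistency Model.

\n
// producer thread\nst.global.u32             [data], %r1;  // payload\nfence.acq_rel.gpu;                      // release half\nst.relaxed.gpu.global.u32 [flag], 1;\n\n// consumer thread\nld.relaxed.gpu.global.u32 %r2, [flag];\nfence.acq_rel.gpu;                      // acquire half\nld.global.u32             %r3, [data];\n
\n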

fence.sc is a slower fence that can restore sequential consistency when used in sufficient\nplaces, at the cost of performance. Instances of fence.sc with sufficient scope always\nsynchronize by forming a total order per scope, determined at runtime. This total order can be\nconstrained further by other synchronization in the program.

\n

Qualifier .op_restrict restricts the class of prior memory operations for which the fence\ninstruction provides the memory ordering guarantees. When .op_restrict is .mbarrier_init,\nthe fence only applies to the prior mbarrier.init operations executed by the same thread on\nmbarrier objects in .shared::cta state space.

\n
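
For example (mbar hypothetical), such a fence is typically issued right after the initialization, so that only the prior mbarrier.init is ordered rather than all prior stores:

\n
mbarrier.init.shared::cta.b64        [mbar], 64;\nfence.mbarrier_init.release.cluster; // orders only the prior mbarrier.init\n
\n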

Qualifier .release indicates memory synchronization as described in the Memory Consistency\nModel.

\n

On sm_70 and higher membar is a synonym for fence.sc (see note 1 below), and the membar\nlevels cta, gl and sys are synonymous with the fence scopes cta, gpu and\nsys respectively.

\n

membar.proxy and fence.proxy instructions establish an ordering between memory accesses that\nmay happen through different proxies. The type of proxy is indicated using the .proxykind\nqualifier. Value .alias of the .proxykind qualifier refers to memory accesses performed\nusing virtually aliased addresses to the same memory location. Value .async of the\n.proxykind qualifier specifies that the memory ordering is established between the async proxy\nand the generic proxy. The memory ordering is limited only to the state space specified. If no state\nspace is specified, then the memory ordering applies on all state spaces.

\n
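
A hedged sketch (addrA and addrB are hypothetical virtual aliases of the same memory location): the alias fence orders the write through one alias with the subsequent read through the other.

\n
st.global.u32     [addrA], %r0; // write through one virtual alias\nfence.proxy.alias;              // order accesses across aliases\nld.global.u32     %r1, [addrB]; // read through the other alias\n
\n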

On sm_70 and higher, membar.proxy is a synonym for fence.proxy.

\n

Note 1: The semantics of fence.sc introduced with sm_70 is a superset of the semantics of\nmembar and the two are compatible; when executing on sm_70 or later architectures,\nmembar acquires the full semantics of fence.sc.

\n

PTX ISA Notes

\n

membar.{cta,gl} introduced in PTX ISA version 1.4.

\n

membar.sys introduced in PTX ISA version 2.0.

\n

fence introduced in PTX ISA version 6.0.

\n

membar.proxy and fence.proxy introduced in PTX ISA version 7.5.

\n

.cluster scope qualifier introduced in PTX ISA version 7.8.

\n

.op_restrict qualifier introduced in PTX ISA version 8.0.

\n

fence.proxy.async is introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

membar.{cta,gl} supported on all target architectures.

\n

membar.sys requires sm_20 or higher.

\n

fence requires sm_70 or higher.

\n

membar.proxy requires sm_60 or higher.

\n

fence.proxy requires sm_70 or higher.

\n

.cluster scope qualifier requires sm_90 or higher.

\n

.op_restrict qualifier requires sm_90 or higher.

\n

fence.proxy.async requires sm_90 or higher.

\n

Examples

\n
membar.gl;\nmembar.cta;\nmembar.sys;\nfence.sc;\nfence.sc.cluster;\nfence.proxy.alias;\nmembar.proxy.alias;\nfence.mbarrier_init.release.cluster;\nfence.proxy.async;\nfence.proxy.async.shared::cta;\nfence.proxy.async.shared::cluster;\nfence.proxy.async.global;\n
\n
\n
", "tooltip": "Enforce an ordering of memory operations.\n\nSyntax\n\nfence{.sem}.scope;\n\nfence.op_restrict.release.cluster;\n\nfence.proxy.proxykind;\n\nmembar.level;\n\nmembar.proxy.proxykind;\n\n.sem = { .sc, .acq_rel };\n\n.scope = { .cta, .cluster, .gpu, .sys };\n\n.level = { .cta, .gl, .sys };\n\n.proxykind = { .alias, .async, async.global, .async.shared::{cta, cluster} };\n\n.op_restrict = { .mbarrier_init };\n\nDescription\n\nThe membar instruction guarantees that prior memory accesses requested by this thread (ld,\n\nst, atom and red instructions) are performed at the specified level, before later\n\nmemory operations requested by this thread following the membar instruction. The level\n\nqualifier specifies the set of threads that may observe the ordering effect of this operation.\n\nA memory read (e.g., by ld or atom) has been performed when the value read has been\n\ntransmitted from memory and cannot be modified by another thread at the indicated level. A memory\n\nwrite (e.g., by st, red or atom) has been performed when the value written has become\n\nvisible to other threads at the specified level, that is, when the previous value can no longer be\n\nread.\n\nThe fence instruction establishes an ordering between memory accesses requested by this thread\n\n(ld, st, atom and red instructions) as described in the Memory Consistency Model. The scope qualifier specifies the set of threads that may\n\nobserve the ordering effect of this operation.\n\nfence.acq_rel is a light-weight fence that is sufficient for memory synchronization in most\n\nprograms. Instances of fence.acq_rel synchronize when combined with additional memory operations\n\nas described in acquire and release patterns in the Memory Consistency Model. If the optional .sem qualifier is absent, .acq_rel\n\nis assumed by default.\n\nfence.sc is a slower fence that can restore sequential consistency when used in sufficient\n\nplaces, at the cost of performance. Instances of fence.sc with sufficient scope always\n\nsynchronize by forming a total order per scope, determined at runtime. This total order can be\n\nconstrained further by other synchronization in the program.\n\nQualifier .op_restrict restricts the class of prior memory operations for which the fence\n\ninstruction provides the memory ordering guarantees. When .op_restrict is .mbarrier_init,\n\nthe fence only applies to the prior mbarrier.init operations executed by the same thread on\n\nmbarrier objects in .shared::cta state space.\n\nQualifier .release indicates memory synchronization as described in the Memory Consistency\n\nModel.\n\nOn sm_70 and higher membar is a synonym for fence.sc1, and the membar\n\nlevels cta, gl and sys are synonymous with the fence scopes cta, gpu and\n\nsys respectively.\n\nmembar.proxy and fence.proxy instructions establish an ordering between memory accesses that\n\nmay happen through different proxies. The type of proxy is indicated using the .proxykind\n\nqualifier. Value .alias of the .proxykind qualifier refers to memory accesses performed\n\nusing virtually aliased addresses to the same memory location. Value .async of the\n\n.proxykind qualifier specifies that the memory ordering is established between the async proxy\n\nand the generic proxy. The memory ordering is limited only to the state space specified. 
If no state\n\nspace is specified, then the memory ordering applies on all state spaces.\n\nOn sm_70 and higher, membar.proxy is a synonym for fence.proxy.\n\n1 The semantics of fence.sc introduced with sm_70 is a superset of the semantics of\n\nmembar and the two are compatible; when executing on sm_70 or later architectures,\n\nmembar acquires the full semantics of fence.sc.\n\nPTX ISA Notes\n\nmembar.{cta,gl} introduced in PTX ISA version 1.4.\n\nmembar.sys introduced in PTX ISA version 2.0.\n\nfence introduced in PTX ISA version 6.0.\n\nmembar.proxy and fence.proxy introduced in PTX ISA version 7.5.\n\n.cluster scope qualifier introduced in PTX ISA version 7.8.\n\n.op_restrict qualifier introduced in PTX ISA version 8.0.\n\nfence.proxy.async is introduced in PTX ISA version 8.0.\n\nTarget ISA Notes\n\nmembar.{cta,gl} supported on ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence" }; case "min": return { "html": "For more information, visit min(fp) , min(fp16) , min(int) .

Floating Point Instructions: min

\n\n\n

Find the minimum of two values.

\n

Syntax

\n
min{.ftz}{.NaN}{.xorsign.abs}.f32  d, a, b;\nmin.f64                            d, a, b;\n
\n
\n

Description

\n

Store the minimum of a and b in d.

\n

If .NaN modifier is specified, then the result is canonical NaN if either of the inputs is\nNaN.

\n

If .abs modifier is specified, the magnitude of destination operand d is the minimum of\nabsolute values of both the input arguments.

\n

If .xorsign modifier is specified, the sign bit of destination d is equal to the XOR of the\nsign bits of both the inputs.

\n

Modifiers .abs and .xorsign must be specified together and .xorsign considers the sign\nbit of both inputs before applying .abs operation.

\n

If the result of min is NaN then the .xorsign and .abs modifiers will be ignored.

\n
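
A worked illustration (registers hypothetical): for Ra = -3.0 and Rb = 2.0, the magnitudes 3.0 and 2.0 give a minimum of 2.0, and the XOR of the sign bits (negative with positive) makes the result -2.0.

\n
// Ra = -3.0, Rb = 2.0  =>  Rd = -2.0\nmin.xorsign.abs.f32 Rd, Ra, Rb;\n
\n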

Semantics

\n
if (.xorsign) {\n    xorsign = getSignBit(a) ^ getSignBit(b);\n    if (.abs) {\n        a = |a|;\n        b = |b|;\n   }\n}\nif (isNaN(a) && isNaN(b))                 d = NaN;\nelse if (.NaN && (isNaN(a) || isNaN(b)))  d = NaN;\nelse if (isNaN(a))                        d = b;\nelse if (isNaN(b))                        d = a;\nelse                                      d = (a < b) ? a : b;\nif (.xorsign && !isNaN(d)) {\n    setSignBit(d, xorsign);\n}\n
\n
\n

Notes

\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

min.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
sm_1x

min.f64 supports subnormal numbers.

\n

min.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
\n

If values of both inputs are 0.0, then +0.0 > -0.0.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

min.NaN introduced in PTX ISA version 7.0.

\n

min.xorsign.abs introduced in PTX ISA version 7.2.

\n

Target ISA Notes

\n

min.f32 supported on all target architectures.

\n

min.f64 requires sm_13 or higher.

\n

min.NaN requires sm_80 or higher.

\n

min.xorsign.abs requires sm_86 or higher.

\n

Examples

\n
@p  min.ftz.f32  z,z,x;\n    min.f64      a,b,c;\n    // fp32 min with .NaN\n    min.NaN.f32  f0,f1,f2;\n    // fp32 min with .xorsign.abs\n    min.xorsign.abs.f32 Rd, Ra, Rb;\n
\n
\n
\n

Half Precision Floating Point Instructions: min

\n\n\n

Find the minimum of two values.

\n

Syntax

\n
min{.ftz}{.NaN}{.xorsign.abs}.f16      d, a, b;\nmin{.ftz}{.NaN}{.xorsign.abs}.f16x2    d, a, b;\nmin{.NaN}{.xorsign.abs}.bf16           d, a, b;\nmin{.NaN}{.xorsign.abs}.bf16x2         d, a, b;\n
\n
\n

Description

\n

Store the minimum of a and b in d.

\n

For .f16x2 and .bf16x2 instruction types, input vectors are formed with half-word values\nfrom source operands. Half-word operands are then processed in parallel to store .f16x2 or\n.bf16x2 result in destination.

\n

For .f16 instruction type, operands d and a have .f16 or .b16 type. For\n.f16x2 instruction type, operands d and a have .f16x2 or .b32 type. For\n.bf16 instruction type, operands d and a have .b16 type. For .bf16x2 instruction\ntype, operands d and a have .b32 type.

\n

If .NaN modifier is specified, then the result is canonical NaN if either of the inputs is\nNaN.

\n

If .abs modifier is specified, the magnitude of destination operand d is the minimum of\nabsolute values of both the input arguments.

\n

If .xorsign modifier is specified, the sign bit of destination d is equal to the XOR of the\nsign bits of both the inputs.

\n

Modifiers .abs and .xorsign must be specified together and .xorsign considers the sign\nbit of both inputs before applying .abs operation.

\n

If the result of min is NaN then the .xorsign and .abs modifiers will be ignored.

\n

Semantics

\n
if (type == f16 || type == bf16) {\n    if (.xorsign) {\n        xorsign = getSignBit(a) ^ getSignBit(b);\n        if (.abs) {\n            a = |a|;\n            b = |b|;\n        }\n    }\n    if (isNaN(a) && isNaN(b))              d = NaN;\n    else if (.NaN && (isNaN(a) || isNaN(b)))    d = NaN;\n    else if (isNaN(a))                     d = b;\n    else if (isNaN(b))                     d = a;\n    else                                   d = (a < b) ? a : b;\n    if (.xorsign && !isNaN(d)) {\n         setSignBit(d, xorsign);\n    }\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    for (i = 0; i < 2; i++) {\n        if (.xorsign) {\n            xorsign = getSignBit(fA[i]) ^ getSignBit(fB[i]);\n            if (.abs) {\n               fA[i] = |fA[i]|;\n               fB[i] = |fB[i]|;\n            }\n        }\n        if (isNaN(fA[i]) && isNaN(fB[i]))              d[i] = NaN;\n        else if (.NaN && (isNaN(fA[i]) || isNaN(fB[i])))    d[i] = NaN;\n        else if (isNaN(fA[i]))                         d[i] = fB[i];\n        else if (isNaN(fB[i]))                         d[i] = fA[i];\n        else                                           d[i] = (fA[i] < fB[i]) ? fA[i] : fB[i];\n        if (.xorsign && !isNaN(d[i])) {\n            setSignBit(d[i], xorsign);\n        }\n    }\n}\n
\n
\n

Notes

\n
\n
Subnormal numbers:

By default, subnormal numbers are supported.\nmin.ftz.{f16, f16x2} flushes subnormal inputs and results to sign-preserving zero.

\n
\n
\n

If values of both inputs are 0.0, then +0.0 > -0.0.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

min.xorsign.abs introduced in PTX ISA version 7.2.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

min.xorsign.abs support requires sm_86 or higher.

\n

Examples

\n
min.ftz.f16       h0,h1,h2;\nmin.f16x2         b0,b1,b2;\n// SIMD fp16 min with .NaN\nmin.NaN.f16x2     b0,b1,b2;\nmin.bf16          h0, h1, h2;\n// SIMD bf16 min with NaN\nmin.NaN.bf16x2    b0, b1, b2;\n// scalar bf16 min with xorsign.abs\nmin.xorsign.abs.bf16 Rd, Ra, Rb;\n
\n
\n
\n

Integer Arithmetic Instructions: min

\n\n\n

Find the minimum of two values.

\n

Syntax

\n
min.atype         d, a, b;\nmin{.relu}.btype  d, a, b;\n\n.atype = { .u16, .u32, .u64,\n           .u16x2, .s16, .s64 };\n.btype = { .s16x2, .s32 };\n
\n
\n

Description

\n

Store the minimum of a and b in d.

\n

For .u16x2, .s16x2 instruction types, forms input vectors by extracting half-word values from source\noperands. Half-word operands are then processed in parallel to produce .u16x2, .s16x2 result\nin destination.

\n

Operands d, a and b have the same type as the instruction type. For instruction types\n.u16x2, .s16x2, operands d, a and b have type .b32.

\n

Semantics

\n
if (type == u16x2 || type == s16x2) {\n    iA[0] = a[0:15];\n    iA[1] = a[16:31];\n    iB[0] = b[0:15];\n    iB[1] = b[16:31];\n    for (i = 0; i < 2; i++) {\n         d[i] = (iA[i] < iB[i]) ? iA[i] : iB[i];\n    }\n} else {\n    d = (a < b) ? a : b; // Integer (signed and unsigned)\n}\n
\n
\n

Notes

\n

Signed and unsigned differ.

\n
\n
Saturation modifier:

min.relu.{s16x2, s32} clamps the result to 0 if negative.

\n
\n
\n
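
A worked illustration (registers hypothetical): min(-5, 3) is -5, which .relu then clamps to 0.

\n
// a = -5, b = 3  =>  min is -5, clamped to 0 by .relu\nmin.relu.s32 d, a, b;\n
\n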

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

min.u16x2, min{.relu}.s16x2 and min.relu.s32 introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

min.u16x2, min{.relu}.s16x2 and min.relu.s32 require sm_90 or higher.

\n

Examples

\n
    min.s32  r0,a,b;\n@p  min.u16  h,i,j;\n    min.relu.s16x2 u,v,w;\n
\n
\n
", "tooltip": "=====Floating Point Instructions: min\n\n\n\nFind the minimum of two values.\n\nSyntax\n\nmin{.ftz}{.NaN}{.xorsign.abs}.f32 d, a, b;\n\nmin.f64 d, a, b;\n\nDescription\n\nStore the minimum of a and b in d.\n\nIf .NaN modifier is specified, then the result is canonical NaN if either of the inputs is\n\nNaN.\n\nIf .abs modifier is specified, the magnitude of destination operand d is the minimum of\n\nabsolute values of both the input argument...\n\n=====Half Precision Floating Point Instructions: min\n\n\n\nFind the minimum of two values.\n\nSyntax\n\nmin{.ftz}{.NaN}{.xorsign.abs}.f16 d, a, b;\n\nmin{.ftz}{.NaN}{.xorsign.abs}.f16x2 d, a, b;\n\nmin{.NaN}{.xorsign.abs}.bf16 d, a, b;\n\nmin{.NaN}{.xorsign.abs}.bf16x2 d, a, b;\n\nDescription\n\nStore the minimum of a and b in d.\n\nFor .f16x2 and .bf16x2 instruction types, input vectors are formed with half-word values\n\nfrom source operands. Half-word o...\n\n=====Integer Arithmetic Instructions: min\n\n\n\nFind the minimum of two values.\n\nSyntax\n\nmin.atype d, a, b;\n\nmin{.relu}.btype d, a, b;\n\n.atype = { .u16, .u32, .u64,\n\n .u16x2, .s16, .s64 };\n\n.btype = { .s16x2, .s32 };\n\nDescription\n\nStore the minimum of a and b in d.\n\nFor .u16x2, .s16x2 instruction types, forms input vectors by half word values from source\n\noperands. Half-word operands are then processed in parallel to produce .u16x2, .s... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min" }; case "minnctapersm": return { "html": "For more information, visit minnctapersm .

Performance-Tuning Directives: .minnctapersm

\n\n\n

Minimum number of CTAs per SM.

\n

Syntax

\n
.minnctapersm ncta\n
\n
\n

Description

\n

Declare the minimum number of CTAs from the kernel\u2019s grid to be mapped to a single multiprocessor\n(SM).

\n

Notes

\n

Optimizations based on .minnctapersm need either .maxntid or .reqntid to be specified as\nwell.

\n

If the total number of threads on a single SM resulting from .minnctapersm and .maxntid /\n.reqntid exceeds the maximum number of threads supported by an SM, then the directive .minnctapersm\nwill be ignored.

\n

In PTX ISA version 2.1 or higher, a warning is generated if .minnctapersm is specified without\nspecifying either .maxntid or .reqntid.

\n
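
For instance (kernel name hypothetical), a declaration that satisfies this requirement by pairing .minnctapersm with .reqntid:

\n
.entry bar .reqntid 128 .minnctapersm 8 { ... }\n
\n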

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0 as a replacement for .maxnctapersm.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
.entry foo .maxntid 256 .minnctapersm 4 { ... }\n
\n
\n
", "tooltip": "Minimum number of CTAs per SM.\n\nSyntax\n\n.minnctapersm ncta\n\nDescription\n\nDeclare the minimum number of CTAs from the kernel\u2019s grid to be mapped to a single multiprocessor\n\n(SM).\n\nNotes\n\nOptimizations based on .minnctapersm need either .maxntid or .reqntid to be specified as\n\nwell.\n\nIf the total number of threads on a single SM resulting from .minnctapersm and .maxntid /\n\n.reqntid exceed maximum number of threads supported by an SM then directive .minnctapersm\n\nwill be ignored.\n\nIn PTX ISA version 2.1 or higher, a warning is generated if .minnctapersm is specified without\n\nspecifying either .maxntid or .reqntid.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0 as a replacement for .maxnctapersm.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.entry foo .maxntid 256 .minnctapersm 4 { ... }\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-minnctapersm" }; case "mov": return { "html": "For more information, visit mov , mov .

Data Movement and Conversion Instructions: mov

\n\n\n

Set a register variable with the value of a register variable or an immediate value. Take the\nnon-generic address of a variable in global, local, or shared state space.

\n

Syntax

\n
mov.type  d, a;\nmov.type  d, sreg;\nmov.type  d, avar;       // get address of variable\nmov.type  d, avar+imm;   // get address of variable with offset\nmov.u32   d, fname;      // get address of device function\nmov.u64   d, fname;      // get address of device function\nmov.u32   d, kernel;     // get address of entry function\nmov.u64   d, kernel;     // get address of entry function\n\n.type = { .pred,\n          .b16, .b32, .b64,\n          .u16, .u32, .u64,\n          .s16, .s32, .s64,\n                .f32, .f64 };\n
\n
\n

Description

\n

Write register d with the value of a.

\n

Operand a may be a register, special register, variable with optional offset in an addressable\nmemory space, or function name.

\n

For variables declared in .const, .global, .local, and .shared state spaces, mov\nplaces the non-generic address of the variable (i.e., the address of the variable in its state\nspace) into the destination register. The generic address of a variable in const, global,\nlocal, or shared state space may be generated by first taking the address within the state\nspace with mov and then converting it to a generic address using the cvta instruction;\nalternately, the generic address of a variable declared in const, global, local, or\nshared state space may be taken directly using the cvta instruction.

\n
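
A minimal hedged sketch (variable and register names hypothetical) of taking the non-generic address of a shared variable and then forming its generic counterpart:

\n
.shared .align 4 .b32 sVar;\n.reg .u64 rs, rg;\n\nmov.u64         rs, sVar; // non-generic address within .shared\ncvta.shared.u64 rg, rs;   // convert to a generic address\n
\n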

Note that if the address of a device function parameter is moved to a register, the parameter will\nbe copied onto the stack and the address will be in the local state space.

\n

Semantics

\n
d = a;\nd = sreg;\nd = &avar;        // address is non-generic; i.e., within the variable's declared state space\nd = &avar+imm;\n
\n
\n

Notes

\n
    \n
  • Although only predicate and bit-size types are required, we include the arithmetic types for the\nprogrammer\u2019s convenience: their use enhances program readability and allows additional type\nchecking.

  • \n
  • When moving the address of a kernel or a device function, only .u32 or .u64 instruction types\nare allowed. However, if a signed type is used, it is not treated as a compilation error. The\ncompiler issues a warning in this case.

  • \n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Taking the address of kernel entry functions requires PTX ISA version 3.1 or later. Kernel function\naddresses should only be used in the context of CUDA Dynamic Parallelism system calls. See the CUDA\nDynamic Parallelism Programming Guide for details.

\n

Target ISA Notes

\n

mov.f64 requires sm_13 or higher.

\n

Taking the address of kernel entry functions requires sm_35 or higher.

\n

Examples

\n
mov.f32  d,a;\nmov.u16  u,v;\nmov.f32  k,0.1;\nmov.u32  ptr, A;        // move address of A into ptr\nmov.u32  ptr, A[5];     // move address of A[5] into ptr\nmov.u32  ptr, A+20;     // move address with offset into ptr\nmov.u32  addr, myFunc;  // get address of device function 'myFunc'\nmov.u64  kptr, main;    // get address of entry function 'main'\n
\n
\n
\n

Data Movement and Conversion Instructions: mov

\n\n\n

Move vector-to-scalar (pack) or scalar-to-vector (unpack).

\n

Syntax

\n
mov.type  d, a;\n\n.type = { .b16, .b32, .b64 };\n
\n
\n

Description

\n

Write scalar register d with the packed value of vector register a, or write vector register\nd with the unpacked values from scalar register a.

\n

When destination operand d is a vector register, the sink symbol '_' may be used for one or\nmore elements provided that at least one element is a scalar register.

\n

For bit-size types, mov may be used to pack vector elements into a scalar register or unpack\nsub-fields of a scalar register into a vector. Both the overall size of the vector and the size of\nthe scalar must match the size of the instruction type.

\n

Semantics

\n
// pack two 8-bit elements into .b16\nd = a.x | (a.y << 8)\n// pack four 8-bit elements into .b32\nd = a.x | (a.y << 8)  | (a.z << 16) | (a.w << 24)\n// pack two 16-bit elements into .b32\nd = a.x | (a.y << 16)\n// pack four 16-bit elements into .b64\nd = a.x | (a.y << 16)  | (a.z << 32) | (a.w << 48)\n// pack two 32-bit elements into .b64\nd = a.x | (a.y << 32)\n\n// unpack 8-bit elements from .b16\n{ d.x, d.y } = { a[0..7], a[8..15] }\n// unpack 8-bit elements from .b32\n{ d.x, d.y, d.z, d.w } =\n        { a[0..7], a[8..15], a[16..23], a[24..31] }\n\n// unpack 16-bit elements from .b32\n{ d.x, d.y }  = { a[0..15], a[16..31] }\n// unpack 16-bit elements from .b64\n{ d.x, d.y, d.z, d.w } =\n        { a[0..15], a[16..31], a[32..47], a[48..63] }\n\n// unpack 32-bit elements from .b64\n{ d.x, d.y } = { a[0..31], a[32..63] }\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
mov.b32 %r1,{a,b};      // a,b have type .u16\nmov.b64 {lo,hi}, %x;    // %x is a double; lo,hi are .u32\nmov.b32 %r1,{x,y,z,w};  // x,y,z,w have type .b8\nmov.b32 {r,g,b,a},%r1;  // r,g,b,a have type .u8\nmov.b64 {%r1, _}, %x;   // %x is.b64, %r1 is .b32\n
\n
\n
", "tooltip": "=====Data Movement and Conversion Instructions: mov\n\n\n\nSet a register variable with the value of a register variable or an immediate value. Take the\n\nnon-generic address of a variable in global, local, or shared state space.\n\nSyntax\n\nmov.type d, a;\n\nmov.type d, sreg;\n\nmov.type d, avar; // get address of variable\n\nmov.type d, avar+imm; // get address of variable with offset\n\nmov.u32 d, fname; // get address of device function\n\nmov.u64 d, f...\n\n=====Data Movement and Conversion Instructions: mov\n\n\n\nMove vector-to-scalar (pack) or scalar-to-vector (unpack).\n\nSyntax\n\nmov.type d, a;\n\n.type = { .b16, .b32, .b64 };\n\nDescription\n\nWrite scalar register d with the packed value of vector register a, or write vector register\n\nd with the unpacked values from scalar register a.\n\nWhen destination operand d is a vector register, the sink symbol '_' may be used for one or\n\nmore elements provided that at least one... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov" }; case "mul": return { "html": "For more information, visit mul(fp) , mul(fp16) , mul(int) .

Floating Point Instructions: mul

\n\n\n

Multiply two values.

\n

Syntax

\n
mul{.rnd}{.ftz}{.sat}.f32  d, a, b;\nmul{.rnd}.f64              d, a, b;\n\n.rnd = { .rn, .rz, .rm, .rp };\n
\n
\n

Description

\n

Compute the product of two values.

\n

Semantics

\n
d = a * b;\n
\n
\n

Notes

\n

For floating-point multiplication, all operands must be the same size.

\n

Rounding modifiers:

\n
\n
.rn

mantissa LSB rounds to nearest even

\n
\n
.rz

mantissa LSB rounds towards zero

\n
\n
.rm

mantissa LSB rounds towards negative infinity

\n
\n
.rp

mantissa LSB rounds towards positive infinity

\n
\n
\n

The default value of rounding modifier is .rn. Note that a mul instruction with an explicit\nrounding modifier is treated conservatively by the code optimizer. A mul instruction with no\nrounding modifier defaults to round-to-nearest-even and may be optimized aggressively by the code\noptimizer. In particular, mul/add and mul/sub sequences with no rounding modifiers may be\noptimized to use fused-multiply-add instructions on the target device.

\n
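
As a hedged illustration (registers hypothetical): spelling out the rounding modifier keeps the multiply as a separately rounded operation, while the unannotated form may be contracted into a fused multiply-add.

\n
mul.rn.f32 t, a, b;  // explicit .rn: treated conservatively, not contracted\nadd.rn.f32 d, t, c;\nmul.f32    t2, a, b; // no modifier: may be fused with a following add\n
\n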

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

mul.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
sm_1x

mul.f64 supports subnormal numbers.

\n

mul.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
\n

Saturation modifier:

\n

mul.sat.f32 clamps the result to [0.0, 1.0]. NaN results are flushed to +0.0f.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

mul.f32 supported on all target architectures.

\n

mul.f64 requires sm_13 or higher.

\n

Rounding modifiers have the following target requirements:

\n
\n
.rn, .rz

available for all targets

\n
\n
.rm, .rp

for mul.f64, requires sm_13 or higher.

\n

for mul.f32, requires sm_20 or higher.

\n
\n
\n

Examples

\n
mul.ftz.f32 circumf,radius,pi;  // a single-precision multiply\n
\n
\n
\n

Half Precision Floating Point Instructions: mul

\n\n\n

Multiply two values.

\n

Syntax

\n
mul{.rnd}{.ftz}{.sat}.f16   d, a, b;\nmul{.rnd}{.ftz}{.sat}.f16x2 d, a, b;\n\nmul{.rnd}.bf16   d, a, b;\nmul{.rnd}.bf16x2 d, a, b;\n\n.rnd = { .rn };\n
\n
\n

Description

\n

Performs multiplication and writes the resulting value into a destination register.

\n

For .f16x2 and .bf16x2 instruction type, forms input vectors by extracting half-word values from source\noperands. Half-word operands are then multiplied in parallel to produce .f16x2 or .bf16x2\nresult in destination.

\n

For .f16 instruction type, operands d, a and b have .f16 or .b16 type. For\n.f16x2 instruction type, operands d, a and b have .b32 type. For .bf16\ninstruction type, operands d, a, b have .b16 type. For .bf16x2 instruction type,\noperands d, a, b have .b32 type.

\n

Semantics

\n
if (type == f16 || type == bf16) {\n    d = a * b;\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    for (i = 0; i < 2; i++) {\n         d[i] = fA[i] * fB[i];\n    }\n}\n
\n
\n

Notes

\n

Rounding modifiers:

\n
\n
.rn

mantissa LSB rounds to nearest even

\n
\n
\n

The default value of rounding modifier is .rn. Note that a mul instruction with an explicit\nrounding modifier is treated conservatively by the code optimizer. A mul instruction with no\nrounding modifier defaults to round-to-nearest-even and may be optimized aggressively by the code\noptimizer. In particular, mul/add and mul/sub sequences with no rounding modifiers may\nbe optimized to use fused-multiply-add instructions on the target device.

\n
\n
Subnormal numbers:

By default, subnormal numbers are supported.\nmul.ftz.{f16, f16x2} flushes subnormal inputs and results to sign-preserving zero.

\n
\n
Saturation modifier:

mul.sat.{f16, f16x2} clamps the result to [0.0, 1.0]. NaN results are flushed to +0.0f.

\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 4.2.

\n

mul{.rnd}.bf16 and mul{.rnd}.bf16x2 introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_53 or higher.

\n

mul{.rnd}.bf16 and mul{.rnd}.bf16x2 requires sm_90 or higher.

\n

Examples

\n
// scalar f16 multiplications\nmul.f16        d0, a0, b0;\nmul.rn.f16     d1, a1, b1;\nmul.bf16       bd0, ba0, bb0;\nmul.rn.bf16    bd1, ba1, bb1;\n\n// SIMD f16 multiplication\ncvt.rn.f16.f32 h0, f0;\ncvt.rn.f16.f32 h1, f1;\ncvt.rn.f16.f32 h2, f2;\ncvt.rn.f16.f32 h3, f3;\nmov.b32  p1, {h0, h1};   // pack two f16 to 32bit f16x2\nmov.b32  p2, {h2, h3};   // pack two f16 to 32bit f16x2\nmul.f16x2  p3, p1, p2;   // SIMD f16x2 multiplication\n\n// SIMD bf16 multiplication\ncvt.rn.bf16x2.f32 p4, f4, f5; // Convert two f32 into packed bf16x2\ncvt.rn.bf16x2.f32 p5, f6, f7; // Convert two f32 into packed bf16x2\nmul.bf16x2  p6, p4, p5;       // SIMD bf16x2 multiplication\n\n// SIMD fp16 multiplication\nld.global.b32   f0, [addr];     // load 32 bit which hold packed f16x2\nld.global.b32   f1, [addr + 4]; // load 32 bit which hold packed f16x2\nmul.f16x2       f2, f0, f1;     // SIMD f16x2 multiplication\n\n// SIMD bf16 multiplication\nld.global.b32   f3, [addr + 8];  // load 32 bit which hold packed bf16x2\nld.global.b32   f4, [addr + 12]; // load 32 bit which hold packed bf16x2\nmul.bf16x2      f5, f3, f4;      // SIMD bf16x2 multiplication\n
\n
\n
\n

Integer Arithmetic Instructions: mul

\n\n\n

Multiply two values.

\n

Syntax

\n
mul.mode.type  d, a, b;\n\n.mode = { .hi, .lo, .wide };\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64 };\n
\n
\n

Description

\n

Compute the product of two values.

\n

Semantics

\n
t = a * b;\nn = bitwidth of type;\nd = t;            // for .wide\nd = t<2n-1..n>;   // for .hi variant\nd = t<n-1..0>;    // for .lo variant\n
\n
\n

Notes

\n

The type of the operation represents the types of the a and b operands. If .hi or\n.lo is specified, then d is the same size as a and b, and either the upper or lower\nhalf of the result is written to the destination register. If .wide is specified, then d is\ntwice as wide as a and b to receive the full result of the multiplication.

\n

The .wide suffix is supported only for 16- and 32-bit integer types.

\n
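
As an illustrative sketch (hypothetical register names), the full 64-bit product of two 32-bit operands can be obtained either with .wide or by combining the .lo and .hi halves:

\n
mul.wide.u32 p,  x, y;   // 64-bit p receives the full product\nmul.lo.u32   lo, x, y;   // bits 31..0 of the same product\nmul.hi.u32   hi, x, y;   // bits 63..32 of the same product\n
\n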

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
mul.wide.s16 fa,fxs,fys;   // 16*16 bits yields 32 bits\nmul.lo.s16 fa,fxs,fys;     // 16*16 bits, save only the low 16 bits\nmul.wide.s32 z,x,y;        // 32*32 bits, creates 64 bit result\n
\n
\n
", "tooltip": "=====Floating Point Instructions: mul\n\n\n\nMultiply two values.\n\nSyntax\n\nmul{.rnd}{.ftz}{.sat}.f32 d, a, b;\n\nmul{.rnd}.f64 d, a, b;\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nCompute the product of two values.\n\nSemantics\n\nd = a * b;\n\nNotes\n\nFor floating-point multiplication, all operands must be the same size.\n\nRounding modifiers:\n\n.rnmantissa LSB rounds to nearest even\n\n.rzmantissa LSB rounds towards zero\n\n.rmmantissa LSB rounds toward...\n\n=====Half Precision Floating Point Instructions: mul\n\n\n\nMultiply two values.\n\nSyntax\n\nmul{.rnd}{.ftz}{.sat}.f16 d, a, b;\n\nmul{.rnd}{.ftz}{.sat}.f16x2 d, a, b;\n\nmul{.rnd}.bf16 d, a, b;\n\nmul{.rnd}.bf16x2 d, a, b;\n\n.rnd = { .rn };\n\nDescription\n\nPerforms multiplication and writes the resulting value into a destination register.\n\nFor .f16x2 and .bf16x2 instruction type, forms input vectors by half word values from source\n\noperands. Half-word operands are then mul...\n\n=====Integer Arithmetic Instructions: mul\n\n\n\nMultiply two values.\n\nSyntax\n\nmul.mode.type d, a, b;\n\n.mode = { .hi, .lo, .wide };\n\n.type = { .u16, .u32, .u64,\n\n .s16, .s32, .s64 };\n\nDescription\n\nCompute the product of two values.\n\nSemantics\n\nt = a * b;\n\nn = bitwidth of type;\n\nd = t; // for .wide\n\nd = t<2n-1..n>; // for .hi variant\n\nd = t; // for .lo variant\n\nNotes\n\nThe type of the operation represents the types of the a and ... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul" }; case "mul24": return { "html": "For more information, visit mul24(int) .

Integer Arithmetic Instructions: mul24

\n\n\n

Multiply two 24-bit integer values.

\n

Syntax

\n
mul24.mode.type  d, a, b;\n\n.mode = { .hi, .lo };\n.type = { .u32, .s32 };\n
\n
\n

Description

\n

Compute the product of two 24-bit integer values held in 32-bit source registers, and return either\nthe high or low 32-bits of the 48-bit result.

\n

Semantics

\n
t = a * b;\nd = t<47..16>;    // for .hi variant\nd = t<31..0>;     // for .lo variant\n
\n
\n

Notes

\n

Integer multiplication yields a result that is twice the size of the input operands, i.e., 48-bits.

\n

mul24.hi performs a 24x24-bit multiply and returns the high 32 bits of the 48-bit result.

\n

mul24.lo performs a 24x24-bit multiply and returns the low 32 bits of the 48-bit result.

\n

All operands are of the same type and size.

\n

mul24.hi may be less efficient on machines without hardware support for 24-bit multiply.

\n
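
As a sketch of the bit ranges involved (hypothetical registers), note that mul24.hi returns bits 47..16 of the 48-bit product, not the upper 32 bits of a 64-bit product:

\n
mul24.hi.u32 hi, a, b;   // hi = t<47..16> of the 48-bit product t\nmul24.lo.u32 lo, a, b;   // lo = t<31..0>\n
\n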

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
mul24.lo.s32 d,a,b;   // low 32-bits of 24x24-bit signed multiply.\n
\n
\n
", "tooltip": "Multiply two 24-bit integer values.\n\nSyntax\n\nmul24.mode.type d, a, b;\n\n.mode = { .hi, .lo };\n\n.type = { .u32, .s32 };\n\nDescription\n\nCompute the product of two 24-bit integer values held in 32-bit source registers, and return either\n\nthe high or low 32-bits of the 48-bit result.\n\nSemantics\n\nt = a * b;\n\nd = t<47..16>; // for .hi variant\n\nd = t<31..0>; // for .lo variant\n\nNotes\n\nInteger multiplication yields a result that is twice the size of the input operands, i.e., 48-bits.\n\nmul24.hi performs a 24x24-bit multiply and returns the high 32 bits of the 48-bit result.\n\nmul24.lo performs a 24x24-bit multiply and returns the low 32 bits of the 48-bit result.\n\nAll operands are of the same type and size.\n\nmul24.hi may be less efficient on machines without hardware support for 24-bit multiply.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmul24.lo.s32 d,a,b; // low 32-bits of 24x24-bit signed multiply.\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mul24" }; case "multimem": return { "html": "For more information, visit multimem.ld_reduce , multimem.red , multimem.st .

Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red

\n\n\n

Multimem addresses can be accessed only by multimem.* operations. Accessing a multimem address\nwith ld, st or any other memory operations results in undefined behavior.

\n

Refer to CUDA programming guide for creation and management of the multimem addresses.

\n

multimem.ld_reduce, multimem.st, multimem.red

\n

Perform memory operations on the multimem address.

\n

Syntax

\n
// Integer type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op.type      d, [a];\nmultimem.st{.stsem}{.scope}{.ss}.type                [a], b;\nmultimem.red{.redsem}{.scope}{.ss}.op.type           [a], b;\n\n.ss =       { .global }\n.ldsem =    { .weak, .relaxed, .acquire }\n.stsem =    { .weak, .relaxed, .release }\n.redsem =   { .relaxed, .release }\n.scope =    { .cta, .cluster, .gpu, .sys }\n.op  =      { .min, .max, .add, .and, .or, .xor }\n.type =     { .b32, .b64,  .u32, .u64, .s32, .s64 }\n\n// Floating point type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op{.vec}.type    d, [a];\nmultimem.st{.stsem}{.scope}{.ss}{.vec}.type              [a], b;\nmultimem.red{.redsem}{.scope}{.ss}.redop{.vec}.type      [a], b;\n\n.ss =       { .global }\n.ldsem =    { .weak, .relaxed, .acquire }\n.stsem =    { .weak, .relaxed, .release }\n.redsem =   { .relaxed, .release }\n.scope =    { .cta, .cluster, .gpu, .sys }\n.op  =      { .min, .max, .add }\n.redop  =   { .add }\n.vec =      { .v2, .v4, .v8 }\n.type=      { .f16, .f16x2, .bf16, .bf16x2, .f32, .f64 }\n
\n
\n

Description

\n

Instruction multimem.ld_reduce performs the following operations:

\n
    \n
  • load operation on the multimem address a, which involves loading of data from all of the\nmultiple memory locations pointed to by the multimem address a,

  • \n
  • reduction operation specified by .op on the multiple data loaded from the multimem address\na.

  • \n
\n

The result of the reduction operation is returned in register d.

\n

Instruction multimem.st performs a store operation of the input operand b to all the memory\nlocations pointed to by the multimem address a.

\n

Instruction multimem.red performs a reduction operation on all the memory locations pointed to\nby the multimem address a, with operand b.

\n

Instruction multimem.ld_reduce performs reduction on the values loaded from all the memory\nlocations that the multimem address points to. In contrast, multimem.red performs the reduction\non all the memory locations that the multimem address points to.

\n
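
A minimal sketch of this contrast (operands hypothetical): ld_reduce reduces into a register, while red updates every memory location:

\n
multimem.ld_reduce.add.u32  r, [maddr];   // r = sum of the values at all locations\nmultimem.red.add.u32        [maddr], v;   // every location is updated with its value + v\n
\n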

Address operand a must be a multimem address. Otherwise, the behavior is undefined. Supported\naddressing modes for operand a and alignment requirements are described in Addresses as Operands.

\n

If no state space is specified then Generic Addressing is\nused. If the address specified by a does not fall within the address window of .global state\nspace then the behavior is undefined.

\n

For floating-point multimem operations, the size of the specified type together with .vec must\nequal 32, 64, or 128 bits; no other combinations of .vec and type are\nallowed. Type .f64 cannot be used with the .vec qualifier.

\n
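
For instance, a sketch of the size rule (operands hypothetical):

\n
multimem.ld_reduce.add.v4.f32   {d0,d1,d2,d3}, [a];  // 4 x 32 = 128 bits: valid\nmultimem.ld_reduce.add.v2.f16x2 {h0,h1},       [a];  // 2 x 32 = 64 bits: valid\n// .v8.f32 (256 bits) and .v2.f64 (.vec with .f64) are not allowed\n
\n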

The following table describes the valid combinations of .op and base type:

\n
op               Base type\n.add             .u32, .u64, .s32,\n                 .f16, .f16x2, .bf16, .bf16x2,\n                 .f32, .f64\n.and, .or, .xor  .b32, .b64\n.min, .max       .u32, .s32, .u64, .s64,\n                 .f16, .f16x2, .bf16, .bf16x2\n
\n

Optional qualifiers .ldsem, .stsem and .redsem specify the memory synchronizing effect\nof the multimem.ld_reduce, multimem.st and multimem.red respectively, as described in\nMemory Consistency Model. If explicit semantics qualifiers\nare not specified, then multimem.ld_reduce and multimem.st default to .weak and\nmultimem.red defaults to .relaxed.

\n

The optional .scope qualifier specifies the set of threads that can directly observe the memory\nsynchronizing effect of this operation, as described in Memory Consistency Model. If the .scope qualifier is not specified for\nmultimem.red then .sys scope is assumed by default.

\n
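
In other words (a sketch, operands hypothetical), omitting the qualifiers is equivalent to spelling out the defaults:

\n
multimem.red.add.u32              [a], r;  // defaults apply\nmultimem.red.relaxed.sys.add.u32  [a], r;  // explicit equivalent of the line above\n
\n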

PTX ISA Notes

\n

Introduced in PTX ISA version 8.1.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
multimem.ld_reduce.and.b32                    val1_b32, [addr1];\nmultimem.ld_reduce.acquire.gpu.global.add.u32 val2_u32, [addr2];\n\nmultimem.st.relaxed.gpu.b32                [addr3], val3_b32;\nmultimem.st.release.cta.global.u32         [addr4], val4_u32;\n\nmultimem.red.relaxed.gpu.max.f64           [addr5], val5_f64;\nmultimem.red.release.cta.global.add.v4.f32 [addr6], {val6, val7, val8, val9};\n
\n
\n
\n

", "tooltip": "locations which the multimem address points to.\n\nMultimem addresses can only be accessed only by multimem.* operations. Accessing a multimem address\n\nwith ld, st or any other memory operations results in undefined behavior.\n\nRefer to CUDA programming guide for creation and management of the multimem addresses.\n\nmultimem.ld_reduce, multimem.st, multimem.red\n\nPerform memory operations on the multimem address.\n\nSyntax\n\n// Integer type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op.type d, [a];\n\nmultimem.st{.stsem}{.scope}{.ss}.type [a], b;\n\nmultimem.red{.redsem}{.scope}{.ss}.op.type [a], b;\n\n.ss = { .global }\n\n.ldsem = { .weak, .relaxed, .acquire }\n\n.stsem = { .weak, .relaxed, .release }\n\n.redsem = { .relaxed, .release }\n\n.scope = { .cta, .cluster, .gpu, .sys }\n\n.op = { .min, .max, .add, .and, .or, .xor }\n\n.type = { .b32, .b64, .u32, .u64, .s32, .s64 }\n\n// Floating point type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op{.vec}.type d, [a];\n\nmultimem.st{.stsem}{.scope}{.ss}{.vec}.type [a], b;\n\nmultimem.red{.redsem}{.scope}{.ss}.redop{.vec}.type [a], b;\n\n.ss = { .global }\n\n.ldsem = { .weak, .relaxed, .acquire }\n\n.stsem = { .weak, .relaxed, .release }\n\n.redsem = { .relaxed, .release }\n\n.scope = { .cta, .cluster, .gpu, .sys }\n\n.op = { .min, .max, .add }\n\n.redop = { .add }\n\n.vec = { .v2, .v4, .v8 }\n\n.type= { .f16, .f16x2, .bf16, .bf16x2, .f32, .f64 }\n\nDescription\n\nInstruction multimem.ld_reduce performs the following operations:\n\nload operation on the multimem address a, which involves loading of data from all of the\n\nmultiple memory locations pointed to by the multimem address a,\n\nreduction operation specified by .op on the multiple data loaded from the multimem address\n\na.\n\nThe result of the reduction operation in returned in register d.\n\nInstruction multimem.st performs a store operation of the input operand b to all the memory\n\nlocations pointed to by the multimem address a.\n\nInstruction multimem.red performs a reduction operation on all the memory locations pointed to\n\nby the multimem address a, with operand b.\n\nInstruction multimem.ld_reduce performs reduction on the values loaded from all the memory\n\nlocations that the multimem address points to. In contrast, the multimem.red perform reduction\n\non all the memory locations that the multimem address points to.\n\nAddress operand a must be a multimem address. Otherwise, the behavior is undefined. Supported\n\naddressing modes for operand a and alignment requirements are described in Addresses as Operands.\n\nIf no state space is specified then Generic Addressing is\n\nused. If the address specified by a does not fall within the address window of .global state\n\nspace then the behavior is undefined.\n\nFor floating-point type multi- operations, the size of the specified type along with .vec must\n\nequal either 32-bits or 64-bits or 128-bits. No other combinations of .vec and type are\n\nallowed. Type .f64 cannot be used with .vec qualifier.\n\nThe following table describes the valid combinations of .op and base type:\n\n\n\nop\n\nBase type\n\n.add\n\n.u32, .u64, .s32\n\n.f16, .f16x2, .bf16, .bf16x2\n\n.f32, .f64\n\n.and, .or, .xor\n\n.b32, .b64\n\n.min, .max\n\n.u32, .s32, .u64, .s644\n\n.f16, .f16x2, .bf16, .bf16x2\n\n\n\nOptional qualifiers .ldsem, .stsem and .redsem specify the memory synchronizing effect\n\nof the multimem.ld_reduce, multimem.st and multimem.red respectively, as described in\n\nMemory Consistency Model. 
If explicit semantics qualifiers\n\nare not specified, then multimem.ld_reduce and multimem.st default to .weak and\n\nmultimem.red defaults to .relaxed.\n\nThe optional .scope qualifier specifies the set of threads that can directly observe the memory\n\nsynchronizing effect of this operation, as described in Memory Consistency Model. If the .scope qualifier is not specified for\n\nmultimem.red then .sys scope is assumed by default.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 8.1.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\nmultimem.ld_reduce.and.b32 va ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red" }; case "nanosleep": return { "html": "For more information, visit nanosleep .

Miscellaneous Instructions: nanosleep

\n\n\n

Suspend the thread for an approximate delay given in nanoseconds.

\n

Syntax

\n
nanosleep.u32 t;\n
\n
\n

Description

\n

Suspends the thread for a duration approximately equal to the delay t, specified in\nnanoseconds. t may be a register or an immediate value.

\n

The sleep duration is approximated, but guaranteed to be in the interval [0, 2*t]. The maximum\nsleep duration is 1 millisecond. The implementation may reduce the sleep duration for individual\nthreads within a warp such that all sleeping threads in the warp wake up together.

\n
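
A common pattern (a sketch, not from the source; labels, registers, and the cap value are hypothetical) is an exponential backoff in a polling loop:

\n
.reg .b32 r;\n.reg .pred p;\n\n     mov.u32 r, 32;           // initial delay request, in ns\nspin:\n     nanosleep.u32 r;         // sleeps somewhere in [0, 2*r] ns\n     shl.b32 r, r, 1;         // double the request for the next attempt\n     setp.lt.u32 p, r, 1024;  // real code would also poll a flag here\n@p   bra spin;\n
\n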

PTX ISA Notes

\n

nanosleep introduced in PTX ISA 6.3.

\n

Target ISA Notes

\n

nanosleep requires sm_70 or higher.

\n

Examples

\n
.reg .b32 r;\n.reg .pred p;\n\nnanosleep.u32 r;\nnanosleep.u32 42;\n@p nanosleep.u32 r;\n
\n
\n
", "tooltip": "Suspend the thread for an approximate delay given in nanoseconds.\n\nSyntax\n\nnanosleep.u32 t;\n\nDescription\n\nSuspends the thread for a sleep duration approximately close to the delay t, specified in\n\nnanoseconds. t may be a register or an immediate value.\n\nThe sleep duration is approximated, but guaranteed to be in the interval [0, 2*t]. The maximum\n\nsleep duration is 1 millisecond. The implementation may reduce the sleep duration for individual\n\nthreads within a warp such that all sleeping threads in the warp wake up together.\n\nPTX ISA Notes\n\nnanosleep introduced in PTX ISA 6.3.\n\nTarget ISA Notes\n\nnanosleep requires sm_70 or higher.\n\nExamples\n\n.reg .b32 r;\n\n.reg .pred p;\n\nnanosleep.u32 r;\n\nnanosleep.u32 42;\n\n@p nanosleep.u32 r;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep" }; case "nclusterid": return { "html": "For more information, visit nclusterid .

Special Registers: %nclusterid

\n\n\n

Number of cluster identifiers per grid.

\n

Syntax (predefined)

\n
.sreg .v4 .u32 %nclusterid;\n.sreg .u32 %nclusterid.x, %nclusterid.y, %nclusterid.z;\n
\n
\n

Description

\n

A predefined, read-only special register initialized with the number of clusters in each grid\ndimension.

\n

The %nclusterid special register contains a 3D grid shape vector that holds the grid dimensions\nin terms of clusters. The fourth element is unused and always returns zero.

\n

Refer to the CUDA Programming Guide for details on the maximum values of %nclusterid.{x,y,z}.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
.reg .b32 %r<2>;\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %nclusterid.x;\nmov.u32     %r1, %nclusterid.z;\nmov.v4.u32  %rx, %nclusterid;\n
\n
\n
", "tooltip": "Number of cluster identifiers per grid.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %nclusterid;\n\n.sreg .u32 %nclusterid.x, %nclusterid.y, %nclusterid.z;\n\nDescription\n\nA predefined, read-only special register initialized with the number of clusters in each grid\n\ndimension.\n\nThe %nclusterid special register contains a 3D grid shape vector that holds the grid dimensions\n\nin terms of clusters. The fourth element is unused and always returns zero.\n\nRefer to the Cuda Programming Guide for details on the maximum values of %nclusterid.{x,y,z}.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r<2>;\n\n.reg .v4 .b32 %rx;\n\nmov.u32 %r0, %nclusterid.x;\n\nmov.u32 %r1, %nclusterid.z;\n\nmov.v4.u32 %rx, %nclusterid;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nclusterid" }; case "nctaid": return { "html": "For more information, visit nctaid .

Special Registers: %nctaid

\n\n\n

Number of CTA ids per grid.

\n

Syntax (predefined)

\n
.sreg .v4 .u32 %nctaid                      // Grid shape vector\n.sreg .u32 %nctaid.x,%nctaid.y,%nctaid.z;   // Grid dimensions\n
\n
\n

Description

\n

A predefined, read-only special register initialized with the number of CTAs in each grid\ndimension. The %nctaid special register contains a 3D grid shape vector, with each element\nhaving a value of at least 1. The fourth element is unused and always returns zero.

\n

Maximum values of %nctaid.{x,y,z} are as follows:

\n
.target architecture                        %nctaid.x   %nctaid.y   %nctaid.z\nsm_1x, sm_20                                65535       65535       65535\nsm_3x, sm_5x, sm_6x, sm_7x, sm_8x, sm_9x    2^31 - 1    65535       65535\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0 with type .v4.u16.

\n

Redefined as type .v4.u32 in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\nmov and cvt instructions may be used to read the lower 16-bits of each component of\n%nctaid.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
mov.u32  %r0,%nctaid.x;\nmov.u16  %rh,%nctaid.x;     // legacy code\n
\n
\n
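
A sketch (not from the source; registers hypothetical) of flattening a 2D grid position using %nctaid together with %ctaid, which is documented under its own entry:

\n
mov.u32    %rx, %ctaid.x;\nmov.u32    %ry, %ctaid.y;\nmov.u32    %rn, %nctaid.x;\nmad.lo.u32 %rflat, %ry, %rn, %rx;  // flat = ctaid.y * nctaid.x + ctaid.x\n
\n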
", "tooltip": "Number of CTA ids per grid.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %nctaid // Grid shape vector\n\n.sreg .u32 %nctaid.x,%nctaid.y,%nctaid.z; // Grid dimensions\n\nDescription\n\nA predefined, read-only special register initialized with the number of CTAs in each grid\n\ndimension. The %nctaid special register contains a 3D grid shape vector, with each element\n\nhaving a value of at least 1. The fourth element is unused and always returns zero.\n\nMaximum values of %nctaid.{x,y,z} are as follows:\n\n\n\n\n\n.target architecture\n\n%nctaid.x\n\n%nctaid.y\n\n%nctaid.z\n\nsm_1x, sm_20\n\n65535\n\n65535\n\n65535\n\nsm_3x, sm_5x, sm_6x, sm_7x,\n\nsm_8x, sm_9x\n\n231 -1\n\n65535\n\n65535\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0 with type .v4.u16.\n\nRedefined as type .v4.u32 in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\n\nmov and cvt instructions may be used to read the lower 16-bits of each component of\n\n%nctaid.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u32 %r0,%nctaid.x;\n\nmov.u16 %rh,%nctaid.x; // legacy code\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nctaid" }; case "neg": return { "html": "For more information, visit neg(fp) , neg(fp16) , neg(int) .

Floating Point Instructions: neg

\n\n\n

Arithmetic negate.

\n

Syntax

\n
neg{.ftz}.f32  d, a;\nneg.f64        d, a;\n
\n
\n

Description

\n

Negate the sign of a and store the result in d.

\n

Semantics

\n
d = -a;\n
\n
\n

Notes

\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

neg.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
sm_1x

neg.f64 supports subnormal numbers.

\n

neg.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
\n

NaN inputs yield an unspecified NaN. Future implementations may comply with the IEEE 754\nstandard by preserving payload and modifying only the sign bit.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

neg.f32 supported on all target architectures.

\n

neg.f64 requires sm_13 or higher.

\n

Examples

\n
neg.ftz.f32  x,f0;\n
\n
\n
\n

Half Precision Floating Point Instructions: neg

\n\n\n

Arithmetic negate.

\n

Syntax

\n
neg{.ftz}.f16    d, a;\nneg{.ftz}.f16x2  d, a;\nneg.bf16         d, a;\nneg.bf16x2       d, a;\n
\n
\n

Description

\n

Negate the sign of a and store the result in d.

\n

For .f16x2 and .bf16x2 instruction type, forms input vector by extracting half word values\nfrom the source operand. Half-word operands are then negated in parallel to produce .f16x2 or\n.bf16x2 result in destination.

\n

For .f16 instruction type, operands d and a have .f16 or .b16 type. For\n.f16x2 instruction type, operands d and a have .b32 type. For .bf16 instruction\ntype, operands d and a have .b16 type. For .bf16x2 instruction type, operands d\nand a have .b32 type.

\n

Semantics

\n
if (type == f16 || type == bf16) {\n    d = -a;\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    for (i = 0; i < 2; i++) {\n         d[i] = -fA[i];\n    }\n}\n
\n
\n

Notes

\n
\n
Subnormal numbers:

By default, subnormal numbers are supported.\nneg.ftz.{f16, f16x2} flushes subnormal inputs and results to sign-preserving zero.

\n
\n
\n

NaN inputs yield an unspecified NaN. Future implementations may comply with the IEEE 754\nstandard by preserving payload and modifying only the sign bit.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 6.0.

\n

neg.bf16 and neg.bf16x2 introduced in PTX ISA 7.0.

\n

Target ISA Notes

\n

Requires sm_53 or higher.

\n

neg.bf16 and neg.bf16x2 requires architecture sm_80 or higher.

\n

Examples

\n
neg.ftz.f16  x,f0;\nneg.bf16     x,b0;\nneg.bf16x2   x1,b1;\n
\n
\n
\n

Integer Arithmetic Instructions: neg

\n\n\n

Arithmetic negate.

\n

Syntax

\n
neg.type  d, a;\n\n.type = { .s16, .s32, .s64 };\n
\n
\n

Description

\n

Negate the sign of a and store the result in d.

\n

Semantics

\n
d = -a;\n
\n
\n

Notes

\n

Only for signed integers.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
neg.s32  r0,a;\n
\n
\n
", "tooltip": "=====Floating Point Instructions: neg\n\n\n\nArithmetic negate.\n\nSyntax\n\nneg{.ftz}.f32 d, a;\n\nneg.f64 d, a;\n\nDescription\n\nNegate the sign of a and store the result in d.\n\nSemantics\n\nd = -a;\n\nNotes\n\nSubnormal numbers:\n\nsm_20+By default, subnormal numbers are supported.\n\nneg.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nsm_1xneg.f64 supports subnormal numbers.\n\nneg.f32 flushes subnormal inputs and results to sign-preserving...\n\n=====Half Precision Floating Point Instructions: neg\n\n\n\nArithmetic negate.\n\nSyntax\n\nneg{.ftz}.f16 d, a;\n\nneg{.ftz}.f16x2 d, a;\n\nneg.bf16 d, a;\n\nneg.bf16x2 d, a;\n\nDescription\n\nNegate the sign of a and store the result in d.\n\nFor .f16x2 and .bf16x2 instruction type, forms input vector by extracting half word values\n\nfrom the source operand. Half-word operands are then negated in parallel to produce .f16x2 or\n\n.bf16x2 result in destination.\n\nFor .f...\n\n=====Integer Arithmetic Instructions: neg\n\n\n\nArithmetic negate.\n\nSyntax\n\nneg.type d, a;\n\n.type = { .s16, .s32, .s64 };\n\nDescription\n\nNegate the sign of a and store the result in d.\n\nSemantics\n\nd = -a;\n\nNotes\n\nOnly for signed integers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nneg.s32 r0,a;\n\n... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg" }; case "noreturn": return { "html": "For more information, visit noreturn .

Performance-Tuning Directives: .noreturn

\n\n\n

Indicate that the function does not return to its caller function.

\n

Syntax

\n
.noreturn\n
\n
\n

Description

\n

Indicate that the function does not return to its caller function.

\n

Semantics

\n

An optional .noreturn directive indicates that the function does not return to its caller\nfunction. The .noreturn directive can only be specified on device functions and must appear between\na .func directive and its body.

\n

The directive cannot be specified on functions which have return parameters.

\n

If a function with the .noreturn directive returns to its caller at runtime, then the\nbehavior is undefined.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 6.4.

\n

Target ISA Notes

\n

Requires sm_30 or higher.

\n

Examples

\n
.func foo .noreturn { ... }\n
\n
\n
", "tooltip": "Indicate that the function does not return to its caller function.\n\nSyntax\n\n.noreturn\n\nDescription\n\nIndicate that the function does not return to its caller function.\n\nSemantics\n\nAn optional .noreturn directive indicates that the function does not return to caller\n\nfunction. .noreturn directive can only be specified on device functions and must appear between\n\na .func directive and its body.\n\nThe directive cannot be specified on functions which have return parameters.\n\nIf a function with .noreturn directive returns to the caller function at runtime, then the\n\nbehavior is undefined.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.4.\n\nTarget ISA Notes\n\nRequires sm_30 or higher.\n\nExamples\n\n.func foo .noreturn { ... }\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-noreturn" }; case "not": return { "html": "For more information, visit not .

Logic and Shift Instructions: not

\n\n\n

Bitwise negation; one\u2019s complement.

\n

Syntax

\n
not.type d, a;\n\n.type = { .pred, .b16, .b32, .b64 };\n
\n
\n

Description

\n

Invert the bits in a.

\n

Semantics

\n
d = ~a;\n
\n
\n

Notes

\n

The size of the operands must match, but not necessarily the type.

\n

Allowed types include predicates.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
not.b32  mask,mask;\nnot.pred  p,q;\n
\n
\n
", "tooltip": "Bitwise negation; one\u2019s complement.\n\nSyntax\n\nnot.type d, a;\n\n.type = { .pred, .b16, .b32, .b64 };\n\nDescription\n\nInvert the bits in a.\n\nSemantics\n\nd = ~a;\n\nNotes\n\nThe size of the operands must match, but not necessarily the type.\n\nAllowed types include predicates.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nnot.b32 mask,mask;\n\nnot.pred p,q;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not" }; case "nsmid": return { "html": "For more information, visit nsmid .

Special Registers: %nsmid

\n\n\n

Number of SM identifiers.

\n

Syntax (predefined)

\n
.sreg .u32 %nsmid;\n
\n
\n

Description

\n

A predefined, read-only special register that returns the maximum number of SM identifiers. The SM\nidentifier numbering is not guaranteed to be contiguous, so %nsmid may be larger than the\nphysical number of SMs in the device.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

%nsmid requires sm_20 or higher.

\n

Examples

\n
mov.u32  %r, %nsmid;\n
\n
\n
", "tooltip": "Number of SM identifiers.\n\nSyntax (predefined)\n\n.sreg .u32 %nsmid;\n\nDescription\n\nA predefined, read-only special register that returns the maximum number of SM identifiers. The SM\n\nidentifier numbering is not guaranteed to be contiguous, so %nsmid may be larger than the\n\nphysical number of SMs in the device.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%nsmid requires sm_20 or higher.\n\nExamples\n\nmov.u32 %r, %nsmid;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nsmid" }; case "ntid": return { "html": "For more information, visit ntid .

Special Registers: %ntid

\n\n\n

Number of thread IDs per CTA.

\n

Syntax (predefined)

\n
.sreg .v4 .u32 %ntid;                   // CTA shape vector\n.sreg .u32 %ntid.x, %ntid.y, %ntid.z;   // CTA dimensions\n
\n
\n

Description

\n

A predefined, read-only special register initialized with the number of thread ids in each CTA\ndimension. The %ntid special register contains a 3D CTA shape vector that holds the CTA\ndimensions. CTA dimensions are non-zero; the fourth element is unused and always returns zero. The\ntotal number of threads in a CTA is (%ntid.x * %ntid.y * %ntid.z).

\n
%ntid.y == %ntid.z == 1 in 1D CTAs.\n%ntid.z == 1 in 2D CTAs.\n
\n
\n
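
A sketch of the total-thread-count formula (hypothetical registers):

\n
mov.u32    %r1, %ntid.x;\nmov.u32    %r2, %ntid.y;\nmov.u32    %r3, %ntid.z;\nmul.lo.u32 %r4, %r1, %r2;\nmul.lo.u32 %r4, %r4, %r3;   // %r4 = %ntid.x * %ntid.y * %ntid.z\n
\n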

Maximum values of %ntid.{x,y,z} are as follows:

\n
.target architecture                              %ntid.x   %ntid.y   %ntid.z\nsm_1x                                             512       512       64\nsm_20, sm_3x, sm_5x, sm_6x, sm_7x, sm_8x, sm_9x   1024      1024      64\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0 with type .v4.u16.

\n

Redefined as type .v4.u32 in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\nmov and cvt instructions may be used to read the lower 16-bits of each component of\n%ntid.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
// compute unified thread id for 2D CTA\nmov.u32  %r0,%tid.x;\nmov.u32  %h1,%tid.y;\nmov.u32  %h2,%ntid.x;\nmad.u32  %r0,%h1,%h2,%r0;\n\nmov.u16  %rh,%ntid.x;      // legacy code\n
\n
\n
", "tooltip": "Number of thread IDs per CTA.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %ntid; // CTA shape vector\n\n.sreg .u32 %ntid.x, %ntid.y, %ntid.z; // CTA dimensions\n\nDescription\n\nA predefined, read-only special register initialized with the number of thread ids in each CTA\n\ndimension. The %ntid special register contains a 3D CTA shape vector that holds the CTA\n\ndimensions. CTA dimensions are non-zero; the fourth element is unused and always returns zero. The\n\ntotal number of threads in a CTA is (%ntid.x * %ntid.y * %ntid.z).\n\n%ntid.y == %ntid.z == 1 in 1D CTAs.\n\n%ntid.z ==1 in 2D CTAs.\n\nMaximum values of %ntid.{x,y,z} are as follows:\n\n\n\n\n\n.target architecture\n\n%ntid.x\n\n%ntid.y\n\n%ntid.z\n\nsm_1x\n\n512\n\n512\n\n64\n\nsm_20, sm_3x, sm_5x, sm_6x,\n\nsm_7x, sm_8x, sm_9x\n\n1024\n\n1024\n\n64\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0 with type .v4.u16.\n\nRedefined as type .v4.u32 in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\n\nmov and cvt instructions may be used to read the lower 16-bits of each component of\n\n%ntid.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n// compute unified thread id for 2D CTA\n\nmov.u32 %r0,%tid.x;\n\nmov.u32 %h1,%tid.y;\n\nmov.u32 %h2,%ntid.x;\n\nmad.u32 %r0,%h1,%h2,%r0;\n\nmov.u16 %rh,%ntid.x; // legacy code\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-ntid" }; case "nwarpid": return { "html": "For more information, visit nwarpid .

Special Registers: %nwarpid

\n\n\n

Number of warp identifiers.

\n

Syntax (predefined)

\n
.sreg .u32 %nwarpid;\n
\n
\n

Description

\n

A predefined, read-only special register that returns the maximum number of warp identifiers.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

%nwarpid requires sm_20 or higher.

\n

Examples

\n
mov.u32  %r, %nwarpid;\n
\n
\n
", "tooltip": "Number of warp identifiers.\n\nSyntax (predefined)\n\n.sreg .u32 %nwarpid;\n\nDescription\n\nA predefined, read-only special register that returns the maximum number of warp identifiers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%nwarpid requires sm_20 or higher.\n\nExamples\n\nmov.u32 %r, %nwarpid;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nwarpid" }; case "or": return { "html": "For more information, visit or .

Logic and Shift Instructions: or

\n\n\n

Bitwise OR.

\n

Syntax

\n
or.type d, a, b;\n\n.type = { .pred, .b16, .b32, .b64 };\n
\n
\n

Description

\n

Compute the bit-wise or operation for the bits in a and b.

\n

Semantics

\n
d = a | b;\n
\n
\n

Notes

\n

The size of the operands must match, but not necessarily the type.

\n

Allowed types include predicate registers.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
or.b32  mask,mask,0x00010001;\nor.pred  p,q,r;\n
\n
\n
", "tooltip": "Biwise OR.\n\nSyntax\n\nor.type d, a, b;\n\n.type = { .pred, .b16, .b32, .b64 };\n\nDescription\n\nCompute the bit-wise or operation for the bits in a and b.\n\nSemantics\n\nd = a | b;\n\nNotes\n\nThe size of the operands must match, but not necessarily the type.\n\nAllowed types include predicate registers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nor.b32 mask mask,0x00010001\n\nor.pred p,q,r;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or" }; case "pm0": return { "html": "For more information, visit pm0..%pm7 .

Special Registers: %pm0..%pm7

\n\n\n

Performance monitoring counters.

\n

Syntax (predefined)

\n
.sreg .u32 %pm<8>;\n
\n
\n

Description

\n

Special registers %pm0..%pm7 are unsigned 32-bit read-only performance monitor counters. Their\nbehavior is currently undefined.

\n

PTX ISA Notes

\n

%pm0..%pm3 introduced in PTX ISA version 1.3.

\n

%pm4..%pm7 introduced in PTX ISA version 3.0.

\n

Target ISA Notes

\n

%pm0..%pm3 supported on all target architectures.

\n

%pm4..%pm7 require sm_20 or higher.

\n

Examples

\n
mov.u32  r1,%pm0;\nmov.u32  r1,%pm7;\n
\n
\n
", "tooltip": "Performance monitoring counters.\n\nSyntax (predefined)\n\n.sreg .u32 %pm<8>;\n\nDescription\n\nSpecial registers %pm0..%pm7 are unsigned 32-bit read-only performance monitor counters. Their\n\nbehavior is currently undefined.\n\nPTX ISA Notes\n\n%pm0..%pm3 introduced in PTX ISA version 1.3.\n\n%pm4..%pm7 introduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\n%pm0..%pm3 supported on all target architectures.\n\n%pm4..%pm7 require sm_20 or higher.\n\nExamples\n\nmov.u32 r1,%pm0;\n\nmov.u32 r1,%pm7;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-pm0-pm7" }; case "pm0_64": return { "html": "For more information, visit pm0_64..%pm7_64 .

Special Registers: %pm0_64..%pm7_64

\n\n\n

64-bit performance monitoring counters.

\n

Syntax (predefined)

\n
.sreg .u64 %pm0_64;\n.sreg .u64 %pm1_64;\n.sreg .u64 %pm2_64;\n.sreg .u64 %pm3_64;\n.sreg .u64 %pm4_64;\n.sreg .u64 %pm5_64;\n.sreg .u64 %pm6_64;\n.sreg .u64 %pm7_64;\n
\n
\n

Description

\n

Special registers %pm0_64..%pm7_64 are unsigned 64-bit read-only performance monitor\ncounters. Their behavior is currently undefined.

\n

Notes

\n

The lower 32 bits of %pm0_64..%pm7_64 are identical to %pm0..%pm7.

\n

PTX ISA Notes

\n

%pm0_64..%pm7_64 introduced in PTX ISA version 4.0.

\n

Target ISA Notes

\n

%pm0_64..%pm7_64 require sm_50 or higher.

\n

Examples

\n
mov.u64  r1,%pm0_64;\nmov.u64  r1,%pm7_64;\n
\n
\n
", "tooltip": "64 bit Performance monitoring counters.\n\nSyntax (predefined)\n\n.sreg .u64 %pm0_64;\n\n.sreg .u64 %pm1_64;\n\n.sreg .u64 %pm2_64;\n\n.sreg .u64 %pm3_64;\n\n.sreg .u64 %pm4_64;\n\n.sreg .u64 %pm5_64;\n\n.sreg .u64 %pm6_64;\n\n.sreg .u64 %pm7_64;\n\nDescription\n\nSpecial registers %pm0_64..%pm7_64 are unsigned 64-bit read-only performance monitor\n\ncounters. Their behavior is currently undefined.\n\nNotes\n\nThe lower 32bits of %pm0_64..%pm7_64 are identical to %pm0..%pm7.\n\nPTX ISA Notes\n\n%pm0_64..%pm7_64 introduced in PTX ISA version 4.0.\n\nTarget ISA Notes\n\n%pm0_64..%pm7_64 require sm_50 or higher.\n\nExamples\n\nmov.u32 r1,%pm0_64;\n\nmov.u32 r1,%pm7_64;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-pm0-64-pm7-64" }; case "pmevent": return { "html": "For more information, visit pmevent .

Miscellaneous Instructions: pmevent

\n\n\n

Trigger one or more Performance Monitor events.

\n

Syntax

\n
pmevent       a;    // trigger a single performance monitor event\npmevent.mask  a;    // trigger one or more performance monitor events\n
\n
\n

Description

\n

Triggers one or more of a fixed number of performance monitor events, with event index or mask\nspecified by immediate operand a.

\n

pmevent (without modifier .mask) triggers a single performance monitor event indexed by\nimmediate operand a, in the range 0..15.

\n

pmevent.mask triggers one or more of the performance monitor events. Each bit in the 16-bit\nimmediate operand a controls an event.

\n

Programmatic performance monitor events may be combined with other hardware events using Boolean\nfunctions to increment one of the four performance counters. The relationship between events and\ncounters is programmed via API calls from the host.

\n

Notes

\n

Currently, there are sixteen performance monitor events, numbered 0 through 15.

\n

PTX ISA Notes

\n

pmevent introduced in PTX ISA version 1.4.

\n

pmevent.mask introduced in PTX ISA version 3.0.

\n

Target ISA Notes

\n

pmevent supported on all target architectures.

\n

pmevent.mask requires sm_20 or higher.

\n

Examples

\n
    pmevent      1;\n@p  pmevent      7;\n@q  pmevent.mask 0xff;\n
\n
\n
", "tooltip": "Trigger one or more Performance Monitor events.\n\nSyntax\n\npmevent a; // trigger a single performance monitor event\n\npmevent.mask a; // trigger one or more performance monitor events\n\nDescription\n\nTriggers one or more of a fixed number of performance monitor events, with event index or mask\n\nspecified by immediate operand a.\n\npmevent (without modifier .mask) triggers a single performance monitor event indexed by\n\nimmediate operand a, in the range 0..15.\n\npmevent.mask triggers one or more of the performance monitor events. Each bit in the 16-bit\n\nimmediate operand a controls an event.\n\nProgrammatic performance moniter events may be combined with other hardware events using Boolean\n\nfunctions to increment one of the four performance counters. The relationship between events and\n\ncounters is programmed via API calls from the host.\n\nNotes\n\nCurrently, there are sixteen performance monitor events, numbered 0 through 15.\n\nPTX ISA Notes\n\npmevent introduced in PTX ISA version 1.4.\n\npmevent.mask introduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\npmevent supported on all target architectures.\n\npmevent.mask requires sm_20 or higher.\n\nExamples\n\n pmevent 1;\n\n@p pmevent 7;\n\n@q pmevent.mask 0xff;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent" }; case "popc": return { "html": "For more information, visit popc(int) .

Integer Arithmetic Instructions: popc

\n\n\n

Population count.

\n

Syntax

\n
popc.type  d, a;\n\n.type = { .b32, .b64 };\n
\n
\n

Description

\n

Count the number of one bits in a and place the resulting population count in 32-bit\ndestination register d. Operand a has the instruction type and destination d has type\n.u32.

\n

Semantics

\n
.u32  d = 0;\nwhile (a != 0) {\n   if (a & 0x1)  d++;\n   a = a >> 1;\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

popc requires sm_20 or higher.

\n

Examples

\n
popc.b32  d, a;\npopc.b64  cnt, X;  // cnt is .u32\n
\n
\n
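
A common idiom (a sketch, not from the source) pairs popc with a warp ballot to count the lanes for which a predicate holds:

\n
vote.sync.ballot.b32 bits, p, 0xffffffff;  // one bit per lane where p is true\npopc.b32             cnt, bits;            // cnt = number of lanes with p set\n
\n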
", "tooltip": "Population count.\n\nSyntax\n\npopc.type d, a;\n\n.type = { .b32, .b64 };\n\nDescription\n\nCount the number of one bits in a and place the resulting population count in 32-bit\n\ndestination register d. Operand a has the instruction type and destination d has type\n\n.u32.\n\nSemantics\n\n.u32 d = 0;\n\nwhile (a != 0) {\n\n if (a & 0x1) d++;\n\n a = a >> 1;\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\npopc requires sm_20 or higher.\n\nExamples\n\npopc.b32 d, a;\n\npopc.b64 cnt, X; // cnt is .u32\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc" }; case "pragma": return { "html": "For more information, visit pragma .

Performance-Tuning Directives: .pragma

\n\n\n

Pass directives to PTX backend compiler.

\n

Syntax

\n
.pragma list-of-strings ;\n
\n
\n

Description

\n

Pass module-scoped, entry-scoped, or statement-level directives to the PTX backend compiler.

\n

The .pragma directive may occur at module-scope, at entry-scope, or at statement-level.

\n

Semantics

\n

The interpretation of .pragma directive strings is implementation-specific and has no impact on\nPTX semantics. See Descriptions of .pragma Strings for\ndescriptions of the pragma strings defined in ptxas.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
.pragma \"nounroll\";    // disable unrolling in backend\n\n// disable unrolling for current kernel\n.entry foo .pragma \"nounroll\"; { ... }\n
\n
\n
", "tooltip": "Pass directives to PTX backend compiler.\n\nSyntax\n\n.pragma list-of-strings ;\n\nDescription\n\nPass module-scoped, entry-scoped, or statement-level directives to the PTX backend compiler.\n\nThe .pragma directive may occur at module-scope, at entry-scope, or at statement-level.\n\nSemantics\n\nThe interpretation of .pragma directive strings is implementation-specific and has no impact on\n\nPTX semantics. See Descriptions of .pragma Strings for\n\ndescriptions of the pragma strings defined in ptxas.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.pragma \"nounroll\"; // disable unrolling in backend\n\n// disable unrolling for current kernel\n\n.entry foo .pragma \"nounroll\"; { ... }\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-pragma" }; case "prefetch": return { "html": "For more information, visit prefetch .

Data Movement and Conversion Instructions: prefetch, prefetchu

\n\n\n

Prefetch line containing a generic address at a specified level of memory hierarchy, in specified\nstate space.

\n

Syntax

\n
prefetch{.space}.level                    [a];   // prefetch to data cache\nprefetch.global.level::eviction_priority  [a];   // prefetch to data cache\n\nprefetchu.L1  [a];             // prefetch to uniform cache\n\nprefetch{.tensormap_space}.tensormap [a];  // prefetch the tensormap\n\n.space =                    { .global, .local };\n.level =                    { .L1, .L2 };\n.level::eviction_priority = { .L2::evict_last, .L2::evict_normal };\n.tensormap_space =          { .const, .param };\n
\n
\n

Description

\n

The prefetch instruction brings the cache line containing the specified address in global or\nlocal memory state space into the specified cache level.

\n

If the .tensormap qualifier is specified then the prefetch instruction brings the cache line\ncontaining the specified address in the .const or .param memory state space for subsequent\nuse by the cp.async.bulk.tensor instruction.

\n

If no state space is given, the prefetch uses Generic Addressing.

\n

Optionally, the eviction priority to be applied on the prefetched cache line can be specified by the\nmodifier .level::eviction_priority.

\n

Supported addressing modes for operand a and alignment requirements are described in Addresses\nas Operands.

\n

The prefetchu instruction brings the cache line containing the specified generic address into\nthe specified uniform cache level.

\n

A prefetch to a shared memory location performs no operation.

\n

A prefetch into the uniform cache requires a generic address, and no operation occurs if the\naddress maps to a const, local, or shared memory location.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Support for .level::eviction_priority qualifier introduced in PTX ISA version 7.4.

\n

Support for the .tensormap qualifier is introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

prefetch and prefetchu require sm_20 or higher.

\n

Support for .level::eviction_priority qualifier requires sm_80 or higher.

\n

Support for the .tensormap qualifier requires sm_90 or higher.

\n

Examples

\n
prefetch.global.L1             [ptr];\nprefetch.global.L2::evict_last [ptr];\nprefetchu.L1  [addr];\nprefetch.global.tensormap      [ptr];\n
\n
\n
", "tooltip": "Prefetch line containing a generic address at a specified level of memory hierarchy, in specified\n\nstate space.\n\nSyntax\n\nprefetch{.space}.level [a]; // prefetch to data cache\n\nprefetch.global.level::eviction_priority [a]; // prefetch to data cache\n\nprefetchu.L1 [a]; // prefetch to uniform cache\n\nprefetch{.tensormap_space}.tensormap [a]; // prefetch the tensormap\n\n.space = { .global, .local };\n\n.level = { .L1, .L2 };\n\n.level::eviction_priority = { .L2::evict_last, .L2::evict_normal };\n\n.tensormap_space = { .const, .param };\n\nDescription\n\nThe prefetch instruction brings the cache line containing the specified address in global or\n\nlocal memory state space into the specified cache level.\n\nIf the .tensormap qualifier is specified then the prefetch instruction brings the cache line\n\ncontaining the specified address in the .const or .param memory state space for subsequent\n\nuse by the cp.async.bulk.tensor instruction.\n\nIf no state space is given, the prefetch uses Generic Addressing.\n\nOptionally, the eviction priority to be applied on the prefetched cache line can be specified by the\n\nmodifier .level::eviction_priority.\n\nSupported addressing modes for operand a and alignment requirements are described in Addresses\n\nas Operands\n\nThe prefetchu instruction brings the cache line containing the specified generic address into\n\nthe specified uniform cache level.\n\nA prefetch to a shared memory location performs no operation.\n\nA prefetch into the uniform cache requires a generic address, and no operation occurs if the\n\naddress maps to a const, local, or shared memory location.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nSupport for .level::eviction_priority qualifier introduced in PTX ISA version 7.4.\n\nSupport for the .tensormap qualifier is introduced in PTX ISA version 8.0.\n\nTarget ISA Notes\n\nprefetch and prefetchu require sm_20 or higher.\n\nSupport for .level::eviction_priority qualifier requires sm_80 or higher.\n\nSupport for the .tensormap qualifier requires sm_90 or higher.\n\nExamples\n\nprefetch.global.L1 [ptr];\n\nprefetch.global.L2::evict_last [ptr];\n\nprefetchu.L1 [addr];\n\nprefetch.global.tensormap [ptr];\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu" }; case "prefetchu": return { "html": "For more information, visit prefetchu .

Data Movement and Conversion Instructions: prefetch, prefetchu

\n\n\n

Prefetch the line containing a generic address at a specified level of the memory hierarchy, in a\nspecified state space.

\n

Syntax

\n
prefetch{.space}.level                    [a];   // prefetch to data cache\nprefetch.global.level::eviction_priority  [a];   // prefetch to data cache\n\nprefetchu.L1  [a];             // prefetch to uniform cache\n\nprefetch{.tensormap_space}.tensormap [a];  // prefetch the tensormap\n\n.space =                    { .global, .local };\n.level =                    { .L1, .L2 };\n.level::eviction_priority = { .L2::evict_last, .L2::evict_normal };\n.tensormap_space =          { .const, .param };\n
\n
\n

Description

\n

The prefetch instruction brings the cache line containing the specified address in global or\nlocal memory state space into the specified cache level.

\n

If the .tensormap qualifier is specified then the prefetch instruction brings the cache line\ncontaining the specified address in the .const or .param memory state space for subsequent\nuse by the cp.async.bulk.tensor instruction.

\n

If no state space is given, the prefetch uses Generic Addressing.

\n

Optionally, the eviction priority to be applied on the prefetched cache line can be specified by the\nmodifier .level::eviction_priority.

\n

Supported addressing modes for operand a and alignment requirements are described in Addresses\nas Operands.

\n

The prefetchu instruction brings the cache line containing the specified generic address into\nthe specified uniform cache level.

\n

A prefetch to a shared memory location performs no operation.

\n

A prefetch into the uniform cache requires a generic address, and no operation occurs if the\naddress maps to a const, local, or shared memory location.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Support for .level::eviction_priority qualifier introduced in PTX ISA version 7.4.

\n

Support for the .tensormap qualifier is introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

prefetch and prefetchu require sm_20 or higher.

\n

Support for .level::eviction_priority qualifier requires sm_80 or higher.

\n

Support for the .tensormap qualifier requires sm_90 or higher.

\n

Examples

\n
prefetch.global.L1             [ptr];\nprefetch.global.L2::evict_last [ptr];\nprefetchu.L1  [addr];\nprefetch.global.tensormap      [ptr];\n
\n
\n
", "tooltip": "Prefetch line containing a generic address at a specified level of memory hierarchy, in specified\n\nstate space.\n\nSyntax\n\nprefetch{.space}.level [a]; // prefetch to data cache\n\nprefetch.global.level::eviction_priority [a]; // prefetch to data cache\n\nprefetchu.L1 [a]; // prefetch to uniform cache\n\nprefetch{.tensormap_space}.tensormap [a]; // prefetch the tensormap\n\n.space = { .global, .local };\n\n.level = { .L1, .L2 };\n\n.level::eviction_priority = { .L2::evict_last, .L2::evict_normal };\n\n.tensormap_space = { .const, .param };\n\nDescription\n\nThe prefetch instruction brings the cache line containing the specified address in global or\n\nlocal memory state space into the specified cache level.\n\nIf the .tensormap qualifier is specified then the prefetch instruction brings the cache line\n\ncontaining the specified address in the .const or .param memory state space for subsequent\n\nuse by the cp.async.bulk.tensor instruction.\n\nIf no state space is given, the prefetch uses Generic Addressing.\n\nOptionally, the eviction priority to be applied on the prefetched cache line can be specified by the\n\nmodifier .level::eviction_priority.\n\nSupported addressing modes for operand a and alignment requirements are described in Addresses\n\nas Operands\n\nThe prefetchu instruction brings the cache line containing the specified generic address into\n\nthe specified uniform cache level.\n\nA prefetch to a shared memory location performs no operation.\n\nA prefetch into the uniform cache requires a generic address, and no operation occurs if the\n\naddress maps to a const, local, or shared memory location.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nSupport for .level::eviction_priority qualifier introduced in PTX ISA version 7.4.\n\nSupport for the .tensormap qualifier is introduced in PTX ISA version 8.0.\n\nTarget ISA Notes\n\nprefetch and prefetchu require sm_20 or higher.\n\nSupport for .level::eviction_priority qualifier requires sm_80 or higher.\n\nSupport for the .tensormap qualifier requires sm_90 or higher.\n\nExamples\n\nprefetch.global.L1 [ptr];\n\nprefetch.global.L2::evict_last [ptr];\n\nprefetchu.L1 [addr];\n\nprefetch.global.tensormap [ptr];\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu" }; case "prmt": return { "html": "For more information, visit prmt .

Data Movement and Conversion Instructions: prmt

\n\n\n

Permute bytes from register pair.

\n

Syntax

\n
prmt.b32{.mode}  d, a, b, c;\n\n.mode = { .f4e, .b4e, .rc8, .ecl, .ecr, .rc16 };\n
\n
\n

Description

\n

Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination\nregister.

\n

In the generic form (no mode specified), the permute control consists of four 4-bit selection\nvalues. The bytes in the two source registers are numbered from 0 to 7: {b, a} = {{b7, b6, b5,\nb4}, {b3, b2, b1, b0}}. For each byte in the target register, a 4-bit selection value is defined.

\n

The 3 lsbs of the selection value specify which of the 8 source bytes should be moved into the\ntarget position. The msb defines if the byte value should be copied, or if the sign (msb of the\nbyte) should be replicated over all 8 bits of the target position (sign extend of the byte value);\nmsb=0 means copy the literal value; msb=1 means replicate the sign. Note that the sign\nextension is only performed as part of generic form.

\n

Thus, the four 4-bit values fully specify an arbitrary byte permute, as a 16b permute code.

\n
default mode   d.b3 source select   d.b2 source select   d.b1 source select   d.b0 source select\nindex          c[15:12]             c[11:8]              c[7:4]               c[3:0]\n
\n
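
As a worked sketch of the generic form (operand values assumed, not taken from the specification), control word 0x6420 selects the even-numbered source bytes with no sign replication:

\n
mov.b32  %r1, 0x33221100;       // a: source bytes 3..0\nmov.b32  %r2, 0x77665544;       // b: source bytes 7..4\nmov.b32  %r3, 0x6420;           // select bytes 6, 4, 2, 0\nprmt.b32 %r4, %r1, %r2, %r3;    // %r4 = 0x66442200\n
\n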

The more specialized form of the permute control uses the two lsb\u2019s of operand c (which is\ntypically an address pointer) to control the byte extraction.

\n
mode                      c[1:0]  d.b3 src  d.b2 src  d.b1 src  d.b0 src\nf4e (forward 4 extract)     0        3         2         1         0\n                            1        4         3         2         1\n                            2        5         4         3         2\n                            3        6         5         4         3\nb4e (backward 4 extract)    0        5         6         7         0\n                            1        6         7         0         1\n                            2        7         0         1         2\n                            3        0         1         2         3\nrc8 (replicate 8)           0        0         0         0         0\n                            1        1         1         1         1\n                            2        2         2         2         2\n                            3        3         3         3         3\necl (edge clamp left)       0        3         2         1         0\n                            1        3         2         1         1\n                            2        3         2         2         2\n                            3        3         3         3         3\necr (edge clamp right)      0        0         0         0         0\n                            1        1         1         1         0\n                            2        2         2         1         0\n                            3        3         2         1         0\nrc16 (replicate 16)         0        1         0         1         0\n                            1        3         2         3         2\n                            2        1         0         1         0\n                            3        3         2         3         2\n
\n

Semantics

\n
tmp64 = (b<<32) | a;  // create 8 byte source\n\nif ( ! mode ) {\n   ctl[0] = (c >>  0) & 0xf;\n   ctl[1] = (c >>  4) & 0xf;\n   ctl[2] = (c >>  8) & 0xf;\n   ctl[3] = (c >> 12) & 0xf;\n} else {\n   ctl[0] = ctl[1] = ctl[2] = ctl[3] = (c >>  0) & 0x3;\n}\n\ntmp[07:00] = ReadByte( mode, ctl[0], tmp64 );\ntmp[15:08] = ReadByte( mode, ctl[1], tmp64 );\ntmp[23:16] = ReadByte( mode, ctl[2], tmp64 );\ntmp[31:24] = ReadByte( mode, ctl[3], tmp64 );\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

prmt requires sm_20 or higher.

\n

Examples

\n
prmt.b32      r1, r2, r3, r4;\nprmt.b32.f4e  r1, r2, r3, r4;\n
\n
\n
", "tooltip": "Permute bytes from register pair.\n\nSyntax\n\nprmt.b32{.mode} d, a, b, c;\n\n.mode = { .f4e, .b4e, .rc8, .ecl, .ecr, .rc16 };\n\nDescription\n\nPick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination\n\nregister.\n\nIn the generic form (no mode specified), the permute control consists of four 4-bit selection\n\nvalues. The bytes in the two source registers are numbered from 0 to 7: {b, a} = {{b7, b6, b5,\n\nb4}, {b3, b2, b1, b0}}. For each byte in the target register, a 4-bit selection value is defined.\n\nThe 3 lsbs of the selection value specify which of the 8 source bytes should be moved into the\n\ntarget position. The msb defines if the byte value should be copied, or if the sign (msb of the\n\nbyte) should be replicated over all 8 bits of the target position (sign extend of the byte value);\n\nmsb=0 means copy the literal value; msb=1 means replicate the sign. Note that the sign\n\nextension is only performed as part of generic form.\n\nThus, the four 4-bit values fully specify an arbitrary byte permute, as a 16b permute code.\n\n\n\n\n\ndefault mode\n\nd.b3\n\nsource select\n\nd.b2\n\nsource select\n\nd.b1\n\nsource select\n\nd.b0\n\nsource select\n\n\n\nindex\n\nc[15:12]\n\nc[11:8]\n\nc[7:4]\n\nc[3:0]\n\nThe more specialized form of the permute control uses the two lsb\u2019s of operand c (which is\n\ntypically an address pointer) to control the byte extraction.\n\n\n\n\n\nmode\n\nselector\n\nc[1:0]\n\nd.b3\n\nsource\n\nd.b2\n\nsource\n\nd.b1\n\nsource\n\nd.b0\n\nsource\n\n\n\nf4e (forward 4 extract)\n\n0\n\n3\n\n2\n\n1\n\n0\n\n1\n\n4\n\n3\n\n2\n\n1\n\n2\n\n5\n\n4\n\n3\n\n2\n\n3\n\n6\n\n5\n\n4\n\n3\n\nb4e (backward 4 extract)\n\n0\n\n5\n\n6\n\n7\n\n0\n\n1\n\n6\n\n7\n\n0\n\n1\n\n2\n\n7\n\n0\n\n1\n\n2\n\n3\n\n0\n\n1\n\n2\n\n3\n\nrc8 (replicate 8)\n\n0\n\n0\n\n0\n\n0\n\n0\n\n1\n\n1\n\n1\n\n1\n\n1\n\n2\n\n2\n\n2\n\n2\n\n2\n\n3\n\n3\n\n3\n\n3\n\n3\n\necl (edge clamp left)\n\n0\n\n3\n\n2\n\n1\n\n0\n\n1\n\n3\n\n2\n\n1\n\n1\n\n2\n\n3\n\n2\n\n2\n\n2\n\n3\n\n3\n\n3\n\n3\n\n3\n\necr (edge clamp right)\n\n0\n\n0\n\n0\n\n0\n\n0\n\n1\n\n1\n\n1\n\n1\n\n0\n\n2\n\n2\n\n2\n\n1\n\n0\n\n3\n\n3\n\n2\n\n1\n\n0\n\nrc16 (replicate 16)\n\n0\n\n1\n\n0\n\n1\n\n0\n\n1\n\n3\n\n2\n\n3\n\n2\n\n2\n\n1\n\n0\n\n1\n\n0\n\n3\n\n3\n\n2\n\n3\n\n2\n\nSemantics\n\ntmp64 = (b<<32) | a; // create 8 byte source\n\nif ( ! mode ) {\n\n ctl[0] = (c >> 0) & 0xf;\n\n ctl[1] = (c >> 4) & 0xf;\n\n ctl[2] = (c >> 8) & 0xf;\n\n ctl[3] = (c >> 12) & 0xf;\n\n} else {\n\n ctl[0] = ctl[1] = ctl[2] = ctl[3] = (c >> 0) & 0x3;\n\n}\n\ntmp[07:00] = ReadByte( mode, ctl[0], tmp64 );\n\ntmp[15:08] = ReadByte( mode, ctl[1], tmp64 );\n\ntmp[23:16] = ReadByte( mode, ctl[2], tmp64 );\n\ntmp[31:24] = ReadByte( mode, ctl[3], tmp64 );\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nprmt requires sm_20 or higher.\n\nExamples\n\nprmt.b32 r1, r2, r3, r4;\n\nprmt.b32.f4e r1, r2, r3, r4;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt" }; case "rcp": return { "html": "For more information, visit rcp(fp) , rcp.approx.ftz.f64(fp) .

Floating Point Instructions: rcp

\n\n\n

Take the reciprocal of a value.

\n

Syntax

\n
rcp.approx{.ftz}.f32  d, a;  // fast, approximate reciprocal\nrcp.rnd{.ftz}.f32     d, a;  // IEEE 754 compliant rounding\nrcp.rnd.f64           d, a;  // IEEE 754 compliant rounding\n\n.rnd = { .rn, .rz, .rm, .rp };\n
\n
\n

Description

\n

Compute 1/a, store result in d.

\n

Semantics

\n
d = 1 / a;\n
\n
\n

Notes

\n

Fast, approximate single-precision reciprocal:

\n

rcp.approx.f32 implements a fast approximation to the reciprocal. The maximum absolute error is 2-23.0 over the range 1.0-2.0.

\n
Input        Result\n-Inf         -0.0\n-subnormal   -Inf\n-0.0         -Inf\n+0.0         +Inf\n+subnormal   +Inf\n+Inf         +0.0\nNaN          NaN\n
\n
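
A common follow-up to the approximate reciprocal, shown here as a sketch with assumed register names (the refinement is not mandated by the ISA), is one Newton-Raphson step, x1 = x0 * (2 - a * x0):

\n
rcp.approx.f32  %f1, %f0;                   // x0 ~ 1/a\nneg.f32         %f2, %f0;                   // -a\nfma.rn.f32      %f2, %f2, %f1, 0f40000000;  // 2 - a*x0  (0f40000000 = 2.0)\nmul.f32         %f1, %f1, %f2;              // refined estimate x1\n
\n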

Reciprocal with IEEE 754 compliant rounding:

\n

Rounding modifiers (no default):

\n
\n
.rn

mantissa LSB rounds to nearest even

\n
\n
.rz

mantissa LSB rounds towards zero

\n
\n
.rm

mantissa LSB rounds towards negative infinity

\n
\n
.rp

mantissa LSB rounds towards positive infinity

\n
\n
\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

rcp.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
sm_1x

rcp.f64 supports subnormal numbers.

\n

rcp.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
\n

PTX ISA Notes

\n

rcp.f32 and rcp.f64 introduced in PTX ISA version 1.0. rcp.rn.f64 and explicit modifiers\n.approx and .ftz were introduced in PTX ISA version 1.4. General rounding modifiers were\nadded in PTX ISA version 2.0.

\n

For PTX ISA version 1.4 and later, one of .approx or .rnd is required.

\n

For PTX ISA versions 1.0 through 1.3, rcp.f32 defaults to rcp.approx.ftz.f32, and\nrcp.f64 defaults to rcp.rn.f64.

\n

Target ISA Notes

\n

rcp.approx.f32 supported on all target architectures.

\n

rcp.rnd.f32 requires sm_20 or higher.

\n

rcp.rn.f64 requires sm_13 or higher, or .target map_f64_to_f32.

\n

rcp.{rz,rm,rp}.f64 requires sm_20 or higher.

\n

Examples

\n
rcp.approx.ftz.f32  ri,r;\nrcp.rn.ftz.f32      xi,x;\nrcp.rn.f64          xi,x;\n
\n
\n
\n

Floating Point Instructions: rcp.approx.ftz.f64

\n\n\n

Compute a fast, gross approximation to the reciprocal of a value.

\n

Syntax

\n
rcp.approx.ftz.f64  d, a;\n
\n
\n

Description

\n

Compute a fast, gross approximation to the reciprocal as follows:

\n
\n
  1. extract the most-significant 32 bits of .f64 operand a in 1.11.20 IEEE floating-point\nformat (i.e., ignore the least-significant 32 bits of a),
  2. compute an approximate .f64 reciprocal of this value using the most-significant 20 bits of\nthe mantissa of operand a,
  3. place the resulting 32 bits in 1.11.20 IEEE floating-point format in the most-significant\n32 bits of destination d, and
  4. zero the least-significant 32 mantissa bits of the .f64 destination d.
\n

Semantics

\n
tmp = a[63:32]; // upper word of a, 1.11.20 format\nd[63:32] = 1.0 / tmp;\nd[31:0] = 0x00000000;\n
\n
\n

Notes

\n

rcp.approx.ftz.f64 implements a fast, gross approximation to reciprocal.

\n
Input a[63:32]   Result d[63:32]\n-Inf             -0.0\n-subnormal       -Inf\n-0.0             -Inf\n+0.0             +Inf\n+subnormal       +Inf\n+Inf             +0.0\nNaN              NaN\n
\n

Input NaNs map to a canonical NaN with encoding 0x7fffffff00000000.

\n

Subnormal inputs and results are flushed to sign-preserving zero.

\n

PTX ISA Notes

\n

rcp.approx.ftz.f64 introduced in PTX ISA version 2.1.

\n

Target ISA Notes

\n

rcp.approx.ftz.f64 requires sm_20 or higher.

\n

Examples

\n
rcp.ftz.f64  xi,x;\n
\n
\n
", "tooltip": "=====Floating Point Instructions: rcp\n\n\n\nTake the reciprocal of a value.\n\nSyntax\n\nrcp.approx{.ftz}.f32 d, a; // fast, approximate reciprocal\n\nrcp.rnd{.ftz}.f32 d, a; // IEEE 754 compliant rounding\n\nrcp.rnd.f64 d, a; // IEEE 754 compliant rounding\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nCompute 1/a, store result in d.\n\nSemantics\n\nd = 1 / a;\n\nNotes\n\nFast, approximate single-precision reciprocal:\n\nrcp.approx.f32 implements a fas...\n\n=====Floating Point Instructions: rcp.approx.ftz.f64\n\n\n\nCompute a fast, gross approximation to the reciprocal of a value.\n\nSyntax\n\nrcp.approx.ftz.f64 d, a;\n\nDescription\n\nCompute a fast, gross approximation to the reciprocal as follows:\n\nextract the most-significant 32 bits of .f64 operand a in 1.11.20 IEEE floating-point\n\nformat (i.e., ignore the least-significant 32 bits of a),\n\ncompute an approximate .f64 reciprocal of this value using the most-significant... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp" }; case "red": return { "html": "For more information, visit red , red.async .

Parallel Synchronization and Communication Instructions: red

\n\n\n

Reduction operations on global and shared memory.

\n

Syntax

\n

Reduction operation with scalar type:

\n
red{.sem}{.scope}{.space}.op{.level::cache_hint}.type          [a], b{, cache-policy};\n\nred{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16    [a], b{, cache-policy};\n\nred{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16x2  [a], b{, cache-policy};\n\nred{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.bf16\n                                                      [a], b {, cache-policy};\n\nred{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.bf16x2\n                                                      [a], b {, cache-policy};\n\n.space =              { .global, .shared{::cta, ::cluster} };\n.sem =                {.relaxed, .release};\n.scope =              {.cta, .cluster, .gpu, .sys};\n\n.op =                 { .and, .or, .xor,\n                        .add, .inc, .dec,\n                        .min, .max };\n.level::cache_hint =  { .L2::cache_hint };\n.type =               { .b32, .b64, .u32, .u64, .s32, .s64, .f32, .f64 };\n
\n
\n

Reduction operation with vector type:

\n
red{.sem}{.scope}{.global}.add{.level::cache_hint}.vec_32_bit.f32 [a], b{, cache-policy};\nred{.sem}{.scope}{.global}.op.noftz{.level::cache_hint}.vec_16_bit.half_word_type [a], b{, cache-policy};\nred{.sem}{.scope}{.global}.op.noftz{.level::cache_hint}.vec_32_bit.packed_type [a], b{, cache-policy};\n\n.sem =                { .relaxed, .release };\n.scope =              { .cta, .gpu, .sys };\n.op =                 { .add, .min, .max };\n.half_word_type =     { .f16, .bf16 };\n.packed_type =        { .f16x2, .bf16x2 };\n.vec_16_bit =         { .v2, .v4, .v8 };\n.vec_32_bit =         { .v2, .v4 };\n.level::cache_hint =  { .L2::cache_hint };\n
\n
\n

Description

\n

Performs a reduction operation with operand b and the value in location a, and stores the\nresult of the specified operation at location a, overwriting the original value. Operand a\nspecifies a location in the specified state space. If no state space is given, perform the memory\naccesses using Generic Addressing. red with scalar type may\nbe used only with .global and .shared spaces and with generic addressing, where the address\npoints to .global or .shared space. red with vector type may be used only with\n.global space and with generic addressing where the address points to .global space.

\n

For red with vector type, operand b is a brace-enclosed vector expression whose size\nequals the size of the vector qualifier.

\n

If no sub-qualifier is specified with .shared state space, then ::cta is assumed by default.

\n

The optional .sem qualifier specifies a memory synchronizing effect as described in the Memory\nConsistency Model. If the .sem qualifier is absent,\n.relaxed is assumed by default.

\n

The optional .scope qualifier specifies the set of threads that can directly observe the memory\nsynchronizing effect of this operation, as described in the Memory Consistency Model. If the .scope qualifier is absent, .gpu scope is\nassumed by default.

\n

For red with vector type, the supported combinations of vector qualifier and type, and the\nreduction operations allowed on those combinations, are shown in the following table:

\n
Vector qualifier   .f16/.bf16         .f16x2/.bf16x2     .f32\n.v2                .add, .min, .max   .add, .min, .max   .add\n.v4                .add, .min, .max   .add, .min, .max   .add\n.v8                .add, .min, .max   Not supported      Not supported\n
\n

Two atomic operations {atom or red} are performed atomically with respect to each other only\nif each operation specifies a scope that includes the other. When this condition is not met, each\noperation observes the other operation being performed as if it were split into a read followed by a\ndependent write.

\n

A red instruction on a packed or vector type accesses adjacent scalar elements in memory. In\nthat case, atomicity is guaranteed separately for each individual scalar element; the entire\nred is not guaranteed to be atomic as a single access.

\n

For sm_6x and earlier architectures, red operations on .shared state space do not\nguarantee atomicity with respect to normal store instructions to the same address. It is the\nprogrammer\u2019s responsibility to guarantee correctness of programs that use shared memory reduction\ninstructions, e.g., by inserting barriers between normal stores and reduction operations to a common\naddress, or by using atom.exch to store to locations accessed by other reduction operations.

\n
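
A minimal sketch of the barrier discipline described above; the shared variable counter and the register names are placeholders, not part of the specification:

\n
mov.u32        %r0, 0;\nst.shared.u32  [counter], %r0;       // ordinary store initializes the counter\nbar.sync       0;                    // order the store before any reduction\nred.shared.add.u32  [counter], 1;    // each thread contributes atomically\nbar.sync       0;\nld.shared.u32  %r1, [counter];       // read back the accumulated value\n
\n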

Supported addressing modes for operand a and alignment requirements are described in Addresses\nas Operands.

\n

The bit-size operations are .and, .or, and .xor.

\n

The integer operations are .add, .inc, .dec, .min, .max. The .inc and\n.dec operations return a result in the range [0..b].

\n

The floating-point .add operation rounds to nearest even. The current implementation of\nred.add.f32 on global memory flushes subnormal inputs and results to sign-preserving zero,\nwhereas red.add.f32 on shared memory supports subnormal inputs and results and does not flush\nthem to zero.

\n

The red.add.f16, red.add.f16x2, red.add.bf16 and red.add.bf16x2 operations require the\n.noftz qualifier; they preserve subnormal inputs and results, and do not flush them to zero.

\n

When the optional argument cache-policy is specified, the qualifier .level::cache_hint is\nrequired. The 64-bit operand cache-policy specifies the cache eviction policy that may be used\nduring the memory access.

\n

The qualifier .level::cache_hint is only supported for .global state space and for generic\naddressing where the address points to the .global state space.

\n

cache-policy is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.

\n

Semantics

\n
*a = operation(*a, b);\n\nwhere\n    inc(r, s) = (r >= s) ? 0 : r+1;\n    dec(r, s) = (r==0 || r > s)  ? s : r-1;\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.2.

\n

red.add.f32 and red.shared.add.u64 introduced in PTX ISA 2.0.

\n

64-bit red.{and,or,xor,min,max} introduced in PTX ISA 3.1.

\n

red.add.f64 introduced in PTX ISA 5.0.

\n

.scope qualifier introduced in PTX ISA 5.0.

\n

.sem qualifier introduced in PTX ISA version 6.0.

\n

red.add.noftz.f16x2 introduced in PTX ISA 6.2.

\n

red.add.noftz.f16 introduced in PTX ISA 6.3.

\n

Per-element atomicity of red.f16x2 clarified in PTX ISA version 6.3, with retrospective effect\nfrom PTX ISA version 6.2.

\n

Support for .level::cache_hint qualifier introduced in PTX ISA version 7.4.

\n

red.add.noftz.bf16 and red.add.noftz.bf16x2 introduced in PTX ISA 7.8.

\n

Support for .cluster scope qualifier introduced in PTX ISA version 7.8.

\n

Support for ::cta and ::cluster sub-qualifiers introduced in PTX ISA version 7.8.

\n

Support for vector types introduced in PTX ISA version 8.1.

\n

Target ISA Notes

\n

red.global requires sm_11 or higher.

\n

red.shared requires sm_12 or higher.

\n

red.global.add.u64 requires sm_12 or higher.

\n

red.shared.add.u64 requires sm_20 or higher.

\n

64-bit red.{and,or,xor,min,max} require sm_32 or higher.

\n

red.add.f32 requires sm_20 or higher.

\n

red.add.f64 requires sm_60 or higher.

\n

.scope qualifier requires sm_60 or higher.

\n

.sem qualifier requires sm_70 or higher.

\n

Use of generic addressing requires sm_20 or higher.

\n

red.add.noftz.f16x2 requires sm_60 or higher.

\n

red.add.noftz.f16 requires sm_70 or higher.

\n

Support for .level::cache_hint qualifier requires sm_80 or higher.

\n

red.add.noftz.bf16 and red.add.noftz.bf16x2 require sm_90 or higher.

\n

Support for .cluster scope qualifier requires sm_90 or higher.

\n

Sub-qualifier ::cta requires sm_30 or higher.

\n

Sub-qualifier ::cluster requires sm_90 or higher.

\n

Support for vector types requires sm_90 or higher.

\n

Examples

\n
red.global.add.s32  [a],1;\nred.shared::cluster.max.u32  [x+4],0;\n@p  red.global.and.b32  [p],my_val;\nred.global.sys.add.u32 [a], 1;\nred.global.release.sys.add.u32 [gbl], 1;\nred.add.noftz.f16x2 [a], b;\nred.add.noftz.bf16   [a], hb;\nred.add.noftz.bf16x2 [b], bb;\nred.global.relaxed.cluster.add.u32 [a], 1;\nred.shared::cta.min.u32  [x+4],0;\n\ncreatepolicy.fractional.L2::evict_last.b64 cache-policy, 0.25;\nred.global.and.L2::cache_hint.b32 [a], 1, cache-policy;\n\nred.global.v8.f16.add.noftz  [gbl], {%h0, %h1, %h2, %h3, %h4, %h5, %h6, %h7};\nred.global.v8.bf16.min.noftz [gbl], {%h0, %h1, %h2, %h3, %h4, %h5, %h6, %h7};\nred.global.v2.f16.add.noftz [gbl], {%h0, %h1};\nred.global.v2.bf16.add.noftz [gbl], {%h0, %h1};\nred.global.v4.f16x2.max.noftz [gbl], {%h0, %h1, %h2, %h3};\nred.global.v4.f32.add  [gbl], {%f0, %f1, %f2, %f3};\nred.global.v2.f16x2.max.noftz [g], {%b0, %b1};\nred.global.v2.bf16x2.add.noftz [g], {%b0, %b1};\nred.global.v2.f32.add  [g], {%f0, %f1};\n
\n
\n
\n

Parallel Synchronization and Communication Instructions: red.async

\n\n\n

Asynchronous reduction operation on shared memory.

\n

Syntax

\n
// Increment and Decrement reductions\nred.async.relaxed.cluster{.ss}.completion_mechanism.op.type [a], b, [mbar];\n\n.ss   =                 { .shared::cluster };\n.op   =                 { .inc, .dec };\n.type =                 { .u32 };\n.completion_mechanism = { .mbarrier::complete_tx::bytes };\n\n\n// MIN and MAX reductions\nred.async.relaxed.cluster{.ss}.completion_mechanism.op.type [a], b, [mbar];\n\n.ss   = { .shared::cluster };\n.op   = { .min, .max };\n.type = { .u32, .s32 };\n.completion_mechanism = { .mbarrier::complete_tx::bytes };\n\n// Bitwise AND, OR and XOR reductions\nred.async.relaxed.cluster{.ss}.completion_mechanism.op.type [a], b, [mbar];\n\n.ss   = { .shared::cluster };\n.op   = { .and, .or, .xor };\n.type = { .b32 };\n.completion_mechanism = { .mbarrier::complete_tx::bytes };\n\n// ADD reductions\nred.async.relaxed.cluster{.ss}.completion_mechanism.add.type [a], b, [mbar];\n\n.ss   = { .shared::cluster };\n.type = { .u32, .s32, .u64 };\n.completion_mechanism = { .mbarrier::complete_tx::bytes };\n
\n
\n

Description

\n

red.async is a non-blocking instruction which initiates an asynchronous reduction operation,\nspecified by .op, of the operand b with the value at the destination shared memory\nlocation specified by operand a.

\n

The .inc and .dec operations return a result in the range [0..b].

\n

The modifier .completion_mechanism specifies that upon completion of the asynchronous operation,\na complete-tx\noperation, with a completeCount argument equal to the amount of data stored in bytes, will be\nperformed on the mbarrier object specified by the operand mbar.

\n

Operand a represents destination address and must be a register or of the form register +\nimmOff as described in Addresses as Operands.

\n

The shared memory addresses of destination operand a and the mbarrier object mbar, must\nmeet all of the following conditions:

\n
\n
  • They belong to the same CTA.
  • That CTA is different from the CTA of the executing thread, but lies within the same cluster.
\n

Otherwise, the behavior is undefined.

\n

The state space of the address {.ss}, if specified, is applicable to both operands a and\nmbar. If not specified, then Generic Addressing is used for\nboth a and mbar.

\n

With .shared::cluster, if the addresses specified do not fall within the address window of\nthe .shared::cluster state space, then the behavior is undefined.

\n

The reduce operation in red.async is treated as a relaxed memory operation and the complete_tx\noperation on the mbarrier has .release semantics at the .cluster scope as described in the\nMemory Consistency Model.

\n
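
A minimal sketch, assuming %raddr and %rmbar already hold .shared::cluster addresses of a peer CTA's counter and mbarrier object (the setup is elided and the names are placeholders):

\n
// asynchronously add %r1 into the peer CTA's counter; completion is signaled\n// on the peer's mbarrier via a complete-tx of 4 bytes\nred.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [%raddr], %r1, [%rmbar];\n// a thread in the peer CTA later waits on its mbarrier phase before reading\n
\n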

PTX ISA Notes

\n

Introduced in PTX ISA version 8.1.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [addr], b, [mbar_addr];\n
\n
\n
", "tooltip": "=====Parallel Synchronization and Communication Instructions: red\n\n\n\nReduction operations on global and shared memory.\n\nSyntax\n\nReduction operation with scalar type:\n\nred{.sem}{.scope}{.space}.op{.level::cache_hint}.type [a], b{, cache-policy};\n\nred{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16 [a], b{, cache-policy};\n\nred{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16x2 [a], b{, cache-policy};\n\nred{.sem}{.scope}{.space}.add.noftz{.level::...\n\n=====Parallel Synchronization and Communication Instructions: red.async\n\n\n\nAsynchronous reduction operation on shared memory.\n\nSyntax\n\n// Increment and Decrement reductions\n\nred.async.relaxed.cluster{.ss}.completion_mechanism.op.type [a], b, [mbar];\n\n.ss = { .shared::cluster };\n\n.op = { .inc, .dec };\n\n.type = { .u32 };\n\n.completion_mechanism = { .mbarrier::complete_tx::bytes };\n\n// MIN and MAX reductions\n\nred.async.relaxed.clust... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red" }; case "redux": return { "html": "For more information, visit redux.sync .

Parallel Synchronization and Communication Instructions: redux.sync

\n\n\n

Perform a reduction operation on the data from each predicated active thread in the thread group.

\n

Syntax

\n
redux.sync.op.type dst, src, membermask;\n.op   = {.add, .min, .max}\n.type = {.u32, .s32}\n\nredux.sync.op.b32 dst, src, membermask;\n.op   = {.and, .or, .xor}\n
\n
\n

Description

\n

redux.sync will cause the executing thread to wait until all non-exited threads corresponding to\nmembermask have executed redux.sync with the same qualifiers and same membermask value\nbefore resuming execution.

\n

Operand membermask specifies a 32-bit integer mask indicating the threads participating\nin this instruction, where each bit position corresponds to a thread's laneid.

\n

redux.sync performs a reduction operation .op on the 32-bit source register src across\nall non-exited threads in the membermask. The result of the reduction operation is written to\nthe 32-bit destination register dst.

\n

The reduction operation can be one of the bitwise operations .and, .or, .xor, or one\nof the arithmetic operations .add, .min, .max.

\n

For the .add operation result is truncated to 32 bits.

\n

The behavior of redux.sync is undefined if the executing thread is not in the membermask.

\n
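
As a worked illustration: if every lane of a full, non-exited warp contributes its lane id, each lane receives the sum 0 + 1 + ... + 31 = 496 (register names assumed):

\n
mov.u32             %r1, %laneid;\nredux.sync.add.u32  %r2, %r1, 0xffffffff;   // every participating lane gets 496\n
\n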

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

Target ISA Notes

\n

Requires sm_80 or higher.

\n

Release Notes

\n

Note that redux.sync applies to threads in a single warp, not across an entire CTA.

\n

Examples

\n
.reg .b32 dst, src, init, mask;\nredux.sync.add.s32 dst, src, 0xff;\nredux.sync.xor.b32 dst, src, mask;\n
\n
\n
", "tooltip": "Perform reduction operation on the data from each predicated active thread in the thread group.\n\nSyntax\n\nredux.sync.op.type dst, src, membermask;\n\n.op = {.add, .min, .max}\n\n.type = {.u32, .s32}\n\nredux.sync.op.b32 dst, src, membermask;\n\n.op = {.and, .or, .xor}\n\nDescription\n\nredux.sync will cause the executing thread to wait until all non-exited threads corresponding to\n\nmembermask have executed redux.sync with the same qualifiers and same membermask value\n\nbefore resuming execution.\n\nOperand membermask specifies a 32-bit integer which is a mask indicating threads participating\n\nin this instruction where the bit position corresponds to thread\u2019s laneid.\n\nredux.sync performs a reduction operation .op of the 32 bit source register src across\n\nall non-exited threads in the membermask. The result of the reduction operation is written to\n\nthe 32 bit destination register dst.\n\nReduction operation can be one of the bitwise operation in .and, .or, .xor or arithmetic\n\noperation in .add, .min , .max.\n\nFor the .add operation result is truncated to 32 bits.\n\nThe behavior of redux.sync is undefined if the executing thread is not in the membermask.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.0.\n\nTarget ISA Notes\n\nRequires sm_80 or higher.\n\nRelease Notes\n\nNote that redux.sync applies to threads in a single warp, not across an entire CTA.\n\nExamples\n\n.reg .b32 dst, src, init, mask;\n\nredux.sync.add.s32 dst, src, 0xff;\n\nredux.sync.xor.b32 dst, src, mask;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync" }; case "rem": return { "html": "For more information, visit rem(int) .

Integer Arithmetic Instructions: rem

\n\n\n

The remainder of integer division.

\n

Syntax

\n
rem.type  d, a, b;\n\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64 };\n
\n
\n

Description

\n

Divides a by b, storing the remainder in d.

\n

Semantics

\n
d = a % b;\n
\n
\n

Notes

\n

The behavior for negative numbers is machine-dependent and depends on whether divide rounds towards\nzero or negative infinity. For example, if division truncates towards zero, -7 % 2 yields -1;\nif it rounds towards negative infinity, -7 % 2 yields 1.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
rem.s32  x,x,8;    // x = x%8;\n
\n
\n
", "tooltip": "The remainder of integer division.\n\nSyntax\n\nrem.type d, a, b;\n\n.type = { .u16, .u32, .u64,\n\n .s16, .s32, .s64 };\n\nDescription\n\nDivides a by b, store the remainder in d.\n\nSemantics\n\nd = a % b;\n\nNotes\n\nThe behavior for negative numbers is machine-dependent and depends on whether divide rounds towards\n\nzero or negative infinity.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nrem.s32 x,x,8; // x = x%8;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-rem" }; case "reqnctapercluster": return { "html": "For more information, visit reqnctapercluster .

Cluster Dimension Directives: .reqnctapercluster

\n\n\n

Declare the number of CTAs in the cluster.

\n

Syntax

\n
.reqnctapercluster nx\n.reqnctapercluster nx, ny\n.reqnctapercluster nx, ny, nz\n
\n
\n

Description

\n

Set the number of thread blocks (CTAs) in the cluster by specifying the extent of each dimension of\nthe 1D, 2D, or 3D cluster. The total number of CTAs is the product of the number of CTAs in each\ndimension. For kernels with the .reqnctapercluster directive specified, the runtime will use the\nspecified values to configure the launch if the cluster dimensions are not specified at launch time.

\n

Semantics

\n

If the cluster dimension is explicitly specified at launch time, it must be equal to the values\nspecified in this directive. Specifying a different cluster dimension at launch will result in a\nruntime error or kernel launch failure.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
.entry foo .reqnctapercluster 2         { . . . }\n.entry bar .reqnctapercluster 2, 2, 1   { . . . }\n.entry ker .reqnctapercluster 3, 2      { . . . }\n
\n
\n
", "tooltip": "Declare the number of CTAs in the cluster.\n\nSyntax\n\n.reqnctapercluster nx\n\n.reqnctapercluster nx, ny\n\n.reqnctapercluster nx, ny, nz\n\nDescription\n\nSet the number of thread blocks (CTAs) in the cluster by specifying the extent of each dimension of\n\nthe 1D, 2D, or 3D cluster. The total number of CTAs is the product of the number of CTAs in each\n\ndimension. For kernels with .reqnctapercluster directive specified, runtime will use the\n\nspecified values for configuring the launch if the same are not specified at launch time.\n\nSemantics\n\nIf cluster dimension is explicitly specified at launch time, it should be equal to the values\n\nspecified in this directive. Specifying a different cluster dimension at launch will result in a\n\nruntime error or kernel launch failure.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.entry foo .reqnctapercluster 2 { . . . }\n\n.entry bar .reqnctapercluster 2, 2, 1 { . . . }\n\n.entry ker .reqnctapercluster 3, 2 { . . . }\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-reqnctapercluster" }; case "reqntid": return { "html": "For more information, visit reqntid .

Performance-Tuning Directives: .reqntid

\n\n\n

Number of threads in the thread block (CTA).

\n

Syntax

\n
.reqntid nx\n.reqntid nx, ny\n.reqntid nx, ny, nz\n
\n
\n

Description

\n

Declare the number of threads in the thread block (CTA) by specifying the extent of each dimension\nof the 1D, 2D, or 3D CTA. The total number of threads is the product of the number of threads in\neach dimension.

\n

Semantics

\n

The size of each CTA dimension specified in any invocation of the kernel is required to be equal to\nthat specified in this directive. Specifying a different CTA dimension at launch will result in a\nruntime error or kernel launch failure.

\n

Notes

\n

The .reqntid directive cannot be used in conjunction with the .maxntid directive.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.1.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
.entry foo .reqntid 256       { ... }  // num threads = 256\n.entry bar .reqntid 16,16,4   { ... }  // num threads = 1024\n
\n
\n
", "tooltip": "Number of threads in the thread block (CTA).\n\nSyntax\n\n.reqntid nx\n\n.reqntid nx, ny\n\n.reqntid nx, ny, nz\n\nDescription\n\nDeclare the number of threads in the thread block (CTA) by specifying the extent of each dimension\n\nof the 1D, 2D, or 3D CTA. The total number of threads is the product of the number of threads in\n\neach dimension.\n\nSemantics\n\nThe size of each CTA dimension specified in any invocation of the kernel is required to be equal to\n\nthat specified in this directive. Specifying a different CTA dimension at launch will result in a\n\nruntime error or kernel launch failure.\n\nNotes\n\nThe .reqntid directive cannot be used in conjunction with the .maxntid directive.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.1.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.entry foo .reqntid 256 { ... } // num threads = 256\n\n.entry bar .reqntid 16,16,4 { ... } // num threads = 1024\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-reqntid" }; case "reserved_smem_offset_<2>": return { "html": "For more information, visit reserved_smem_offset_<2> .

Special Registers: %reserved_smem_offset_begin, %reserved_smem_offset_end, %reserved_smem_offset_cap, %reserved_smem_offset_<2>

\n\n\n
\n
%reserved_smem_offset_begin

Start of the reserved shared memory region.

\n
\n
%reserved_smem_offset_end

End of the reserved shared memory region.

\n
\n
%reserved_smem_offset_cap

Total size of the reserved shared memory region.

\n
\n
%reserved_smem_offset_<2>

Offsets in the reserved shared memory region.

\n
\n
\n

Syntax (predefined)

\n
.sreg .b32 %reserved_smem_offset_begin;\n.sreg .b32 %reserved_smem_offset_end;\n.sreg .b32 %reserved_smem_offset_cap;\n.sreg .b32 %reserved_smem_offset_<2>;\n
\n
\n

Description

\n

These are predefined, read-only special registers containing information about the shared memory\nregion which is reserved for NVIDIA system software use. This region of shared memory is not\navailable to users, and accessing this region from user code results in undefined behavior. Refer to\nthe CUDA Programming Guide for details.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.6.

\n

Target ISA Notes

\n

Require sm_80 or higher.

\n

Examples

\n
.reg .b32 %reg_begin, %reg_end, %reg_cap, %reg_offset0, %reg_offset1;\n\nmov.b32 %reg_begin,   %reserved_smem_offset_begin;\nmov.b32 %reg_end,     %reserved_smem_offset_end;\nmov.b32 %reg_cap,     %reserved_smem_offset_cap;\nmov.b32 %reg_offset0, %reserved_smem_offset_0;\nmov.b32 %reg_offset1, %reserved_smem_offset_1;\n
\n
\n
", "tooltip": "%reserved_smem_offset_beginStart of the reserved shared memory region.\n\n%reserved_smem_offset_endEnd of the reserved shared memory region.\n\n%reserved_smem_offset_capTotal size of the reserved shared memory region.\n\n%reserved_smem_offset_<2>Offsets in the reserved shared memory region.\n\nSyntax (predefined)\n\n.sreg .b32 %reserved_smem_offset_begin;\n\n.sreg .b32 %reserved_smem_offset_end;\n\n.sreg .b32 %reserved_smem_offset_cap;\n\n.sreg .b32 %reserved_smem_offset_<2>;\n\nDescription\n\nThese are predefined, read-only special registers containing information about the shared memory\n\nregion which is reserved for the NVIDIA system software use. This region of shared memory is not\n\navailable to users, and accessing this region from user code results in undefined behavior. Refer to\n\nCUDA Programming Guide for details.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.6.\n\nTarget ISA Notes\n\nRequire sm_80 or higher.\n\nExamples\n\n.reg .b32 %reg_begin, %reg_end, %reg_cap, %reg_offset0, %reg_offset1;\n\nmov.b32 %reg_begin, %reserved_smem_offset_begin;\n\nmov.b32 %reg_end, %reserved_smem_offset_end;\n\nmov.b32 %reg_cap, %reserved_smem_offset_cap;\n\nmov.b32 %reg_offset0, %reserved_smem_offset_0;\n\nmov.b32 %reg_offset1, %reserved_smem_offset_1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-reserved-smem-offset-begin-reserved-smem-offset-end-reserved-smem-offset-cap-reserved-smem-offset-2" }; case "reserved_smem_offset_begin": return { "html": "For more information, visit reserved_smem_offset_begin .

Special Registers: %reserved_smem_offset_begin, %reserved_smem_offset_end, %reserved_smem_offset_cap, %reserved_smem_offset_<2>

\n\n\n
\n
%reserved_smem_offset_begin

Start of the reserved shared memory region.

\n
\n
%reserved_smem_offset_end

End of the reserved shared memory region.

\n
\n
%reserved_smem_offset_cap

Total size of the reserved shared memory region.

\n
\n
%reserved_smem_offset_<2>

Offsets in the reserved shared memory region.

\n
\n
\n

Syntax (predefined)

\n
.sreg .b32 %reserved_smem_offset_begin;\n.sreg .b32 %reserved_smem_offset_end;\n.sreg .b32 %reserved_smem_offset_cap;\n.sreg .b32 %reserved_smem_offset_<2>;\n
\n
\n

Description

\n

These are predefined, read-only special registers containing information about the shared memory\nregion which is reserved for NVIDIA system software use. This region of shared memory is not\navailable to users, and accessing this region from user code results in undefined behavior. Refer to\nthe CUDA Programming Guide for details.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.6.

\n

Target ISA Notes

\n

Require sm_80 or higher.

\n

Examples

\n
.reg .b32 %reg_begin, %reg_end, %reg_cap, %reg_offset0, %reg_offset1;\n\nmov.b32 %reg_begin,   %reserved_smem_offset_begin;\nmov.b32 %reg_end,     %reserved_smem_offset_end;\nmov.b32 %reg_cap,     %reserved_smem_offset_cap;\nmov.b32 %reg_offset0, %reserved_smem_offset_0;\nmov.b32 %reg_offset1, %reserved_smem_offset_1;\n
\n
\n
", "tooltip": "%reserved_smem_offset_beginStart of the reserved shared memory region.\n\n%reserved_smem_offset_endEnd of the reserved shared memory region.\n\n%reserved_smem_offset_capTotal size of the reserved shared memory region.\n\n%reserved_smem_offset_<2>Offsets in the reserved shared memory region.\n\nSyntax (predefined)\n\n.sreg .b32 %reserved_smem_offset_begin;\n\n.sreg .b32 %reserved_smem_offset_end;\n\n.sreg .b32 %reserved_smem_offset_cap;\n\n.sreg .b32 %reserved_smem_offset_<2>;\n\nDescription\n\nThese are predefined, read-only special registers containing information about the shared memory\n\nregion which is reserved for the NVIDIA system software use. This region of shared memory is not\n\navailable to users, and accessing this region from user code results in undefined behavior. Refer to\n\nCUDA Programming Guide for details.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.6.\n\nTarget ISA Notes\n\nRequire sm_80 or higher.\n\nExamples\n\n.reg .b32 %reg_begin, %reg_end, %reg_cap, %reg_offset0, %reg_offset1;\n\nmov.b32 %reg_begin, %reserved_smem_offset_begin;\n\nmov.b32 %reg_end, %reserved_smem_offset_end;\n\nmov.b32 %reg_cap, %reserved_smem_offset_cap;\n\nmov.b32 %reg_offset0, %reserved_smem_offset_0;\n\nmov.b32 %reg_offset1, %reserved_smem_offset_1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-reserved-smem-offset-begin-reserved-smem-offset-end-reserved-smem-offset-cap-reserved-smem-offset-2" }; case "reserved_smem_offset_cap": return { "html": "For more information, visit reserved_smem_offset_cap .

Special Registers: %reserved_smem_offset_begin, %reserved_smem_offset_end, %reserved_smem_offset_cap, %reserved_smem_offset_<2>

\n\n\n
\n
%reserved_smem_offset_begin

Start of the reserved shared memory region.

\n
\n
%reserved_smem_offset_end

End of the reserved shared memory region.

\n
\n
%reserved_smem_offset_cap

Total size of the reserved shared memory region.

\n
\n
%reserved_smem_offset_<2>

Offsets in the reserved shared memory region.

\n
\n
\n

Syntax (predefined)

\n
.sreg .b32 %reserved_smem_offset_begin;\n.sreg .b32 %reserved_smem_offset_end;\n.sreg .b32 %reserved_smem_offset_cap;\n.sreg .b32 %reserved_smem_offset_<2>;\n
\n
\n

Description

\n

These are predefined, read-only special registers containing information about the shared memory\nregion which is reserved for NVIDIA system software use. This region of shared memory is not\navailable to users, and accessing this region from user code results in undefined behavior. Refer to\nthe CUDA Programming Guide for details.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.6.

\n

Target ISA Notes

\n

Require sm_80 or higher.

\n

Examples

\n
.reg .b32 %reg_begin, %reg_end, %reg_cap, %reg_offset0, %reg_offset1;\n\nmov.b32 %reg_begin,   %reserved_smem_offset_begin;\nmov.b32 %reg_end,     %reserved_smem_offset_end;\nmov.b32 %reg_cap,     %reserved_smem_offset_cap;\nmov.b32 %reg_offset0, %reserved_smem_offset_0;\nmov.b32 %reg_offset1, %reserved_smem_offset_1;\n
\n
\n
", "tooltip": "%reserved_smem_offset_beginStart of the reserved shared memory region.\n\n%reserved_smem_offset_endEnd of the reserved shared memory region.\n\n%reserved_smem_offset_capTotal size of the reserved shared memory region.\n\n%reserved_smem_offset_<2>Offsets in the reserved shared memory region.\n\nSyntax (predefined)\n\n.sreg .b32 %reserved_smem_offset_begin;\n\n.sreg .b32 %reserved_smem_offset_end;\n\n.sreg .b32 %reserved_smem_offset_cap;\n\n.sreg .b32 %reserved_smem_offset_<2>;\n\nDescription\n\nThese are predefined, read-only special registers containing information about the shared memory\n\nregion which is reserved for the NVIDIA system software use. This region of shared memory is not\n\navailable to users, and accessing this region from user code results in undefined behavior. Refer to\n\nCUDA Programming Guide for details.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.6.\n\nTarget ISA Notes\n\nRequire sm_80 or higher.\n\nExamples\n\n.reg .b32 %reg_begin, %reg_end, %reg_cap, %reg_offset0, %reg_offset1;\n\nmov.b32 %reg_begin, %reserved_smem_offset_begin;\n\nmov.b32 %reg_end, %reserved_smem_offset_end;\n\nmov.b32 %reg_cap, %reserved_smem_offset_cap;\n\nmov.b32 %reg_offset0, %reserved_smem_offset_0;\n\nmov.b32 %reg_offset1, %reserved_smem_offset_1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-reserved-smem-offset-begin-reserved-smem-offset-end-reserved-smem-offset-cap-reserved-smem-offset-2" }; case "reserved_smem_offset_end": return { "html": "For more information, visit reserved_smem_offset_end .

Special Registers: %reserved_smem_offset_begin, %reserved_smem_offset_end, %reserved_smem_offset_cap, %reserved_smem_offset_<2>

\n\n\n
\n
%reserved_smem_offset_begin

Start of the reserved shared memory region.

\n
\n
%reserved_smem_offset_end

End of the reserved shared memory region.

\n
\n
%reserved_smem_offset_cap

Total size of the reserved shared memory region.

\n
\n
%reserved_smem_offset_<2>

Offsets in the reserved shared memory region.

\n
\n
\n

Syntax (predefined)

\n
.sreg .b32 %reserved_smem_offset_begin;\n.sreg .b32 %reserved_smem_offset_end;\n.sreg .b32 %reserved_smem_offset_cap;\n.sreg .b32 %reserved_smem_offset_<2>;\n
\n
\n

Description

\n

These are predefined, read-only special registers containing information about the shared memory\nregion which is reserved for NVIDIA system software use. This region of shared memory is not\navailable to users, and accessing this region from user code results in undefined behavior. Refer to\nthe CUDA Programming Guide for details.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.6.

\n

Target ISA Notes

\n

Require sm_80 or higher.

\n

Examples

\n
.reg .b32 %reg_begin, %reg_end, %reg_cap, %reg_offset0, %reg_offset1;\n\nmov.b32 %reg_begin,   %reserved_smem_offset_begin;\nmov.b32 %reg_end,     %reserved_smem_offset_end;\nmov.b32 %reg_cap,     %reserved_smem_offset_cap;\nmov.b32 %reg_offset0, %reserved_smem_offset_0;\nmov.b32 %reg_offset1, %reserved_smem_offset_1;\n
\n
\n
", "tooltip": "%reserved_smem_offset_beginStart of the reserved shared memory region.\n\n%reserved_smem_offset_endEnd of the reserved shared memory region.\n\n%reserved_smem_offset_capTotal size of the reserved shared memory region.\n\n%reserved_smem_offset_<2>Offsets in the reserved shared memory region.\n\nSyntax (predefined)\n\n.sreg .b32 %reserved_smem_offset_begin;\n\n.sreg .b32 %reserved_smem_offset_end;\n\n.sreg .b32 %reserved_smem_offset_cap;\n\n.sreg .b32 %reserved_smem_offset_<2>;\n\nDescription\n\nThese are predefined, read-only special registers containing information about the shared memory\n\nregion which is reserved for the NVIDIA system software use. This region of shared memory is not\n\navailable to users, and accessing this region from user code results in undefined behavior. Refer to\n\nCUDA Programming Guide for details.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.6.\n\nTarget ISA Notes\n\nRequire sm_80 or higher.\n\nExamples\n\n.reg .b32 %reg_begin, %reg_end, %reg_cap, %reg_offset0, %reg_offset1;\n\nmov.b32 %reg_begin, %reserved_smem_offset_begin;\n\nmov.b32 %reg_end, %reserved_smem_offset_end;\n\nmov.b32 %reg_cap, %reserved_smem_offset_cap;\n\nmov.b32 %reg_offset0, %reserved_smem_offset_0;\n\nmov.b32 %reg_offset1, %reserved_smem_offset_1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-reserved-smem-offset-begin-reserved-smem-offset-end-reserved-smem-offset-cap-reserved-smem-offset-2" }; case "ret": return { "html": "For more information, visit ret .

Control Flow Instructions: ret

\n\n\n

Return from function to instruction after call.

\n

Syntax

\n
ret{.uni};\n
\n
\n

Description

\n

Return execution to caller\u2019s environment. A divergent return suspends threads until all threads are\nready to return to the caller. This allows multiple divergent ret instructions.

\n

A ret is assumed to be divergent unless the .uni suffix is present, indicating that the\nreturn is guaranteed to be non-divergent.

\n

Any values returned from a function should be moved into the return parameter variables prior to\nexecuting the ret instruction.

\n

A return instruction executed in a top-level entry routine will terminate thread execution.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
    ret;\n@p  ret;\n
\n
\n
", "tooltip": "Return from function to instruction after call.\n\nSyntax\n\nret{.uni};\n\nDescription\n\nReturn execution to caller\u2019s environment. A divergent return suspends threads until all threads are\n\nready to return to the caller. This allows multiple divergent ret instructions.\n\nA ret is assumed to be divergent unless the .uni suffix is present, indicating that the\n\nreturn is guaranteed to be non-divergent.\n\nAny values returned from a function should be moved into the return parameter variables prior to\n\nexecuting the ret instruction.\n\nA return instruction executed in a top-level entry routine will terminate thread execution.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n ret;\n\n@p ret;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret" }; case "rsqrt": return { "html": "For more information, visit rsqrt(fp) , rsqrt.approx.ftz.f64(fp) .

Floating Point Instructions: rsqrt

\n\n\n

Take the reciprocal of the square root of a value.

\n

Syntax

\n
rsqrt.approx{.ftz}.f32  d, a;\nrsqrt.approx.f64        d, a;\n
\n
\n

Description

\n

Compute 1/sqrt(a) and store the result in d.

\n

Semantics

\n
d = 1/sqrt(a);\n
\n
\n

Notes

\n

rsqrt.approx implements an approximation to the reciprocal square root.

\n
Input         Result\n-Inf          NaN\n-normal       NaN\n-subnormal    -Inf\n-0.0          -Inf\n+0.0          +Inf\n+subnormal    +Inf\n+Inf          +0.0\nNaN           NaN\n
\n

The maximum absolute error for rsqrt.f32 is 2-22.4 over the range 1.0-4.0.

\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

rsqrt.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
sm_1x

rsqrt.f64 supports subnormal numbers.

\n

rsqrt.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
\n

Note that rsqrt.approx.f64 is emulated in software and is relatively slow.

\n
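
One illustrative use, sketched with assumed register names (not a required idiom), is to form an approximate square root of a positive value by multiplying the input with its reciprocal square root, since a * (1/sqrt(a)) = sqrt(a):

\n
rsqrt.approx.f32  %f1, %f0;         // 1/sqrt(a)\nmul.f32           %f2, %f0, %f1;    // a * 1/sqrt(a) ~ sqrt(a), for a > 0\n
\n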

PTX ISA Notes

\n

rsqrt.f32 and rsqrt.f64 were introduced in PTX ISA version 1.0. Explicit modifiers\n.approx and .ftz were introduced in PTX ISA version 1.4.

\n

For PTX ISA version 1.4 and later, the .approx modifier is required.

\n

For PTX ISA versions 1.0 through 1.3, rsqrt.f32 defaults to rsqrt.approx.ftz.f32, and\nrsqrt.f64 defaults to rsqrt.approx.f64.

\n

Target ISA Notes

\n

rsqrt.f32 supported on all target architectures.

\n

rsqrt.f64 requires sm_13 or higher.

\n

Examples

\n
rsqrt.approx.ftz.f32  isr, x;\nrsqrt.approx.f64      ISR, X;\n
\n
\n
\n

Floating Point Instructions: rsqrt.approx.ftz.f64

\n\n\n

Compute an approximation of the square root reciprocal of a value.

\n

Syntax

\n
rsqrt.approx.ftz.f64 d, a;\n
\n
\n

Description

\n

Compute a double-precision (.f64) approximation of the square root reciprocal of a value. The\nleast significant 32 bits of the double-precision (.f64) destination d are all zeros.

\n

Semantics

\n
tmp = a[63:32]; // upper word of a, 1.11.20 format\nd[63:32] = 1.0 / sqrt(tmp);\nd[31:0] = 0x00000000;\n
\n
\n

Notes

\n

rsqrt.approx.ftz.f64 implements a fast approximation of the square root reciprocal of a value.

\n
Input         Result\n-Inf          NaN\n-subnormal    -Inf\n-0.0          -Inf\n+0.0          +Inf\n+subnormal    +Inf\n+Inf          +0.0\nNaN           NaN\n
\n
\n

Input NaNs map to a canonical NaN with encoding 0x7fffffff00000000.

\n

Subnormal inputs and results are flushed to sign-preserving zero.

\n

PTX ISA Notes

\n

rsqrt.approx.ftz.f64 introduced in PTX ISA version 4.0.

\n

Target ISA Notes

\n

rsqrt.approx.ftz.f64 requires sm_20 or higher.

\n

Examples

\n
rsqrt.approx.ftz.f64 xi,x;\n
\n
\n
", "tooltip": "=====Floating Point Instructions: rsqrt\n\n\n\nTake the reciprocal of the square root of a value.\n\nSyntax\n\nrsqrt.approx{.ftz}.f32 d, a;\n\nrsqrt.approx.f64 d, a;\n\nDescription\n\nCompute 1/sqrt(a) and store the result in d.\n\nSemantics\n\nd = 1/sqrt(a);\n\nNotes\n\nrsqrt.approx implements an approximation to the reciprocal square root.\n\n\n\nInput\n\nResult\n\n-Inf\n\nNaN\n\n-normal\n\nNaN\n\n-subnormal\n\n-Inf\n\n-0.0\n\n-Inf\n\n+0.0\n\n+Inf\n\n+subnormal\n\n+Inf\n\n+Inf\n\n+0.0\n\nNaN\n\nNaN\n\nThe maximum absol...\n\n=====Floating Point Instructions: rsqrt.approx.ftz.f64\n\n\n\nCompute an approximation of the square root reciprocal of a value.\n\nSyntax\n\nrsqrt.approx.ftz.f64 d, a;\n\nDescription\n\nCompute a double-precision (.f64) approximation of the square root reciprocal of a value. The\n\nleast significant 32 bits of the double-precision (.f64) destination d are all zeros.\n\nSemantics\n\ntmp = a[63:32]; // upper word of a, 1.11.20 format\n\nd[63:32] = 1.0 / sqrt(tmp);\n\nd[31:0] = 0x000000... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt" }; case "sad": return { "html": "For more information, visit sad(int) .

Integer Arithmetic Instructions: sad

\n\n\n

Sum of absolute differences.

\n

Syntax

\n
sad.type  d, a, b, c;\n\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64 };\n
\n
\n

Description

\n

Adds the absolute value of a-b to c and writes the resulting value into d.

\n

Semantics

\n
d = c + ((a<b) ? b-a : a-b);\n
\n
\n
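
Because operand c is added to the absolute difference, sad chains naturally into a running\nsum; a minimal sketch (registers rA0..rB3 and acc are assumptions):

\n
mov.u32  acc, 0;\nsad.u32  acc, rA0, rB0, acc;\nsad.u32  acc, rA1, rB1, acc;\nsad.u32  acc, rA2, rB2, acc;\nsad.u32  acc, rA3, rB3, acc;  // acc = sum of |rAi - rBi|\n
\n
\n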

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
sad.s32  d,a,b,c;\nsad.u32  d,a,b,d;  // running sum\n
\n
\n
", "tooltip": "Sum of absolute differences.\n\nSyntax\n\nsad.type d, a, b, c;\n\n.type = { .u16, .u32, .u64,\n\n .s16, .s32, .s64 };\n\nDescription\n\nAdds the absolute value of a-b to c and writes the resulting value into d.\n\nSemantics\n\nd = c + ((asection .

Debugging Directives: .section

\n\n\n

PTX section definition.

\n

Syntax

\n
.section section_name { dwarf-lines }\n\ndwarf-lines have the following formats:\n  .b8    byte-list       // comma-separated list of integers\n                         // in range [-128..255]\n  .b16   int16-list      // comma-separated list of integers\n                         // in range [-2^15..2^16-1]\n  .b32   int32-list      // comma-separated list of integers\n                         // in range [-2^31..2^32-1]\n  label:                 // Define label inside the debug section\n  .b64   int64-list      // comma-separated list of integers\n                         // in range [-2^63..2^64-1]\n  .b32   label\n  .b64   label\n  .b32   label+imm       // a sum of label address plus a constant integer byte\n                         // offset(signed, 32bit)\n  .b64   label+imm       // a sum of label address plus a constant integer byte\n                         // offset(signed, 64bit)\n  .b32   label1-label2   // a difference in label addresses between labels in\n                         // the same dwarf section (32bit)\n  .b64   label3-label4   // a difference in label addresses between labels in\n                         // the same dwarf section (64bit)\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0, replaces @@DWARF syntax.

\n

label+imm expression introduced in PTX ISA version 3.2.

\n

Support for .b16 integers in dwarf-lines introduced in PTX ISA version 6.0.

\n

Support for defining label inside the DWARF section is introduced in PTX ISA version 7.2.

\n

label1-label2 expression introduced in PTX ISA version 7.5.

\n

Negative numbers in dwarf lines introduced in PTX ISA version 7.5.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
.section .debug_pubnames\n{\n    .b32    LpubNames_end0-LpubNames_begin0\n  LpubNames_begin0:\n    .b8     0x2b, 0x00, 0x00, 0x00, 0x02, 0x00\n    .b32    .debug_info\n  info_label1:\n    .b32    0x000006b5, 0x00000364, 0x61395a5f, 0x5f736f63\n    .b32    0x6e69616d, 0x63613031, 0x6150736f, 0x736d6172\n    .b8     0x00, 0x00, 0x00, 0x00, 0x00\n  LpubNames_end0:\n}\n\n.section .debug_info\n{\n    .b32 11430\n    .b8 2, 0\n    .b32 .debug_abbrev\n    .b8 8, 1, 108, 103, 101, 110, 102, 101, 58, 32, 69, 68, 71, 32, 52, 46, 49\n    .b8 0\n    .b32 3, 37, 176, -99\n    .b32 info_label1\n    .b32 .debug_loc+0x4\n    .b8 -11, 11, 112, 97\n    .b32 info_label1+12\n    .b64 -1\n    .b16 -5, -65535\n}\n
\n
\n
", "tooltip": "PTX section definition.\n\nSyntax\n\n.section section_name { dwarf-lines }\n\ndwarf-lines have the following formats:\n\n .b8 byte-list // comma-separated list of integers\n\n // in range [-128..255]\n\n .b16 int16-list // comma-separated list of integers\n\n // in range [-2^15..2^16-1]\n\n .b32 int32-list // comma-separated list of integers\n\n // in range [-2^31..2^32-1]\n\n label: // Define label inside the debug section\n\n .b64 int64-list // comma-separated list of integers\n\n // in range [-2^63..2^64-1]\n\n .b32 label\n\n .b64 label\n\n .b32 label+imm // a sum of label address plus a constant integer byte\n\n // offset(signed, 32bit)\n\n .b64 label+imm // a sum of label address plus a constant integer byte\n\n // offset(signed, 64bit)\n\n .b32 label1-label2 // a difference in label addresses between labels in\n\n // the same dwarf section (32bit)\n\n .b64 label3-label4 // a difference in label addresses between labels in\n\n // the same dwarf section (64bit)\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0, replaces @@DWARF syntax.\n\nlabel+imm expression introduced in PTX ISA version 3.2.\n\nSupport for .b16 integers in dwarf-lines introduced in PTX ISA version 6.0.\n\nSupport for defining label inside the DWARF section is introduced in PTX ISA version 7.2.\n\nlabel1-label2 expression introduced in PTX ISA version 7.5.\n\nNegative numbers in dwarf lines introduced in PTX ISA version 7.5.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.section .debug_pubnames\n\n{\n\n .b32 LpubNames_end0-LpubNames_begin0\n\n LpubNames_begin0:\n\n .b8 0x2b, 0x00, 0x00, 0x00, 0x02, 0x00\n\n .b32 .debug_info\n\n info_label1:\n\n .b32 0x000006b5, 0x00000364, 0x61395a5f, 0x5f736f63\n\n .b32 0x6e69616d, 0x63613031, 0x6150736f, 0x736d6172\n\n .b8 0x00, 0x00, 0x00, 0x00, 0x00\n\n LpubNames_end0:\n\n}\n\n.section .debug_info\n\n{\n\n .b32 11430\n\n .b8 2, 0\n\n .b32 .debug_abbrev\n\n .b8 8, 1, 108, 103, 101, 110, 102, 101, 58, 32, 69, 68, 71, 32, 52, 46, 49\n\n .b8 0\n\n .b32 3, 37, 176, -99\n\n .b32 info_label1\n\n .b32 .debug_loc+0x4\n\n .b8 -11, 11, 112, 97\n\n .b32 info_label1+12\n\n .b64 -1\n\n .b16 -5, -65535\n\n}\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-section" }; case "selp": return { "html": "For more information, visit selp .

Comparison and Selection Instructions: selp

\n\n\n

Select between source operands, based on the value of the predicate source operand.

\n

Syntax

\n
selp.type d, a, b, c;\n\n.type = { .b16, .b32, .b64,\n          .u16, .u32, .u64,\n          .s16, .s32, .s64,\n                .f32, .f64 };\n
\n
\n

Description

\n

Conditional selection. If c is True, a is stored in d, b otherwise. Operands\nd, a, and b must be of the same type. Operand c is a predicate.

\n

Semantics

\n
d = (c == 1) ? a : b;\n
\n
\n
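
Combined with setp, selp gives a branchless maximum; a minimal sketch (register and\npredicate names are assumptions):

\n
setp.gt.s32  p, a, b;\nselp.s32     d, a, b, p;   // d = max(a, b)\n
\n
\n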

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

selp.f64 requires sm_13 or higher.

\n

Examples

\n
    selp.s32  r0,r,g,p;\n@q  selp.f32  f0,t,x,xp;\n
\n
\n
", "tooltip": "Select between source operands, based on the value of the predicate source operand.\n\nSyntax\n\nselp.type d, a, b, c;\n\n.type = { .b16, .b32, .b64,\n\n .u16, .u32, .u64,\n\n .s16, .s32, .s64,\n\n .f32, .f64 };\n\nDescription\n\nConditional selection. If c is True, a is stored in d, b otherwise. Operands\n\nd, a, and b must be of the same type. Operand c is a predicate.\n\nSemantics\n\nd = (c == 1) ? a : b;\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nselp.f64 requires sm_13 or higher.\n\nExamples\n\n selp.s32 r0,r,g,p;\n\n@q selp.f32 f0,t,x,xp;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp" }; case "set": return { "html": "For more information, visit set , set .

Comparison and Selection Instructions: set

\n\n\n

Compare two numeric values with a relational operator, and optionally combine this result with a\npredicate value by applying a Boolean operator.

\n

Syntax

\n
set.CmpOp{.ftz}.dtype.stype         d, a, b;\nset.CmpOp.BoolOp{.ftz}.dtype.stype  d, a, b, {!}c;\n\n.CmpOp  = { eq, ne, lt, le, gt, ge, lo, ls, hi, hs,\n            equ, neu, ltu, leu, gtu, geu, num, nan };\n.BoolOp = { and, or, xor };\n.dtype  = { .u32, .s32, .f32 };\n.stype  = { .b16, .b32, .b64,\n            .u16, .u32, .u64,\n            .s16, .s32, .s64,\n                  .f32, .f64 };\n
\n
\n

Description

\n

Compares two numeric values and optionally combines the result with another predicate value by\napplying a Boolean operator. If this result is True, 1.0f is written for floating-point\ndestination types, and 0xffffffff is written for integer destination types. Otherwise,\n0x00000000 is written.

\n

Operand d has type .dtype; operands a and b have type .stype; operand c has\ntype .pred.

\n

Semantics

\n
t = (a CmpOp b) ? 1 : 0;\nif (isFloat(dtype))\n    d = BoolOp(t, c) ? 1.0f : 0x00000000;\nelse\n    d = BoolOp(t, c) ? 0xffffffff : 0x00000000;\n
\n
\n
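
Since the floating-point destination form writes 1.0f or 0x00000000, the result can serve\ndirectly as a multiplicative mask; a minimal sketch (register names are assumptions):

\n
set.lt.f32.f32  m, a, b;   // m = (a < b) ? 1.0f : 0.0f\nmul.f32         r, x, m;   // r is zeroed when the comparison fails\n
\n
\n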

Integer Notes

\n

The signed and unsigned comparison operators are eq, ne, lt, le, gt, ge.

\n

For unsigned values, the comparison operators lo, ls, hi, and hs for lower,\nlower-or-same, higher, and higher-or-same may be used instead of lt, le, gt, ge,\nrespectively.

\n

The untyped, bit-size comparisons are eq and ne.

\n

Floating Point Notes

\n

The ordered comparisons are eq, ne, lt, le, gt, ge. If either operand is NaN, the result is False.

\n

To aid comparison operations in the presence of NaN values, unordered versions are included:\nequ, neu, ltu, leu, gtu, geu. If both operands are numeric values (not\nNaN), then these comparisons have the same result as their ordered counterparts. If either\noperand is NaN, then the result of these comparisons is True.

\n

num returns True if both operands are numeric values (not NaN), and nan returns\nTrue if either operand is NaN.

\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

set.ftz.dtype.f32 flushes subnormal inputs to sign-preserving zero.

\n
\n
sm_1x

set.dtype.f64 supports subnormal numbers.

\n

set.dtype.f32 flushes subnormal inputs to sign-preserving zero.

\n
\n
\n

Modifier .ftz applies only to .f32 comparisons.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

set with .f64 source type requires sm_13 or higher.

\n

Examples

\n
@p  set.lt.and.f32.s32  d,a,b,r;\n    set.eq.u32.u32      d,i,n;\n
\n
\n
\n

Half Precision Comparison Instructions: set

\n\n\n

Compare two numeric values with a relational operator, and optionally combine this result with a\npredicate value by applying a Boolean operator.

\n

Syntax

\n
set.CmpOp{.ftz}.f16.stype            d, a, b;\nset.CmpOp.BoolOp{.ftz}.f16.stype     d, a, b, {!}c;\n\nset.CmpOp.bf16.stype                 d, a, b;\nset.CmpOp.BoolOp.bf16.stype          d, a, b, {!}c;\n\nset.CmpOp{.ftz}.dtype.f16            d, a, b;\nset.CmpOp.BoolOp{.ftz}.dtype.f16     d, a, b, {!}c;\n.dtype  = { .u16, .s16, .u32, .s32}\n\nset.CmpOp.dtype.bf16                 d, a, b;\nset.CmpOp.BoolOp.dtype.bf16          d, a, b, {!}c;\n.dtype  = { .u16, .s16, .u32, .s32}\n\nset.CmpOp{.ftz}.dtype.f16x2          d, a, b;\nset.CmpOp.BoolOp{.ftz}.dtype.f16x2   d, a, b, {!}c;\n.dtype  = { .f16x2, .u32, .s32}\n\nset.CmpOp.dtype.bf16x2               d, a, b;\nset.CmpOp.BoolOp.dtype.bf16x2        d, a, b, {!}c;\n.dtype  = { .bf16x2, .u32, .s32}\n\n.CmpOp  = { eq, ne, lt, le, gt, ge,\n            equ, neu, ltu, leu, gtu, geu, num, nan };\n.BoolOp = { and, or, xor };\n.stype  = { .b16, .b32, .b64,\n            .u16, .u32, .u64,\n            .s16, .s32, .s64,\n            .f16, .f32, .f64};\n
\n
\n

Description

\n

Compares two numeric values and optionally combines the result with another predicate value by\napplying a Boolean operator.

\n

The result of this computation is written to the destination register as follows:

\n
    \n
  • If result is True,

    \n
      \n
    • 0xffffffff is written for destination types .u32/.s32.

    • \n
    • 0xffff is written for destination types .u16/.s16.

    • \n
    • 1.0 in target precision floating point format is written for destination type .f16,\n.bf16.

    • \n
    \n
  • \n
  • If result is False,

    \n
      \n
    • 0x0 is written for all integer destination types.

    • \n
    • 0.0 in target precision floating point format is written for destination type .f16,\n.bf16.

    • \n
    \n
  • \n
\n

If the source type is .f16x2 or .bf16x2 then the results of the individual operations are\npacked into the 32-bit destination operand.

\n

Operand c has type .pred.

\n

Semantics

\n
if (stype == .f16x2 || stype == .bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    t[0]   = (fA[0] CmpOp fB[0]) ? 1 : 0;\n    t[1]   = (fA[1] CmpOp fB[1]) ? 1 : 0;\n    if (dtype == .f16x2 || dtype == .bf16x2) {\n        for (i = 0; i < 2; i++) {\n            d[i] = BoolOp(t[i], c) ? 1.0 : 0.0;\n        }\n    } else {\n        for (i = 0; i < 2; i++) {\n            d[i] = BoolOp(t[i], c) ? 0xffff : 0;\n        }\n    }\n} else if (dtype == .f16 || dtype == .bf16) {\n    t = (a CmpOp b) ? 1 : 0;\n    d = BoolOp(t, c) ? 1.0 : 0.0;\n} else  { // Integer destination type\n    trueVal = (isU16(dtype) || isS16(dtype)) ?  0xffff : 0xffffffff;\n    t = (a CmpOp b) ? 1 : 0;\n    d = BoolOp(t, c) ? trueVal : 0;\n}\n
\n
\n

Floating Point Notes

\n

The ordered comparisons are eq, ne, lt, le, gt, ge. If either operand is\nNaN, the result is False.

\n

To aid comparison operations in the presence of NaN values, unordered versions are included:\nequ, neu, ltu, leu, gtu, geu. If both operands are numeric values (not\nNaN), then these comparisons have the same result as their ordered counterparts. If either\noperand is NaN, then the result of these comparisons is True.

\n

num returns True if both operands are numeric values (not NaN), and nan returns\nTrue if either operand is NaN.

\n
\n
Subnormal numbers:

By default, subnormal numbers are supported.

\n

When .ftz modifier is specified then subnormal inputs and results are flushed to sign\npreserving zero.

\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 4.2.

\n

set.{u16, u32, s16, s32}.f16 and set.{u32, s32}.f16x2 are introduced in PTX ISA version 6.5.

\n

set.{u16, u32, s16, s32}.bf16, set.{u32, s32, bf16x2}.bf16x2,\nset.bf16.{s16,u16,f16,b16,s32,u32,f32,b32,s64,u64,f64,b64} are introduced in PTX ISA version\n7.8.

\n

Target ISA Notes

\n

Requires sm_53 or higher.

\n

set.{u16, u32, s16, s32}.bf16, set.{u32, s32, bf16x2}.bf16x2,\nset.bf16.{s16,u16,f16,b16,s32,u32,f32,b32,s64,u64,f64,b64} require sm_90 or higher.

\n

Examples

\n
set.lt.and.f16.f16  d,a,b,r;\nset.eq.f16x2.f16x2  d,i,n;\nset.eq.u32.f16x2    d,i,n;\nset.lt.and.u16.f16  d,a,b,r;\nset.ltu.or.bf16.f16    d,u,v,s;\nset.equ.bf16x2.bf16x2  d,j,m;\nset.geu.s32.bf16x2     d,j,m;\nset.num.xor.s32.bf16   d,u,v,s;\n
\n
\n
", "tooltip": "Compare two numeric values with a relational operator, and optionally combine this result with a\n\npredicate value by applying a Boolean operator.\n\nSyntax\n\nset.CmpOp{.ftz}. ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-set" }; case "setmaxnreg": return { "html": "For more information, visit setmaxnreg .

Miscellaneous Instructions: setmaxnreg

\n\n\n

Hint to change the number of registers owned by the warp.

\n

Syntax

\n
setmaxnreg.action.sync.aligned.u32 imm-reg-count;\n\n.action = { .inc, .dec };\n
\n
\n

Description

\n

setmaxnreg provides a hint to the system to update the maximum number of per-thread registers\nowned by the executing warp to the value specified by the imm-reg-count operand.

\n

Qualifier .dec is used to release extra registers such that the absolute per-thread maximum\nregister count is reduced from its current value to imm-reg-count. Qualifier .inc is used to\nrequest additional registers such that the absolute per-thread maximum register count is increased\nfrom its current value to imm-reg-count.

\n

A pool of available registers is maintained per-CTA. Register adjustments requested by the\nsetmaxnreg instructions are handled by supplying extra registers from this pool to the\nrequesting warp or by releasing extra registers from the requesting warp to this pool, depending\nupon the value of the .action qualifier.

\n

The setmaxnreg.inc instruction blocks the execution until enough registers are available in the\nCTA\u2019s register pool. After the instruction setmaxnreg.inc obtains new registers from the CTA\npool, the initial contents of the new registers are undefined. The new registers must be initialized\nbefore they are used.

\n

The same setmaxnreg instruction must be executed by all warps in a warpgroup. After executing a\nsetmaxnreg instruction, all warps in the warpgroup must synchronize explicitly before\nexecuting subsequent setmaxnreg instructions. If a setmaxnreg instruction is not executed by all\nwarps in the warpgroup, then the behavior is undefined.

\n

Operand imm-reg-count is an integer constant. The value of imm-reg-count must be in the\nrange 24 to 256 (both inclusive) and must be a multiple of 8.

\n

Changes to the register file of the warp always happen at the tail-end of the register file.

\n

The setmaxnreg instruction requires that the kernel has been launched with a valid value of the\nmaximum number of per-thread registers, specified via either the appropriate compile-time option or\nthe appropriate performance tuning directive. Otherwise, the setmaxnreg instruction may have no\neffect.

\n

When qualifier .dec is specified, the maximum number of per-thread registers owned by the warp\nprior to the execution of setmaxnreg instruction should be greater than or equal to the\nimm-reg-count. Otherwise, the behaviour is undefined.

\n

When qualifier .inc is specified, the maximum number of per-thread registers owned by the warp\nprior to the execution of setmaxnreg instruction should be less than or equal to the\nimm-reg-count. Otherwise, the behaviour is undefined.

\n

The mandatory .sync qualifier indicates that the setmaxnreg instruction causes the executing\nthread to wait until all threads in the warp execute the same setmaxnreg instruction before\nresuming execution.

\n

The mandatory .aligned qualifier indicates that all threads in the warpgroup must execute the\nsame setmaxnreg instruction. In conditionally executed code, setmaxnreg instruction should\nonly be used if it is known that all threads in warpgroup evaluate the condition identically,\notherwise the behavior is undefined.

\n
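
A hedged sketch of the typical producer/consumer register split (the counts below are illustrative\nvalues only; each is a multiple of 8 in the range 24..256):

\n
// consumer warpgroup releases registers it does not need ...\nsetmaxnreg.dec.sync.aligned.u32 40;\n// ... so a producer warpgroup can later acquire them\nsetmaxnreg.inc.sync.aligned.u32 232;\n
\n
\n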

PTX ISA Notes

\n

Introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

Requires sm_90a.

\n

Examples

\n
setmaxnreg.dec.sync.aligned.u32 64;\nsetmaxnreg.inc.sync.aligned.u32 192;\n
\n
\n
", "tooltip": "Hint to change the number of registers owned by the warp.\n\nSyntax\n\nsetmaxnreg.action.sync.aligned.u32 imm-reg-count;\n\n.action = { .inc, .dec };\n\nDescription\n\nsetmaxnreg provides a hint to the system to update the maximum number of per-thread registers\n\nowned by the executing warp to the value specified by the imm-reg-count operand.\n\nQualifier .dec is used to release extra registers such that the absolute per-thread maximum\n\nregister count is reduced from its current value to imm-reg-count. Qualifier .inc is used to\n\nrequest additional registers such that the absolute per-thread maximum register count is increased\n\nfrom its current value to imm-reg-count.\n\nA pool of available registers is maintained per-CTA. Register adjustments requested by the\n\nsetmaxnreg instructions are handled by supplying extra registers from this pool to the\n\nrequesting warp or by releasing extra registers from the requesting warp to this pool, depending\n\nupon the value of the .action qualifier.\n\nThe setmaxnreg.inc instruction blocks the execution until enough registers are available in the\n\nCTA\u2019s register pool. After the instruction setmaxnreg.inc obtains new registers from the CTA\n\npool, the initial contents of the new registers are undefined. The new registers must be initialized\n\nbefore they are used.\n\nThe same setmaxnreg instruction must be executed by all warps in a warpgroup. After executing a\n\nsetmaxnreg instruction, all warps in the warpgroup must synchronize explicitly before\n\nexecuting subsequent setmaxnreg instructions. If a setmaxnreg instruction is not executed by all\n\nwarps in the warpgroup, then the behavior is undefined.\n\nOperand imm-reg-count is an integer constant. The value of imm-reg-count must be in the\n\nrange 24 to 256 (both inclusive) and must be a multiple of 8.\n\nChanges to the register file of the warp always happen at the tail-end of the register file.\n\nThe setmaxnreg instruction requires that the kernel has been launched with a valid value of\n\nmaximum number of per-thread registers specified via the appropriate compilation via the appropriate\n\ncompile-time option or the appropriate performance tuning directive. Otherwise, the setmaxnreg\n\ninstruction may have no effect.\n\nWhen qualifier .dec is specified, the maximum number of per-thread registers owned by the warp\n\nprior to the execution of setmaxnreg instruction should be greater than or equal to the\n\nimm-reg-count. Otherwise, the behaviour is undefined.\n\nWhen qualifier .inc is specified, the maximum number of per-thread registers owned by the warp\n\nprior to the execution of setmaxnreg instruction should be less than or equal to the\n\nimm-reg-count. Otherwise, the behaviour is undefined.\n\nThe mandatory .sync qualifier indicates that setmaxnreg instruction causes the executing\n\nthread to wait until all threads in the warp execute the same setmaxnreg instruction before\n\nresuming execution.\n\nThe mandatory .aligned qualifier indicates that all threads in the warpgroup must execute the\n\nsame setmaxnreg instruction. 
In conditionally executed code, setmaxnreg instruction should\n\nonly be used if it is known that all threads in warpgroup evaluate the condition identically,\n\notherwise the behavior is undefined.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 8.0.\n\nTarget ISA Notes\n\nRequires sm_90a.\n\nExamples\n\nsetmaxnreg.dec.sync.aligned.u32 64;\n\nsetmaxnreg.inc.sync.aligned.u32 192;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg" }; case "setp": return { "html": "For more information, visit setp , setp .

Comparison and Selection Instructions: setp

\n\n\n

Compare two numeric values with a relational operator, and (optionally) combine this result with a\npredicate value by applying a Boolean operator.

\n

Syntax

\n
setp.CmpOp{.ftz}.type         p[|q], a, b;\nsetp.CmpOp.BoolOp{.ftz}.type  p[|q], a, b, {!}c;\n\n.CmpOp  = { eq, ne, lt, le, gt, ge, lo, ls, hi, hs,\n            equ, neu, ltu, leu, gtu, geu, num, nan };\n.BoolOp = { and, or, xor };\n.type   = { .b16, .b32, .b64,\n            .u16, .u32, .u64,\n            .s16, .s32, .s64,\n                  .f32, .f64 };\n
\n
\n

Description

\n

Compares two values and combines the result with another predicate value by applying a Boolean\noperator. This result is written to the first destination operand. A related value computed using\nthe complement of the compare result is written to the second destination operand.

\n

Applies to all numeric types. Operands a and b have type .type; operands p, q,\nand c have type .pred. The sink symbol \u2018_\u2019 may be used in place of any one of the\ndestination operands.

\n

Semantics

\n
t = (a CmpOp b) ? 1 : 0;\np = BoolOp(t, c);\nq = BoolOp(!t, c);\n
\n
\n
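
A minimal sketch (registers, predicates, and labels are assumptions) using both destination\npredicates to steer control flow:

\n
setp.lt.s32  p|q, i, n;   // p = (i < n), q = !(i < n)\n@p  bra  L_body;          // hypothetical labels\n@q  bra  L_done;\n
\n
\n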

Integer Notes

\n

The signed and unsigned comparison operators are eq, ne, lt, le, gt, ge.

\n

For unsigned values, the comparison operators lo, ls, hi, and hs for lower,\nlower-or-same, higher, and higher-or-same may be used instead of lt, le, gt, ge,\nrespectively.

\n

The untyped, bit-size comparisons are eq and ne.

\n

Floating Point Notes

\n

The ordered comparisons are eq, ne, lt, le, gt, ge. If either operand is NaN, the result is False.

\n

To aid comparison operations in the presence of NaN values, unordered versions are included:\nequ, neu, ltu, leu, gtu, geu. If both operands are numeric values (not\nNaN), then these comparisons have the same result as their ordered counterparts. If either\noperand is NaN, then the result of these comparisons is True.

\n

num returns True if both operands are numeric values (not NaN), and nan returns\nTrue if either operand is NaN.

\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

setp.ftz.dtype.f32 flushes subnormal inputs to sign-preserving zero.

\n
\n
sm_1x

setp.dtype.f64 supports subnormal numbers.

\n

setp.dtype.f32 flushes subnormal inputs to sign-preserving zero.

\n
\n
\n

Modifier .ftz applies only to .f32 comparisons.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

setp with .f64 source type requires sm_13 or higher.

\n

Examples

\n
    setp.lt.and.s32  p|q,a,b,r;\n@q  setp.eq.u32      p,i,n;\n
\n
\n
\n

Half Precision Comparison Instructions: setp

\n\n\n

Compare two numeric values with a relational operator, and optionally combine this result with a\npredicate value by applying a Boolean operator.

\n

Syntax

\n
setp.CmpOp{.ftz}.f16           p, a, b;\nsetp.CmpOp.BoolOp{.ftz}.f16    p, a, b, {!}c;\n\nsetp.CmpOp{.ftz}.f16x2         p|q, a, b;\nsetp.CmpOp.BoolOp{.ftz}.f16x2  p|q, a, b, {!}c;\n\nsetp.CmpOp.bf16                p, a, b;\nsetp.CmpOp.BoolOp.bf16         p, a, b, {!}c;\n\nsetp.CmpOp.bf16x2              p|q, a, b;\nsetp.CmpOp.BoolOp.bf16x2       p|q, a, b, {!}c;\n\n.CmpOp  = { eq, ne, lt, le, gt, ge,\n            equ, neu, ltu, leu, gtu, geu, num, nan };\n.BoolOp = { and, or, xor };\n
\n
\n

Description

\n

Compares two values and combines the result with another predicate value by applying a Boolean\noperator. This result is written to the destination operand.

\n

Operands c, p, and q have type .pred.

\n

For instruction type .f16, operands a and b have type .b16 or .f16.

\n

For instruction type .f16x2, operands a and b have type .b32.

\n

For instruction type .bf16, operands a and b have type .b16.

\n

For instruction type .bf16x2, operands a and b have type .b32.

\n

Semantics

\n
if (type == .f16 || type == .bf16) {\n     t = (a CmpOp b) ? 1 : 0;\n     p = BoolOp(t, c);\n} else if (type == .f16x2 || type == .bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    t[0] = (fA[0] CmpOp fB[0]) ? 1 : 0;\n    t[1] = (fA[1] CmpOp fB[1]) ? 1 : 0;\n    p = BoolOp(t[0], c);\n    q = BoolOp(t[1], c);\n}\n
\n
\n

Floating Point Notes

\n

The ordered comparisons are eq, ne, lt, le, gt, ge. If either operand is\nNaN, the result is False.

\n

To aid comparison operations in the presence of NaN values, unordered versions are included:\nequ, neu, ltu, leu, gtu, geu. If both operands are numeric values (not\nNaN), then these comparisons have the same result as their ordered counterparts. If either\noperand is NaN, then the result of these comparisons is True.

\n

num returns True if both operands are numeric values (not NaN), and nan returns\nTrue if either operand is NaN.

\n
\n
Subnormal numbers:

By default, subnormal numbers are supported.

\n

setp.ftz.{f16,f16x2} flushes subnormal inputs to sign-preserving zero.

\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 4.2.

\n

setp.{bf16/bf16x2} introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_53 or higher.

\n

setp.{bf16/bf16x2} requires sm_90 or higher.

\n

Examples

\n
setp.lt.and.f16x2  p|q,a,b,r;\n@q  setp.eq.f16    p,i,n;\n\nsetp.gt.or.bf16x2  u|v,c,d,s;\n@q  setp.eq.bf16   u,j,m;\n
\n
\n
", "tooltip": "=====Comparison and Selection Instructions: setp\n\n\n\nCompare\u00a0two numeric values with a relational operator, and (optionally) combine this result with a\n\npredicate value by applying a Boolean operator.\n\nSyntax\n\nsetp.CmpOp{.ftz}.type p[|q], a, b;\n\nsetp.CmpOp.BoolOp{.ftz}.type p[|q], a, b, {!}c;\n\n.CmpOp = { eq, ne, lt, le, gt, ge, lo, ls, hi, hs,\n\n equ, neu, ltu, leu, gtu, geu, num, nan };\n\n.BoolOp = { and, or, xor };\n\n.type = { .b16, .b...\n\n=====Half Precision Comparison Instructions: setp\n\n\n\nCompare two numeric values with a relational operator, and optionally combine this result with a\n\npredicate value by applying a Boolean operator.\n\nSyntax\n\nsetp.CmpOp{.ftz}.f16 p, a, b;\n\nsetp.CmpOp.BoolOp{.ftz}.f16 p, a, b, {!}c;\n\nsetp.CmpOp{.ftz}.f16x2 p|q, a, b;\n\nsetp.CmpOp.BoolOp{.ftz}.f16x2 p|q, a, b, {!}c;\n\nsetp.CmpOp.bf16 p, a, b;\n\nsetp.CmpOp.BoolOp.bf16 p... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-setp" }; case "shf": return { "html": "For more information, visit shf .

Logic and Shift Instructions: shf

\n\n\n

Funnel shift.

\n

Syntax

\n
shf.l.mode.b32  d, a, b, c;  // left shift\nshf.r.mode.b32  d, a, b, c;  // right shift\n\n.mode = { .clamp, .wrap };\n
\n
\n

Description

\n

Shift the 64-bit value formed by concatenating operands a and b left or right by the amount\nspecified by the unsigned 32-bit value in c. Operand b holds bits 63:32 and operand a\nholds bits 31:0 of the 64-bit source value. The source is shifted left or right by the clamped\nor wrapped value in c. For shf.l, the most-significant 32-bits of the result are written\ninto d; for shf.r, the least-significant 32-bits of the result are written into d.

\n

Semantics

\n
u32  n = (.mode == .clamp) ? min(c, 32) : c & 0x1f;\nswitch (shf.dir) {  // shift concatenation of [b, a]\n    case shf.l:     // extract 32 msbs\n           u32  d = (b << n)      | (a >> (32-n));\n    case shf.r:     // extract 32 lsbs\n           u32  d = (b << (32-n)) | (a >> n);\n}\n
\n
\n

Notes

\n

Use funnel shift for multi-word shift operations and for rotate operations. The shift amount is\nlimited to the range 0..32 in clamp mode and 0..31 in wrap mode, so shifting multi-word\nvalues by distances greater than 32 requires first moving 32-bit words, then using shf to shift\nthe remaining 0..31 distance.

\n

To shift data sizes greater than 64 bits to the right, use repeated shf.r instructions applied\nto adjacent words, operating from least-significant word towards most-significant word. At each\nstep, a single word of the shifted result is computed. The most-significant word of the result is\ncomputed using a shr.{u32,s32} instruction, which zero or sign fills based on the instruction\ntype.

\n

To shift data sizes greater than 64 bits to the left, use repeated shf.l instructions applied to\nadjacent words, operating from most-significant word towards least-significant word. At each step, a\nsingle word of the shifted result is computed. The least-significant word of the result is computed\nusing a shl instruction.

\n

Use funnel shift to perform 32-bit left or right rotate by supplying the same value for source\narguments a and b.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.1.

\n

Target ISA Notes

\n

Requires sm_32 or higher.

\n

Example

\n
shf.l.clamp.b32  r3,r1,r0,16;\n\n// 128-bit left shift; n < 32\n// [r7,r6,r5,r4] = [r3,r2,r1,r0] << n\nshf.l.clamp.b32  r7,r2,r3,n;\nshf.l.clamp.b32  r6,r1,r2,n;\nshf.l.clamp.b32  r5,r0,r1,n;\nshl.b32          r4,r0,n;\n\n// 128-bit right shift, arithmetic; n < 32\n// [r7,r6,r5,r4] = [r3,r2,r1,r0] >> n\nshf.r.clamp.b32  r4,r0,r1,n;\nshf.r.clamp.b32  r5,r1,r2,n;\nshf.r.clamp.b32  r6,r2,r3,n;\nshr.s32          r7,r3,n;     // result is sign-extended\n\nshf.r.clamp.b32  r1,r0,r0,n;  // rotate right by n; n < 32\nshf.l.clamp.b32  r1,r0,r0,n;  // rotate left by n; n < 32\n\n// extract 32-bits from [r1,r0] starting at position n < 32\nshf.r.clamp.b32  r0,r0,r1,n;\n
\n
\n
", "tooltip": "Funnel shift.\n\nSyntax\n\nshf.l.mode.b32 d, a, b, c; // left shift\n\nshf.r.mode.b32 d, a, b, c; // right shift\n\n.mode = { .clamp, .wrap };\n\nDescription\n\nShift the 64-bit value formed by concatenating operands a and b left or right by the amount\n\nspecified by the unsigned 32-bit value in c. Operand b holds bits 63:32 and operand a\n\nholds bits 31:0 of the 64-bit source value. The source is shifted left or right by the clamped\n\nor wrapped value in c. For shf.l, the most-significant 32-bits of the result are written\n\ninto d; for shf.r, the least-significant 32-bits of the result are written into d.\n\nSemantics\n\nu32 n = (.mode == .clamp) ? min(c, 32) : c & 0x1f;\n\nswitch (shf.dir) { // shift concatenation of [b, a]\n\n case shf.l: // extract 32 msbs\n\n u32 d = (b << n) | (a >> (32-n));\n\n case shf.r: // extract 32 lsbs\n\n u32 d = (b << (32-n)) | (a >> n);\n\n}\n\nNotes\n\nUse funnel shift for multi-word shift operations and for rotate operations. The shift amount is\n\nlimited to the range 0..32 in clamp mode and 0..31 in wrap mode, so shifting multi-word\n\nvalues by distances greater than 32 requires first moving 32-bit words, then using shf to shift\n\nthe remaining 0..31 distance.\n\nTo shift data sizes greater than 64 bits to the right, use repeated shf.r instructions applied\n\nto adjacent words, operating from least-significant word towards most-significant word. At each\n\nstep, a single word of the shifted result is computed. The most-significant word of the result is\n\ncomputed using a shr.{u32,s32} instruction, which zero or sign fills based on the instruction\n\ntype.\n\nTo shift data sizes greater than 64 bits to the left, use repeated shf.l instructions applied to\n\nadjacent words, operating from most-significant word towards least-significant word. At each step, a\n\nsingle word of the shifted result is computed. The least-significant word of the result is computed\n\nusing a shl instruction.\n\nUse funnel shift to perform 32-bit left or right rotate by supplying the same value for source\n\narguments a and b.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.1.\n\nTarget ISA Notes\n\nRequires sm_32 or higher.\n\nExample\n\nshf.l.clamp.b32 r3,r1,r0,16;\n\n// 128-bit left shift; n < 32\n\n// [r7,r6,r5,r4] = [r3,r2,r1,r0] << n\n\nshf.l.clamp.b32 r7,r2,r3,n;\n\nshf.l.clamp.b32 r6,r1,r2,n;\n\nshf.l.clamp.b32 r5,r0,r1,n;\n\nshl.b32 r4,r0,n;\n\n// 128-bit right shift, arithmetic; n < 32\n\n// [r7,r6,r5,r4] = [r3,r2,r1,r0] >> n\n\nshf.r.clamp.b32 r4,r0,r1,n;\n\nshf.r.clamp.b32 r5,r1,r2,n;\n\nshf.r.clamp.b32 r6,r2,r3,n;\n\nshr.s32 r7,r3,n; // result is sign-extended\n\nshf.r.clamp.b32 r1,r0,r0,n; // rotate right by n; n < 32\n\nshf.l.clamp.b32 r1,r0,r0,n; // rotate left by n; n < 32\n\n// extract 32-bits from [r1,r0] starting at position n < 32\n\nshf.r.clamp.b32 r0,r0,r1,n;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shf" }; case "shfl": return { "html": "For more information, visit shfl.sync .

Data Movement and Conversion Instructions: shfl.sync

\n\n\n

Register data shuffle within threads of a warp.

\n

Syntax

\n
shfl.sync.mode.b32  d[|p], a, b, c, membermask;\n\n.mode = { .up, .down, .bfly, .idx };\n
\n
\n

Description

\n

Exchange register data between threads of a warp.

\n

shfl.sync will cause the executing thread to wait until all non-exited threads corresponding to\nmembermask have executed shfl.sync with the same qualifiers and the same membermask value\nbefore resuming execution.

\n

Operand membermask specifies a 32-bit integer mask indicating the threads participating in the\nbarrier, where each bit position corresponds to a thread\u2019s laneid.

\n

shfl.sync exchanges register data between threads in membermask.

\n

Each thread in the currently executing warp will compute a source lane index j based on input\noperands b and c and the mode. If the computed source lane index j is in range, the\nthread will copy the input operand a from lane j into its own destination register d;\notherwise, the thread will simply copy its own input a to destination d. The optional\ndestination predicate p is set to True if the computed source lane is in range, and\notherwise set to False.

\n

Note that an out of range value of b may still result in a valid computed source lane index\nj. In this case, a data transfer occurs and the destination predicate p is True.

\n

Note that results are undefined if a thread sources a register from an inactive thread or a thread\nthat is not in membermask.

\n

Operand b specifies a source lane or source lane offset, depending on the mode.

\n

Operand c contains two packed values specifying a mask for logically splitting warps into\nsub-segments and an upper bound for clamping the source lane index.

\n

The behavior of shfl.sync is undefined if the executing thread is not in the membermask.

\n
\n

Note

\n

For .target sm_6x or below, all threads in membermask must execute the same shfl.sync\ninstruction in convergence, and only threads belonging to some membermask can be active when\nthe shfl.sync instruction is executed. Otherwise, the behavior is undefined.

\n
\n

Semantics

\n
// wait for all threads in membermask to arrive\nwait_for_specified_threads(membermask);\n\nlane[4:0]  = [Thread].laneid;  // position of thread in warp\nbval[4:0] = b[4:0];            // source lane or lane offset (0..31)\ncval[4:0] = c[4:0];            // clamp value\nsegmask[4:0] = c[12:8];\n\n// get value of source register a if thread is active and\n// guard predicate true, else unpredictable\nif (isActive(Thread) && isGuardPredicateTrue(Thread)) {\n    SourceA[lane] = a;\n} else {\n    // Value of SourceA[lane] is unpredictable for\n    // inactive/predicated-off threads in warp\n}\nmaxLane = (lane[4:0] & segmask[4:0]) | (cval[4:0] & ~segmask[4:0]);\nminLane = (lane[4:0] & segmask[4:0]);\n\nswitch (.mode) {\n    case .up:    j = lane - bval; pval = (j >= maxLane); break;\n    case .down:  j = lane + bval; pval = (j <= maxLane); break;\n    case .bfly:  j = lane ^ bval; pval = (j <= maxLane); break;\n    case .idx:   j = minLane  | (bval[4:0] & ~segmask[4:0]);\n                                 pval = (j <= maxLane); break;\n}\nif (!pval) j = lane;  // copy from own lane\nd = SourceA[j];       // copy input a from lane j\nif (dest predicate selected)\n    p = pval;\n
\n
\n
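
A hedged sketch of the classic warp-wide sum built from butterfly shuffles, assuming all 32 lanes\nare active (membermask 0xffffffff) and no sub-segmenting (operand c is 0x1f):

\n
shfl.sync.bfly.b32  t, v, 16, 0x1f, 0xffffffff;\nadd.u32             v, v, t;\nshfl.sync.bfly.b32  t, v,  8, 0x1f, 0xffffffff;\nadd.u32             v, v, t;\nshfl.sync.bfly.b32  t, v,  4, 0x1f, 0xffffffff;\nadd.u32             v, v, t;\nshfl.sync.bfly.b32  t, v,  2, 0x1f, 0xffffffff;\nadd.u32             v, v, t;\nshfl.sync.bfly.b32  t, v,  1, 0x1f, 0xffffffff;\nadd.u32             v, v, t;  // every lane now holds the warp total\n
\n
\n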

PTX ISA Notes

\n

Introduced in PTX ISA version 6.0.

\n

Target ISA Notes

\n

Requires sm_30 or higher.

\n

Examples

\n
shfl.sync.up.b32  Ry|p, Rx, 0x1,  0x0, 0xffffffff;\n
\n
\n
", "tooltip": "Register data shuffle within threads of a warp.\n\nSyntax\n\nshfl.sync.mode.b32 d[|p], a, b, c, membermask;\n\n.mode = { .up, .down, .bfly, .idx };\n\nDescription\n\nExchange register data between threads of a warp.\n\nshfl.sync will cause executing thread to wait until all non-exited threads corresponding to\n\nmembermask have executed shfl.sync with the same qualifiers and same membermask value\n\nbefore resuming execution.\n\nOperand membermask specifies a 32-bit integer which is a mask indicating threads participating\n\nin barrier where the bit position corresponds to thread\u2019s laneid.\n\nshfl.sync exchanges register data between threads in membermask.\n\nEach thread in the currently executing warp will compute a source lane index j based on input\n\noperands b and c and the mode. If the computed source lane index j is in range, the\n\nthread will copy the input operand a from lane j into its own destination register d;\n\notherwise, the thread will simply copy its own input a to destination d. The optional\n\ndestination predicate p is set to True if the computed source lane is in range, and\n\notherwise set to False.\n\nNote that an out of range value of b may still result in a valid computed source lane index\n\nj. In this case, a data transfer occurs and the destination predicate p is True.\n\nNote that results are undefined if a thread sources a register from an inactive thread or a thread\n\nthat is not in membermask.\n\nOperand b specifies a source lane or source lane offset, depending on the mode.\n\nOperand c contains two packed values specifying a mask for logically splitting warps into\n\nsub-segments and an upper bound for clamping the source lane index.\n\nThe behavior of shfl.sync is undefined if the executing thread is not in the membermask.\n\nNote\n\nFor .target sm_6x or below, all threads in membermask must execute the same shfl.sync\n\ninstruction in convergence, and only threads belonging to some membermask can be active when\n\nthe shfl.sync instruction is executed. 
Otherwise, the behavior is undefined.\n\nSemantics\n\n// wait for all threads in membermask to arrive\n\nwait_for_specified_threads(membermask);\n\nlane[4:0] = [Thread].laneid; // position of thread in warp\n\nbval[4:0] = b[4:0]; // source lane or lane offset (0..31)\n\ncval[4:0] = c[4:0]; // clamp value\n\nsegmask[4:0] = c[12:8];\n\n// get value of source register a if thread is active and\n\n// guard predicate true, else unpredictable\n\nif (isActive(Thread) && isGuardPredicateTrue(Thread)) {\n\n SourceA[lane] = a;\n\n} else {\n\n // Value of SourceA[lane] is unpredictable for\n\n // inactive/predicated-off threads in warp\n\n}\n\nmaxLane = (lane[4:0] & segmask[4:0]) | (cval[4:0] & ~segmask[4:0]);\n\nminLane = (lane[4:0] & segmask[4:0]);\n\nswitch (.mode) {\n\n case .up: j = lane - bval; pval = (j >= maxLane); break;\n\n case .down: j = lane + bval; pval = (j <= maxLane); break;\n\n case .bfly: j = lane ^ bval; pval = (j <= maxLane); break;\n\n case .idx: j = minLane | (bval[4:0] & ~segmask[4:0]);\n\n pval = (j <= maxLane); break;\n\n}\n\nif (!pval) j = lane; // copy from own lane\n\nd = SourceA[j]; // copy input a from lane j\n\nif (dest predicate selected)\n\n p = pval;\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.0.\n\nTarget ISA Notes\n\nRequires sm_30 or higher.\n\nExamples\n\nshfl.sync.up.b32 Ry|p, Rx, 0x1, 0x0, 0xffffffff;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync" }; case "shl": return { "html": "For more information, visit shl .

Logic and Shift Instructions: shl

\n\n\n

Shift bits left, zero-fill on right.

\n

Syntax

\n
shl.type d, a, b;\n\n.type = { .b16, .b32, .b64 };\n
\n
\n

Description

\n

Shift a left by the amount specified by unsigned 32-bit value in b.

\n

Semantics

\n
d = a << b;\n
\n
\n

Notes

\n

Shift amounts greater than the register width N are clamped to N.

\n

The sizes of the destination and first source operand must match, but not necessarily the type. The\nb operand must be a 32-bit value, regardless of the instruction type.

\n
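
A small sketch of the clamping rule (register names are assumptions):

\n
shl.b32  d, a, 5;    // d = a << 5\nshl.b32  z, a, 40;   // shift amount clamps to 32, so z = 0\n
\n
\n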

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Example

\n
shl.b32  q,a,2;\n
\n
\n
", "tooltip": "Shift bits left, zero-fill on right.\n\nSyntax\n\nshl.type d, a, b;\n\n.type = { .b16, .b32, .b64 };\n\nDescription\n\nShift a left by the amount specified by unsigned 32-bit value in b.\n\nSemantics\n\nd = a << b;\n\nNotes\n\nShift amounts greater than the register width N are clamped to N.\n\nThe sizes of the destination and first source operand must match, but not necessarily the type. The\n\nb operand must be a 32-bit value, regardless of the instruction type.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExample\n\nshl.b32 q,a,2;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shl" }; case "shr": return { "html": "For more information, visit shr .

Logic and Shift Instructions: shr

\n\n\n

Shift bits right, sign or zero-fill on left.

\n

Syntax

\n
shr.type d, a, b;\n\n.type = { .b16, .b32, .b64,\n          .u16, .u32, .u64,\n          .s16, .s32, .s64 };\n
\n
\n

Description

\n

Shift a right by the amount specified by unsigned 32-bit value in b. Signed shifts fill with\nthe sign bit, unsigned and untyped shifts fill with 0.

\n

Semantics

\n
d = a >> b;\n
\n
\n

Notes

\n

Shift amounts greater than the register width N are clamped to N.

\n

The sizes of the destination and first source operand must match, but not necessarily the type. The\nb operand must be a 32-bit value, regardless of the instruction type.

\n

Bit-size types are included for symmetry with shl.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Example

\n
shr.u16  c,a,2;\nshr.s32  i,i,1;\nshr.b16  k,i,j;\n
\n
\n
", "tooltip": "Shift bits right, sign or zero-fill on left.\n\nSyntax\n\nshr.type d, a, b;\n\n.type = { .b16, .b32, .b64,\n\n .u16, .u32, .u64,\n\n .s16, .s32, .s64 };\n\nDescription\n\nShift a right by the amount specified by unsigned 32-bit value in b. Signed shifts fill with\n\nthe sign bit, unsigned and untyped shifts fill with 0.\n\nSemantics\n\nd = a >> b;\n\nNotes\n\nShift amounts greater than the register width N are clamped to N.\n\nThe sizes of the destination and first source operand must match, but not necessarily the type. The\n\nb operand must be a 32-bit value, regardless of the instruction type.\n\nBit-size types are included for symmetry with shl.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExample\n\nshr.u16 c,a,2;\n\nshr.s32 i,i,1;\n\nshr.b16 k,i,j;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shr" }; case "sin": return { "html": "For more information, visit sin(fp) .

Floating Point Instructions: sin

\n\n\n

Find the sine of a value.

\n

Syntax

\n
sin.approx{.ftz}.f32  d, a;\n
\n
\n

Description

\n

Find the sine of the angle a (in radians).

\n

Semantics

\n
d = sin(a);\n
\n
\n

Notes

\n

sin.approx.f32 implements a fast approximation to sine.

\n
Input         Result\n-Inf          NaN\n-subnormal    -0.0\n-0.0          -0.0\n+0.0          +0.0\n+subnormal    +0.0\n+Inf          NaN\nNaN           NaN\n
\n
\n

The maximum absolute error is 2^-20.9 in quadrant 00.

\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

sin.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
sm_1x

Subnormal inputs and results are flushed to sign-preserving zero.

\n
\n
\n

PTX ISA Notes

\n

sin.f32 introduced in PTX ISA version 1.0. Explicit modifiers .approx and .ftz\nintroduced in PTX ISA version 1.4.

\n

For PTX ISA version 1.4 and later, the .approx modifier is required.

\n

For PTX ISA versions 1.0 through 1.3, sin.f32 defaults to sin.approx.ftz.f32.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
sin.approx.ftz.f32  sa, a;\n
\n
\n
", "tooltip": "Find the sine of a value.\n\nSyntax\n\nsin.approx{.ftz}.f32 d, a;\n\nDescription\n\nFind the sine of the angle a (in radians).\n\nSemantics\n\nd = sin(a);\n\nNotes\n\nsin.approx.f32 implements a fast approximation to sine.\n\n\n\nInput\n\nResult\n\n-Inf\n\nNaN\n\n-subnormal\n\n-0.0\n\n-0.0\n\n-0.0\n\n+0.0\n\n+0.0\n\n+subnormal\n\n+0.0\n\n+Inf\n\nNaN\n\nNaN\n\nNaN\n\nThe maximum absolute error is 2-20.9 in quadrant 00.\n\nSubnormal numbers:\n\nsm_20+By default, subnormal numbers are supported.\n\nsin.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nsm_1xSubnormal inputs and results to sign-preserving zero.\n\nPTX ISA Notes\n\nsin.f32 introduced in PTX ISA version 1.0. Explicit modifiers .approx and .ftz\n\nintroduced in PTX ISA version 1.4.\n\nFor PTX ISA version 1.4 and later, the .approx modifier is required.\n\nFor PTX ISA versions 1.0 through 1.3, sin.f32 defaults to sin.approx.ftz.f32.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nsin.approx.ftz.f32 sa, a;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sin" }; case "slct": return { "html": "For more information, visit slct .

Comparison and Selection Instructions: slct

\n\n\n

Select one source operand, based on the sign of the third operand.

\n

Syntax

\n
slct.dtype.s32        d, a, b, c;\nslct{.ftz}.dtype.f32  d, a, b, c;\n\n.dtype = { .b16, .b32, .b64,\n           .u16, .u32, .u64,\n           .s16, .s32, .s64,\n                 .f32, .f64 };\n
\n
\n

Description

\n

Conditional selection. If c \u2265 0, a is stored in d, otherwise b is stored in\nd. Operands d, a, and b are treated as a bitsize type of the same width as the first\ninstruction type; operand c must match the second instruction type (.s32 or .f32). The\nselected input is copied to the output without modification.

\n

Semantics

\n
d = (c >= 0) ? a : b;\n
\n
\n
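
A minimal sketch of a branchless integer absolute value built on slct (register names are\nassumptions):

\n
neg.s32       na, a;\nslct.s32.s32  d, a, na, a;   // d = (a >= 0) ? a : -a\n
\n
\n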

Floating Point Notes

\n

For .f32 comparisons, negative zero equals zero.

\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

slct.ftz.dtype.f32 flushes subnormal values of operand c to sign-preserving zero, and\noperand a is selected.

\n
\n
sm_1x

slct.dtype.f32 flushes subnormal values of operand c to sign-preserving zero, and operand\na is selected.

\n
\n
\n

Modifier .ftz applies only to .f32 comparisons.

\n

If operand c is NaN, the comparison is unordered and operand b is selected.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

slct.f64 requires sm_13 or higher.

\n

Examples

\n
slct.u32.s32  x, y, z, val;\nslct.ftz.u64.f32  A, B, C, fval;\n
\n
\n
", "tooltip": "Select one source operand, based on the sign of the third operand.\n\nSyntax\n\nslct.dtype.s32 d, a, b, c;\n\nslct{.ftz}.dtype.f32 d, a, b, c;\n\n.dtype = { .b16, .b32, .b64,\n\n .u16, .u32, .u64,\n\n .s16, .s32, .s64,\n\n .f32, .f64 };\n\nDescription\n\nConditional selection. If c \u2265 0, a is stored in d, otherwise b is stored in\n\nd. Operands d, a, and b are treated as a bitsize type of the same width as the first\n\ninstruction type; operand c must match the second instruction type (.s32 or .f32). The\n\nselected input is copied to the output without modification.\n\nSemantics\n\nd = (c >= 0) ? a : b;\n\nFloating Point Notes\n\nFor .f32 comparisons, negative zero equals zero.\n\nSubnormal numbers:\n\nsm_20+By default, subnormal numbers are supported.\n\nslct.ftz.dtype.f32 flushes subnormal values of operand c to sign-preserving zero, and\n\noperand a is selected.\n\nsm_1xslct.dtype.f32 flushes subnormal values of operand c to sign-preserving zero, and operand\n\na is selected.\n\nModifier .ftz applies only to .f32 comparisons.\n\nIf operand c is NaN, the comparison is unordered and operand b is selected.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nslct.f64 requires sm_13 or higher.\n\nExamples\n\nslct.u32.s32 x, y, z, val;\n\nslct.ftz.u64.f32 A, B, C, fval;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-slct" }; case "smid": return { "html": "For more information, visit smid .

Special Registers: %smid

\n\n\n

SM identifier.

\n

Syntax (predefined)

\n
.sreg .u32 %smid;\n
\n
\n

Description

\n

A predefined, read-only special register that returns the processor (SM) identifier on which a\nparticular thread is executing. The SM identifier ranges from 0 to %nsmid-1. The SM\nidentifier numbering is not guaranteed to be contiguous.

\n

Notes

\n

Note that %smid is volatile and returns the location of a thread at the moment when read, but\nits value may change during execution, e.g. due to rescheduling of threads following\npreemption. %smid is intended mainly to enable profiling and diagnostic code to sample and log\ninformation such as work place mapping and load distribution.

\n
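
A hedged sketch of sampling %smid for placement logging (the log kernel parameter and the\nregisters are assumptions):

\n
mov.u32        %r1, %smid;\nmov.u32        %r2, %ctaid.x;\nmul.wide.u32   %rd1, %r2, 4;\nld.param.u64   %rd2, [log];       // hypothetical .u64 kernel parameter\nadd.u64        %rd2, %rd2, %rd1;\nst.global.u32  [%rd2], %r1;       // one 32-bit entry per CTA\n
\n
\n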

PTX ISA Notes

\n

Introduced in PTX ISA version 1.3.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
mov.u32  %r, %smid;\n
\n
\n
", "tooltip": "SM identifier.\n\nSyntax (predefined)\n\n.sreg .u32 %smid;\n\nDescription\n\nA predefined, read-only special register that returns the processor (SM) identifier on which a\n\nparticular thread is executing. The SM identifier ranges from 0 to %nsmid-1. The SM\n\nidentifier numbering is not guaranteed to be contiguous.\n\nNotes\n\nNote that %smid is volatile and returns the location of a thread at the moment when read, but\n\nits value may change during execution, e.g. due to rescheduling of threads following\n\npreemption. %smid is intended mainly to enable profiling and diagnostic code to sample and log\n\ninformation such as work place mapping and load distribution.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u32 %r, %smid;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-smid" }; case "sqrt": return { "html": "For more information, visit sqrt(fp) .

Floating Point Instructions: sqrt

\n\n\n

Take the square root of a value.

\n

Syntax

\n
sqrt.approx{.ftz}.f32  d, a; // fast, approximate square root\nsqrt.rnd{.ftz}.f32     d, a; // IEEE 754 compliant rounding\nsqrt.rnd.f64           d, a; // IEEE 754 compliant rounding\n\n.rnd = { .rn, .rz, .rm, .rp };\n
\n
\n

Description

\n

Compute sqrt(a) and store the result in d.

\n

Semantics

\n
d = sqrt(a);\n
\n
\n

Notes

\n

sqrt.approx.f32 implements a fast approximation to square root.

\n
Input         Result\n-Inf          NaN\n-normal       NaN\n-subnormal    -0.0\n-0.0          -0.0\n+0.0          +0.0\n+subnormal    +0.0\n+Inf          +Inf\nNaN           NaN\n
\n
\n

Square root with IEEE 754 compliant rounding:

\n

Rounding modifiers (no default):

\n
\n
.rn

mantissa LSB rounds to nearest even

\n
\n
.rz

mantissa LSB rounds towards zero

\n
\n
.rm

mantissa LSB rounds towards negative infinity

\n
\n
.rp

mantissa LSB rounds towards positive infinity

\n
\n
\n

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

sqrt.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
sm_1x

sqrt.f64 supports subnormal numbers.

\n

sqrt.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
\n

PTX ISA Notes

\n

sqrt.f32 and sqrt.f64 introduced in PTX ISA version 1.0. sqrt.rn.f64 and explicit\nmodifiers .approx and .ftz were introduced in PTX ISA version 1.4. General rounding\nmodifiers were added in PTX ISA version 2.0.

\n

For PTX ISA version 1.4 and later, one of .approx or .rnd is required.

\n

For PTX ISA versions 1.0 through 1.3, sqrt.f32 defaults to sqrt.approx.ftz.f32, and\nsqrt.f64 defaults to sqrt.rn.f64.

\n

Target ISA Notes

\n

sqrt.approx.f32 supported on all target architectures.

\n

sqrt.rnd.f32 requires sm_20 or higher.

\n

sqrt.rn.f64 requires sm_13 or higher, or .target map_f64_to_f32.

\n

sqrt.{rz,rm,rp}.f64 requires sm_20 or higher.

\n

Examples

\n
sqrt.approx.ftz.f32  r,x;\nsqrt.rn.ftz.f32      r,x;\nsqrt.rn.f64          r,x;\n
\n
\n
", "tooltip": "Take the square root of a value.\n\nSyntax\n\nsqrt.approx{.ftz}.f32 d, a; // fast, approximate square root\n\nsqrt.rnd{.ftz}.f32 d, a; // IEEE 754 compliant rounding\n\nsqrt.rnd.f64 d, a; // IEEE 754 compliant rounding\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nCompute sqrt(a) and store the result in d.\n\nSemantics\n\nd = sqrt(a);\n\nNotes\n\nsqrt.approx.f32 implements a fast approximation to square root.\n\n\n\nInput\n\nResult\n\n-Inf\n\nNaN\n\n-normal\n\nNaN\n\n-subnormal\n\n-0.0\n\n-0.0\n\n-0.0\n\n+0.0\n\n+0.0\n\n+subnormal\n\n+0.0\n\n+Inf\n\n+Inf\n\nNaN\n\nNaN\n\nSquare root with IEEE 754 compliant rounding:\n\nRounding modifiers (no default):\n\n.rnmantissa LSB rounds to nearest even\n\n.rzmantissa LSB rounds towards zero\n\n.rmmantissa LSB rounds towards negative infinity\n\n.rpmantissa LSB rounds towards positive infinity\n\nSubnormal numbers:\n\nsm_20+By default, subnormal numbers are supported.\n\nsqrt.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nsm_1xsqrt.f64 supports subnormal numbers.\n\nsqrt.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nPTX ISA Notes\n\nsqrt.f32 and sqrt.f64 introduced in PTX ISA version 1.0. sqrt.rn.f64 and explicit\n\nmodifiers .approx and .ftz were introduced in PTX ISA version 1.4. General rounding\n\nmodifiers were added in PTX ISA version 2.0.\n\nFor PTX ISA version 1.4 and later, one of .approx or .rnd is required.\n\nFor PTX ISA versions 1.0 through 1.3, sqrt.f32 defaults to sqrt.approx.ftz.f32, and\n\nsqrt.f64 defaults to sqrt.rn.f64.\n\nTarget ISA Notes\n\nsqrt.approx.f32 supported on all target architectures.\n\nsqrt.rnd.f32 requires sm_20 or higher.\n\nsqrt.rn.f64 requires sm_13 or higher, or .target map_f64_to_f32.\n\nsqrt.{rz,rm,rp}.f64 requires sm_20 or higher.\n\nExamples\n\nsqrt.approx.ftz.f32 r,x;\n\nsqrt.rn.ftz.f32 r,x;\n\nsqrt.rn.f64 r,x;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sqrt" }; case "st": return { "html": "For more information, visit st , st.async .

Data Movement and Conversion Instructions: st

\n\n\n

Store a register variable to an addressable state space variable.

\n

Syntax

\n
st{.weak}{.ss}{.cop}{.level::cache_hint}{.vec}.type   [a], b{, cache-policy};\nst{.weak}{.ss}{.level::eviction_priority}{.level::cache_hint}{.vec}.type\n                                                      [a], b{, cache-policy};\nst.volatile{.ss}{.vec}.type                           [a], b;\nst.relaxed.scope{.ss}{.level::eviction_priority}{.level::cache_hint}{.vec}.type\n                                                      [a], b{, cache-policy};\nst.release.scope{.ss}{.level::eviction_priority}{.level::cache_hint}{.vec}.type\n                                                      [a], b{, cache-policy};\n\n.ss =                       { .global, .local, .param, .shared{::cta, ::cluster} };\n.level::eviction_priority = { .L1::evict_normal, .L1::evict_unchanged,\n                              .L1::evict_first, .L1::evict_last, .L1::no_allocate };\n.level::cache_hint =        { .L2::cache_hint };\n.cop =                      { .wb, .cg, .cs, .wt };\n.sem =                      { .relaxed, .release };\n.scope =                    { .cta, .cluster, .gpu, .sys };\n.vec =                      { .v2, .v4 };\n.type =                     { .b8, .b16, .b32, .b64,\n                              .u8, .u16, .u32, .u64,\n                              .s8, .s16, .s32, .s64,\n                              .f32, .f64 };\n
\n
\n

Description

\n

Store the value of register variable b in the location specified by the destination address\noperand a in specified state space. If no state space is given, perform the store using Generic\nAddressing. Stores to const memory are illegal.

\n

If no sub-qualifier is specified with .shared state space, then ::cta is assumed by default.

\n

Supported addressing modes for operand a and alignment requirements are described in Addresses as Operands.

\n

The st.param instruction, used for passing arguments to device functions, cannot be predicated. See Parameter State Space and Function Declarations and Definitions for descriptions of the proper use of st.param.

\n

The qualifiers .relaxed and .release indicate memory synchronization as described in the\nMemory Consistency Model. The .scope qualifier\nindicates the set of threads with which an st.relaxed or st.release instruction can directly\nsynchronize1. The .weak qualifier indicates a memory instruction with no\nsynchronization. The effects of this instruction become visible to other threads only when\nsynchronization is established by other means.

\n

The .weak, .volatile, .relaxed and .release qualifiers are mutually exclusive. When\nnone of these is specified, the .weak qualifier is assumed by default.

\n

An st.volatile operation is always performed and it will not be reordered with respect to other\nvolatile operations to the same memory location. st.volatile has the same memory\nsynchronization semantics as st.relaxed.sys.

\n

The qualifiers .volatile, .relaxed and .release may be used only with .global and\n.shared spaces and with generic addressing, where the address points to .global or\n.shared space. Cache operations are not permitted with these qualifiers.

\n
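As one illustration of pairing these qualifiers (a sketch only; flag and data are hypothetical .global variables, and the matching acquire load is shown for context rather than defined here):

mov.u32                    %r_one, 1;
st.global.u32              [data], %r_val;    // weak store of the payload
st.global.release.gpu.u32  [flag], %r_one;    // release: payload ordered before flag
// a consuming thread would pair this with, e.g.:
// ld.global.acquire.gpu.u32  %r_f, [flag];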

The qualifier .level::eviction_priority specifies the eviction policy that will be used during\nmemory access.

\n

When the optional argument cache-policy is specified, the qualifier .level::cache_hint is\nrequired. The 64-bit operand cache-policy specifies the cache eviction policy that may be used\nduring the memory access.

\n

The qualifier .level::cache_hint is only supported for .global state space and for generic\naddressing where the address points to the .global state space.

\n

cache-policy is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.

\n

1 This synchronization is further extended to other threads through the transitive nature of\ncausality order, as described in the memory consistency model.

\n

Semantics

\n
d = b;                // named variable d\n*(&d+immOffset) = b;            // variable-plus-offset\n*a = b;               // register\n*(a+immOffset) = b;   // register-plus-offset\n*(immAddr) = b;       // immediate address\n
\n
\n

Notes

\n

Operand b must be in the .reg state space.

\n

A source register wider than the specified type may be used. The lower n bits corresponding to\nthe instruction-type width are stored to memory. See\nTable 24\nfor a description of these relaxed type-checking rules.

\n

.f16 data resulting from a cvt instruction may be stored using st.b16.

\n

.f16x2 data may be stored using st.b32.

\n

PTX ISA Notes

\n

st introduced in PTX ISA version 1.0. st.volatile introduced in PTX ISA version 1.1.

\n

Generic addressing and cache operations introduced in PTX ISA version 2.0.

\n

Support for scope qualifier, .relaxed, .release, .weak qualifiers introduced in PTX ISA\nversion 6.0.

\n

Support for .level::eviction_priority and .level::cache_hint qualifiers introduced in PTX\nISA version 7.4.

\n

Support for .cluster scope qualifier introduced in PTX ISA version 7.8.

\n

Support for ::cta and ::cluster sub-qualifiers introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

st.f64 requires sm_13 or higher.

\n

Support for scope qualifier, .relaxed, .release, .weak qualifiers require sm_70 or\nhigher.

\n

Generic addressing requires sm_20 or higher.

\n

Cache operations require sm_20 or higher.

\n

Support for .level::eviction_priority qualifier requires sm_70 or higher.

\n

Support for .level::cache_hint qualifier requires sm_80 or higher.

\n

Support for .cluster scope qualifier requires sm_90 or higher.

\n

Sub-qualifier ::cta requires sm_30 or higher.

\n

Sub-qualifier ::cluster requires sm_90 or higher.

\n

Examples

\n
st.global.f32    [a],b;\nst.local.b32     [q+4],a;\nst.global.v4.s32 [p],Q;\nst.local.b32     [q+-8],a; // negative offset\nst.local.s32     [100],r7; // immediate address\n\ncvt.f16.f32      %r,%r;    // %r is 32-bit register\nst.b16           [fs],%r;  // store lower\nst.global.relaxed.sys.u32 [gbl], %r0;\nst.shared.release.cta.u32 [sh], %r1;\nst.global.relaxed.cluster.u32 [gbl], %r2;\nst.shared::cta.release.cta.u32 [sh + 4], %r1;\nst.shared::cluster.u32 [sh + 8], %r1;\n\nst.global.L1::no_allocate.f32 [p], a;\n\ncreatepolicy.fractional.L2::evict_last.b64 cache-policy, 0.25;\nst.global.L2::cache_hint.b32  [a], b, cache-policy;\n
\n
\n
\n

Data Movement and Conversion Instructions: st.async

\n\n\n

Asynchronous store operation on shared memory.

\n

Syntax

\n
st.async{.weak}{.ss}{.completion_mechanism}{.vec}.type [a], b, [mbar];\n\n.ss   =                 { .shared::cluster };\n.type =                 { .b32, .b64,\n                          .u32, .u64,\n                          .s32, .s64,\n                          .f32, .f64 };\n.vec  =                 { .v2, .v4 };\n.completion_mechanism = { .mbarrier::complete_tx::bytes };\n
\n
\n

Description

\n

st.async is a non-blocking instruction which initiates an asynchronous store operation that\nstores the value specified by source operand register b to the destination memory location\nspecified by operand a.

\n

The modifier .completion_mechanism specifies that upon completion of the asynchronous operation, a complete-tx operation, with the completeCount argument equal to the amount of data stored in bytes, will be performed on the mbarrier object specified by the operand mbar.

\n

Operand a represents destination address and must be a register or of the form register +\nimmOff as described in Addresses as Operands.

\n

The shared memory addresses of destination operand a and the mbarrier object mbar must meet all of the following conditions:

\n
    \n
  • They belong to the same CTA.

  • \n
  • They are in a CTA different from that of the executing thread, but within the same cluster.

  • \n
\n

Otherwise, the behavior is undefined.

\n

The state space of the address {.ss}, if specified, is applicable to both operands a and mbar. If not specified, then Generic Addressing is used for both a and mbar. If the generic addresses specified do not fall within the address window of the .shared::cluster state space, then the behavior is undefined.

\n

The store operation in st.async is treated as a weak memory operation and the complete_tx\noperation on the mbarrier has .release semantics at the .cluster scope as described in the\nMemory Consistency Model.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 8.1.

\n

Target ISA Notes

\n

Requires sm_90 or higher.

\n

Examples

\n
st.async.shared::cluster.mbarrier::complete_tx::bytes.u32 [addr], b, [mbar_addr]\n
\n
\n
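A slightly fuller sketch of the intended flow is shown below; the mbarrier initialization and the expect-tx bookkeeping are assumed to happen elsewhere, and the register and address names are illustrative:

// assumes [mbar] was initialized with mbarrier.init and that the expected
// byte count was registered on it (e.g. via an mbarrier expect-tx operation)
st.async.shared::cluster.mbarrier::complete_tx::bytes.u32 [addr], %r_val, [mbar];
// the CTA owning [mbar] then waits on the mbarrier phase, e.g. with
// mbarrier.try_wait.parity.shared::cta.b64 %p, [mbar], %r_parity;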
", "tooltip": "=====Data Movement and Conversion Instructions: st\n\n\n\nStore a register variable to an addressable state space variable.\n\nSyntax\n\nst{.weak}{.ss}{.cop}{.level::cache_hint}{.vec}.type [a], b{, cache-policy};\n\nst{.weak}{.ss}{.level::eviction_priority}{.level::cache_hint}{.vec}.type\n\n [a], b{, cache-policy};\n\nst.volatile{.ss}{.vec}.type [a], b;\n\nst.relaxed.scope{.ss}{.level::evicti...\n\n=====Data Movement and Conversion Instructions: st.async\n\n\n\nAsynchronous store operation on shared memory.\n\nSyntax\n\nst.async{.weak}{.ss}{.completion_mechanism}{.vec}.type [a], b, [mbar];\n\n.ss = { .shared::cluster };\n\n.type = { .b32, .b64,\n\n .u32, .u64,\n\n .s32, .s64,\n\n .f32, .f64 };\n\n.vec = { .v2, .v4 };\n\n.completion_mechanism = { .mbarrier::... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st" }; case "stackrestore": return { "html": "For more information, visit stackrestore .

Stack Manipulation Instructions: stackrestore

\n\n\n

Update the stack pointer with a new value.

\n

Syntax

\n
stackrestore.type  a;\n\n.type = { .u32, .u64 };\n
\n
\n

Description

\n

Sets the current stack pointer to source register a.

\n

When stackrestore is used with an operand a written by a prior stacksave instruction, it effectively restores the state of the stack as it was before stacksave executed. Note that if stackrestore is used with an arbitrary value of a, it may corrupt the stack pointer. Correct use of this feature therefore requires that stackrestore.type a be executed after stacksave.type a without redefining the value of a in between.

\n

Operand a has the same type as the instruction type.

\n

Semantics

\n
stackptr = a;\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.3.

\n
\n
Preview Feature:

stackrestore is a preview feature in PTX ISA version 7.3. All details are subject to change\nwith no guarantees of backward compatibility on future PTX ISA versions or SM architectures.

\n
\n
\n

Target ISA Notes

\n

stackrestore requires sm_52 or higher.

\n

Examples

\n
.reg .u32 ra;\nstacksave.u32 ra;\n// Code that may modify stack pointer\n...\nstackrestore.u32 ra;\n
\n
\n
", "tooltip": "Update the stack pointer with a new value.\n\nSyntax\n\nstackrestore.type a;\n\n.type = { .u32, .u64 };\n\nDescription\n\nSets the current stack pointer to source register a.\n\nWhen stackrestore is used with operand a written by a prior stacksave instruction, it\n\nwill effectively restore the state of stack as it was before stacksave was executed. Note that\n\nif stackrestore is used with an arbitrary value of a, it may cause corruption of stack\n\npointer. This implies that the correct use of this feature requires that stackrestore.type a is\n\nused after stacksave.type a without redefining the value of a between them.\n\nOperand a has the same type as the instruction type.\n\nSemantics\n\nstackptr = a;\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.3.\n\nPreview Feature:stackrestore is a preview feature in PTX ISA version 7.3. All details are subject to change\n\nwith no guarantees of backward compatibility on future PTX ISA versions or SM architectures.\n\nTarget ISA Notes\n\nstackrestore requires sm_52 or higher.\n\nExamples\n\n.reg .u32 ra;\n\nstacksave.u32 ra;\n\n// Code that may modify stack pointer\n\n...\n\nstackrestore.u32 ra;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stackrestore" }; case "stacksave": return { "html": "For more information, visit stacksave .

Stack Manipulation Instructions: stacksave

\n\n\n

Save the value of stack pointer into a register.

\n

Syntax

\n
stacksave.type  d;\n\n.type = { .u32, .u64 };\n
\n
\n

Description

\n

Copies the current value of the stack pointer into the destination register d. The pointer returned by stacksave can be used in a subsequent stackrestore instruction to restore the stack pointer. If d is modified before its use in a stackrestore instruction, it may corrupt data in the stack.

\n

Destination operand d has the same type as the instruction type.

\n

Semantics

\n
d = stackptr;\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.3.

\n
\n
Preview Feature:

stacksave is a preview feature in PTX ISA version 7.3. All details are subject to change with\nno guarantees of backward compatibility on future PTX ISA versions or SM architectures.

\n
\n
\n

Target ISA Notes

\n

stacksave requires sm_52 or higher.

\n

Examples

\n
.reg .u32 rd;\nstacksave.u32 rd;\n\n.reg .u64 rd1;\nstacksave.u64 rd1;\n
\n
\n
", "tooltip": "Save the value of stack pointer into a register.\n\nSyntax\n\nstacksave.type d;\n\n.type = { .u32, .u64 };\n\nDescription\n\nCopies the current value of stack pointer into the destination register d. Pointer returned by\n\nstacksave can be used in a subsequent stackrestore instruction to restore the stack\n\npointer. If d is modified prior to use in stackrestore instruction, it may corrupt data in\n\nthe stack.\n\nDestination operand d has the same type as the instruction type.\n\nSemantics\n\nd = stackptr;\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.3.\n\nPreview Feature:stacksave is a preview feature in PTX ISA version 7.3. All details are subject to change with\n\nno guarantees of backward compatibility on future PTX ISA versions or SM architectures.\n\nTarget ISA Notes\n\nstacksave requires sm_52 or higher.\n\nExamples\n\n.reg .u32 rd;\n\nstacksave.u32 rd;\n\n.reg .u64 rd1;\n\nstacksave.u64 rd1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stacksave" }; case "sub": return { "html": "For more information, visit sub(fp) , sub(fp16) , sub(int) , sub.cc .

Floating Point Instructions: sub

\n\n\n

Subtract one value from another.

\n

Syntax

\n
sub{.rnd}{.ftz}{.sat}.f32  d, a, b;\nsub{.rnd}.f64              d, a, b;\n\n.rnd = { .rn, .rz, .rm, .rp };\n
\n
\n

Description

\n

Performs subtraction and writes the resulting value into a destination register.

\n

Semantics

\n
d = a - b;\n
\n
\n

Notes

\n

Rounding modifiers:

\n
\n
.rn

mantissa LSB rounds to nearest even

\n
\n
.rz

mantissa LSB rounds towards zero

\n
\n
.rm

mantissa LSB rounds towards negative infinity

\n
\n
.rp

mantissa LSB rounds towards positive infinity

\n
\n
\n

The default value of rounding modifier is .rn. Note that a sub instruction with an explicit\nrounding modifier is treated conservatively by the code optimizer. A sub instruction with no\nrounding modifier defaults to round-to-nearest-even and may be optimized aggressively by the code\noptimizer. In particular, mul/sub sequences with no rounding modifiers may be optimized to\nuse fused-multiply-add instructions on the target device.

\n
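For example, when the intermediate product must remain a separately rounded operation, explicit modifiers prevent the contraction described above; a minimal sketch:

mul.rn.f32  %f3, %f0, %f1;   // explicit .rn: kept as a separate, rounded multiply
sub.rn.f32  %f4, %f3, %f2;   // not contracted into a fused multiply-add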

Subnormal numbers:

\n
\n
sm_20+

By default, subnormal numbers are supported.

\n

sub.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
sm_1x

sub.f64 supports subnormal numbers.

\n

sub.f32 flushes subnormal inputs and results to sign-preserving zero.

\n
\n
\n

Saturation modifier:

\n

sub.sat.f32 clamps the result to [0.0, 1.0]. NaN results are flushed to +0.0f.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

sub.f32 supported on all target architectures.

\n

sub.f64 requires sm_13 or higher.

\n

Rounding modifiers have the following target requirements:

\n
\n
.rn, .rz

available for all targets

\n
\n
.rm, .rp

for sub.f64, requires sm_13 or higher.

\n

for sub.f32, requires sm_20 or higher.

\n
\n
\n

Examples

\n
sub.f32 c,a,b;\nsub.rn.ftz.f32  f1,f2,f3;\n
\n
\n
\n

Half Precision Floating Point Instructions: sub

\n\n\n

Subtract two values.

\n

Syntax

\n
sub{.rnd}{.ftz}{.sat}.f16   d, a, b;\nsub{.rnd}{.ftz}{.sat}.f16x2 d, a, b;\n\nsub{.rnd}.bf16   d, a, b;\nsub{.rnd}.bf16x2 d, a, b;\n\n.rnd = { .rn };\n
\n
\n

Description

\n

Performs subtraction and writes the resulting value into a destination register.

\n

For .f16x2 and .bf16x2 instruction types, forms input vectors by extracting half-word values from the source operands. The half-word operands are then subtracted in parallel to produce the .f16x2 or .bf16x2 result in the destination.

\n

For .f16 instruction type, operands d, a and b have .f16 or .b16 type. For\n.f16x2 instruction type, operands d, a and b have .b32 type. For .bf16\ninstruction type, operands d, a, b have .b16 type. For .bf16x2 instruction type,\noperands d, a, b have .b32 type.

\n

Semantics

\n
if (type == f16 || type == bf16) {\n    d = a - b;\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    for (i = 0; i < 2; i++) {\n         d[i] = fA[i] - fB[i];\n    }\n}\n
\n
\n

Notes

\n

Rounding modifiers:

\n
\n
.rn

mantissa LSB rounds to nearest even

\n
\n
\n

The default value of rounding modifier is .rn. Note that a sub instruction with an explicit\nrounding modifier is treated conservatively by the code optimizer. A sub instruction with no\nrounding modifier defaults to round-to-nearest-even and may be optimized aggressively by the code\noptimizer. In particular, mul/sub sequences with no rounding modifiers may be optimized to\nuse fused-multiply-add instructions on the target device.

\n
\n
Subnormal numbers:

By default, subnormal numbers are supported.\nsub.ftz.{f16, f16x2} flushes subnormal inputs and results to sign-preserving zero.

\n
\n
Saturation modifier:

sub.sat.{f16, f16x2} clamps the result to [0.0, 1.0]. NaN results are flushed to +0.0f.

\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 4.2.

\n

sub{.rnd}.bf16 and sub{.rnd}.bf16x2 introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_53 or higher.

\n

sub{.rnd}.bf16 and sub{.rnd}.bf16x2 requires sm_90 or higher.

\n

Examples

\n
// scalar f16 subtractions\nsub.f16        d0, a0, b0;\nsub.rn.f16     d1, a1, b1;\nsub.bf16       bd0, ba0, bb0;\nsub.rn.bf16    bd1, ba1, bb1;\n\n// SIMD f16 subtraction\ncvt.rn.f16.f32 h0, f0;\ncvt.rn.f16.f32 h1, f1;\ncvt.rn.f16.f32 h2, f2;\ncvt.rn.f16.f32 h3, f3;\nmov.b32  p1, {h0, h1};   // pack two f16 to 32bit f16x2\nmov.b32  p2, {h2, h3};   // pack two f16 to 32bit f16x2\nsub.f16x2  p3, p1, p2;   // SIMD f16x2 subtraction\n\n// SIMD bf16 subtraction\ncvt.rn.bf16x2.f32 p4, f4, f5; // Convert two f32 into packed bf16x2\ncvt.rn.bf16x2.f32 p5, f6, f7; // Convert two f32 into packed bf16x2\nsub.bf16x2  p6, p4, p5;       // SIMD bf16x2 subtraction\n\n// SIMD fp16 subtraction\nld.global.b32   f0, [addr];     // load 32 bit which hold packed f16x2\nld.global.b32   f1, [addr + 4]; // load 32 bit which hold packed f16x2\nsub.f16x2       f2, f0, f1;     // SIMD f16x2 subtraction\n\n// SIMD bf16 subtraction\nld.global.b32   f3, [addr + 8];  // load 32 bit which hold packed bf16x2\nld.global.b32   f4, [addr + 12]; // load 32 bit which hold packed bf16x2\nsub.bf16x2      f5, f3, f4;      // SIMD bf16x2 subtraction\n
\n
\n
\n

Integer Arithmetic Instructions: sub

\n\n\n

Subtract one value from another.

\n

Syntax

\n
sub.type       d, a, b;\nsub{.sat}.s32  d, a, b;     // .sat applies only to .s32\n\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64 };\n
\n
\n

Description

\n

Performs subtraction and writes the resulting value into a destination register.

\n

Semantics

\n
d = a - b;\n
\n
\n

Notes

\n

Saturation modifier:

\n
\n
.sat

limits result to MININT..MAXINT (no overflow) for the size of the operation. Applies only to the .s32 type; see the sketch after this list.

\n
\n
\n
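As an illustration of the clamping (a sketch; register names are arbitrary), subtracting 1 from the most negative 32-bit integer saturates instead of wrapping:

mov.s32      %r1, -2147483648;   // INT_MIN
mov.s32      %r2, 1;
sub.sat.s32  %r3, %r1, %r2;      // %r3 = -2147483648: clamped, no wraparound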

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
sub.s32 c,a,b;\n
\n
\n
\n

Extended-Precision Arithmetic Instructions: sub.cc

\n\n\n

Subtract one value from another, with borrow-out.

\n

Syntax

\n
sub.cc.type  d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n
\n
\n

Description

\n

Performs integer subtraction and writes the borrow-out value into the condition code register.

\n

Semantics

\n
d = a - b;\n
\n
\n

borrow-out written to CC.CF

\n

Notes

\n

No integer rounding modifiers.

\n

No saturation.

\n

Behavior is the same for unsigned and signed integers.

\n

PTX ISA Notes

\n

32-bit sub.cc introduced in PTX ISA version 1.2.

\n

64-bit sub.cc introduced in PTX ISA version 4.3.

\n

Target ISA Notes

\n

32-bit sub.cc is supported on all target architectures.

\n

64-bit sub.cc requires sm_20 or higher.

\n

Examples

\n
@p  sub.cc.u32   x1,y1,z1;   // extended-precision subtraction\n@p  subc.cc.u32  x2,y2,z2;   // of two 128-bit values\n@p  subc.cc.u32  x3,y3,z3;\n@p  subc.u32     x4,y4,z4;\n
\n
\n
", "tooltip": "=====Floating Point Instructions: sub\n\n\n\nSubtract one value from another.\n\nSyntax\n\nsub{.rnd}{.ftz}{.sat}.f32 d, a, b;\n\nsub{.rnd}.f64 d, a, b;\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nPerforms subtraction and writes the resulting value into a destination register.\n\nSemantics\n\nd = a - b;\n\nNotes\n\nRounding modifiers:\n\n.rnmantissa LSB rounds to nearest even\n\n.rzmantissa LSB rounds towards zero\n\n.rmmantissa LSB rounds towards negative in...\n\n=====Half Precision Floating Point Instructions: sub\n\n\n\nSubtract two values.\n\nSyntax\n\nsub{.rnd}{.ftz}{.sat}.f16 d, a, b;\n\nsub{.rnd}{.ftz}{.sat}.f16x2 d, a, b;\n\nsub{.rnd}.bf16 d, a, b;\n\nsub{.rnd}.bf16x2 d, a, b;\n\n.rnd = { .rn };\n\nDescription\n\nPerforms subtraction and writes the resulting value into a destination register.\n\nFor .f16x2 and .bf16x2 instruction type, forms input vectors by half word values from source\n\noperands. Half-word operands are then subtra...\n\n=====Integer Arithmetic Instructions: sub\n\n\n\nSubtract one value from another.\n\nSyntax\n\nsub.type d, a, b;\n\nsub{.sat}.s32 d, a, b; // .sat applies only to .s32\n\n.type = { .u16, .u32, .u64,\n\n .s16, .s32, .s64 };\n\nDescription\n\nPerforms subtraction and writes the resulting value into a destination register.\n\nSemantics\n\nd = a - b;\n\nNotes\n\nSaturation modifier:\n\n.satlimits result to MININT..MAXINT (no overflow) for the size of the operation....\n\n=====Extended-Precision Arithmetic Instructions: sub.cc\n\n\n\nSubtract one value from another, with borrow-out.\n\nSyntax\n\nsub.cc.type d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n\nDescription\n\nPerforms integer subtraction and writes the borrow-out value into the condition code register.\n\nSemantics\n\nd = a - b;\n\nborrow-out written to CC.CF\n\nNotes\n\nNo integer rounding modifiers.\n\nNo saturation.\n\nBehavior is the same for unsigned and signed integers.\n\nPTX ISA Notes\n\n32-bit s... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sub" }; case "subc": return { "html": "For more information, visit subc .

Extended-Precision Arithmetic Instructions: subc

\n\n\n

Subtract one value from another, with borrow-in and optional borrow-out.

\n

Syntax

\n
subc{.cc}.type  d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n
\n
\n

Description

\n

Performs integer subtraction with borrow-in and optionally writes the borrow-out value into the\ncondition code register.

\n

Semantics

\n
d = a  - (b + CC.CF);\n
\n
\n

if .cc specified, borrow-out written to CC.CF

\n

Notes

\n

No integer rounding modifiers.

\n

No saturation.

\n

Behavior is the same for unsigned and signed integers.

\n

PTX ISA Notes

\n

32-bit subc introduced in PTX ISA version 1.2.

\n

64-bit subc introduced in PTX ISA version 4.3.

\n

Target ISA Notes

\n

32-bit subc is supported on all target architectures.

\n

64-bit subc requires sm_20 or higher.

\n

Examples

\n
@p  sub.cc.u32   x1,y1,z1;   // extended-precision subtraction\n@p  subc.cc.u32  x2,y2,z2;   // of two 128-bit values\n@p  subc.cc.u32  x3,y3,z3;\n@p  subc.u32     x4,y4,z4;\n
\n
\n
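A self-contained variant of the example above for a single 64-bit subtraction held in 32-bit register pairs (register names are illustrative):

.reg .u32 %alo, %ahi, %blo, %bhi, %dlo, %dhi;
sub.cc.u32  %dlo, %alo, %blo;   // low word: borrow-out written to CC.CF
subc.u32    %dhi, %ahi, %bhi;   // high word: consumes the borrow, no borrow-out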
", "tooltip": "Subtract one value from another, with borrow-in and optional borrow-out.\n\nSyntax\n\nsubc{.cc}.type d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n\nDescription\n\nPerforms integer subtraction with borrow-in and optionally writes the borrow-out value into the\n\ncondition code register.\n\nSemantics\n\nd = a - (b + CC.CF);\n\nif .cc specified, borrow-out written to CC.CF\n\nNotes\n\nNo integer rounding modifiers.\n\nNo saturation.\n\nBehavior is the same for unsigned and signed integers.\n\nPTX ISA Notes\n\n32-bit subc introduced in PTX ISA version 1.2.\n\n64-bit subc introduced in PTX ISA version 4.3.\n\nTarget ISA Notes\n\n32-bit subc is supported on all target architectures.\n\n64-bit subc requires sm_20 or higher.\n\nExamples\n\n@p sub.cc.u32 x1,y1,z1; // extended-precision subtraction\n\n@p subc.cc.u32 x2,y2,z2; // of two 128-bit values\n\n@p subc.cc.u32 x3,y3,z3;\n\n@p subc.u32 x4,y4,z4;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-subc" }; case "suld": return { "html": "For more information, visit suld .

Surface Instructions: suld

\n\n\n

Load from surface memory.

\n

Syntax

\n
suld.b.geom{.cop}.vec.dtype.clamp  d, [a, b];  // unformatted\n\n.geom  = { .1d, .2d, .3d, .a1d, .a2d };\n.cop   = { .ca, .cg, .cs, .cv };               // cache operation\n.vec   = { none, .v2, .v4 };\n.dtype = { .b8 , .b16, .b32, .b64 };\n.clamp = { .trap, .clamp, .zero };\n
\n
\n

Description

\n

suld.b.{1d,2d,3d}

\n

Load from surface memory using a surface coordinate vector. The instruction loads data from the\nsurface named by operand a at coordinates given by operand b into destination d. Operand\na is a .surfref variable or .u64 register. Operand b is a scalar or singleton tuple\nfor 1d surfaces; is a two-element vector for 2d surfaces; and is a four-element vector for 3d\nsurfaces, where the fourth element is ignored. Coordinate elements are of type .s32.

\n

suld.b performs an unformatted load of binary data. The lowest dimension coordinate represents a\nbyte offset into the surface and is not scaled, and the size of the data transfer matches the size\nof destination operand d.

\n

suld.b.{a1d,a2d}

\n

Surface layer selection, followed by a load from the selected surface. The instruction first selects\na surface layer from the surface array named by operand a using the index given by the first\nelement of the array coordinate vector b. The instruction then loads data from the selected\nsurface at coordinates given by the remaining elements of operand b into destination\nd. Operand a is a .surfref variable or .u64 register. Operand b is a bit-size\ntype vector or tuple containing an index into the array of surfaces followed by coordinates within\nthe selected surface, as follows:

\n

For 1d surface arrays, operand b has type .v2.b32. The first element is interpreted as an\nunsigned integer index (.u32) into the surface array, and the second element is interpreted as a\n1d surface coordinate of type .s32.

\n

For 2d surface arrays, operand b has type .v4.b32. The first element is interpreted as an\nunsigned integer index (.u32) into the surface array, and the next two elements are interpreted\nas 2d surface coordinates of type .s32. The fourth element is ignored.

\n

A surface base address is assumed to be aligned to a 16 byte boundary, and the address given by the\ncoordinate vector must be naturally aligned to a multiple of the access size. If an address is not\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.

\n

The .clamp field specifies how to handle out-of-bounds addresses:

\n
\n
.trap

causes an execution trap on out-of-bounds addresses

\n
\n
.clamp

loads data at the nearest surface location (sized appropriately)

\n
\n
.zero

loads zero for out-of-bounds addresses

\n
\n
\n

Indirect surface access

\n

Beginning with PTX ISA version 3.1, indirect surface access is supported for target architecture\nsm_20 or higher. In indirect access, operand a is a .u64 register holding the address of\na .surfref variable.

\n
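A sketch of this indirect form, assuming the address of a .surfref variable reaches the kernel through a hypothetical .u64 parameter surf_param (coordinate registers are likewise illustrative):

ld.param.u64           %rd1, [surf_param];              // address of a .surfref variable
suld.b.2d.v4.b32.trap  {%r1,%r2,%r3,%r4}, [%rd1, {%rx,%ry}];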

PTX ISA Notes

\n

suld.b.trap introduced in PTX ISA version 1.5.

\n

Additional clamp modifiers and cache operations introduced in PTX ISA version 2.0.

\n

suld.b.3d and suld.b.{a1d,a2d} introduced in PTX ISA version 3.0.

\n

Indirect surface access introduced in PTX ISA version 3.1.

\n

Target ISA Notes

\n

suld.b supported on all target architectures.

\n

sm_1x targets support only the .trap clamping modifier.

\n

suld.3d and suld.{a1d,a2d} require sm_20 or higher.

\n

Indirect surface access requires sm_20 or higher.

\n

Cache operations require sm_20 or higher.

\n

Examples

\n
suld.b.1d.v4.b32.trap  {s1,s2,s3,s4}, [surf_B, {x}];\nsuld.b.3d.v2.b64.trap  {r1,r2}, [surf_A, {x,y,z,w}];\nsuld.b.a1d.v2.b32      {r0,r1}, [surf_C, {idx,x}];\nsuld.b.a2d.b32         r0, [surf_D, {idx,x,y,z}];  // z ignored\n
\n
\n
", "tooltip": "Load from surface memory.\n\nSyntax\n\nsuld.b.geom{.cop}.vec.dtype.clamp d, [a, b]; // unformatted\n\n.geom = { .1d, .2d, .3d, .a1d, .a2d };\n\n.cop = { .ca, .cg, .cs, .cv }; // cache operation\n\n.vec = { none, .v2, .v4 };\n\n.dtype = { .b8 , .b16, .b32, .b64 };\n\n.clamp = { .trap, .clamp, .zero };\n\nDescription\n\nsuld.b.{1d,2d,3d}\n\nLoad from surface memory using a surface coordinate vector. The instruction loads data from the\n\nsurface named by operand a at coordinates given by operand b into destination d. Operand\n\na is a .surfref variable or .u64 register. Operand b is a scalar or singleton tuple\n\nfor 1d surfaces; is a two-element vector for 2d surfaces; and is a four-element vector for 3d\n\nsurfaces, where the fourth element is ignored. Coordinate elements are of type .s32.\n\nsuld.b performs an unformatted load of binary data. The lowest dimension coordinate represents a\n\nbyte offset into the surface and is not scaled, and the size of the data transfer matches the size\n\nof destination operand d.\n\nsuld.b.{a1d,a2d}\n\nSurface layer selection, followed by a load from the selected surface. The instruction first selects\n\na surface layer from the surface array named by operand a using the index given by the first\n\nelement of the array coordinate vector b. The instruction then loads data from the selected\n\nsurface at coordinates given by the remaining elements of operand b into destination\n\nd. Operand a is a .surfref variable or .u64 register. Operand b is a bit-size\n\ntype vector or tuple containing an index into the array of surfaces followed by coordinates within\n\nthe selected surface, as follows:\n\nFor 1d surface arrays, operand b has type .v2.b32. The first element is interpreted as an\n\nunsigned integer index (.u32) into the surface array, and the second element is interpreted as a\n\n1d surface coordinate of type .s32.\n\nFor 2d surface arrays, operand b has type .v4.b32. The first element is interpreted as an\n\nunsigned integer index (.u32) into the surface array, and the next two elements are interpreted\n\nas 2d surface coordinates of type .s32. The fourth element is ignored.\n\nA surface base address is assumed to be aligned to a 16 byte boundary, and the address given by the\n\ncoordinate vector must be naturally aligned to a multiple of the access size. If an address is not\n\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\n\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.\n\nThe .clamp field specifies how to handle out-of-bounds addresses:\n\n.trapcauses an execution trap on out-of-bounds addresses\n\n.clamploads data at the nearest surface location (sized appropriately)\n\n.zeroloads zero for out-of-bounds addresses\n\nIndirect surface access\n\nBeginning with PTX ISA version 3.1, indirect surface access is supported for target architecture\n\nsm_20 or higher. 
In indirect access, operand a is a .u64 register holding the address of\n\na .surfref variable.\n\nPTX ISA Notes\n\nsuld.b.trap introduced in PTX ISA version 1.5.\n\nAdditional clamp modifiers and cache operations introduced in PTX ISA version 2.0.\n\nsuld.b.3d andsuld.b.{a1d,a2d} introduced in PTX ISA version 3.0.\n\nIndirect surface access introduced in PTX ISA version 3.1.\n\nTarget ISA Notes\n\nsuld.b supported on all target architectures.\n\nsm_1x targets support only the .trap clamping modifier.\n\nsuld.3d andsuld.{a1d,a2d} require sm_20 or higher.\n\nIndirect surface access requires sm_20 or higher.\n\nCache operations require sm_20 or higher.\n\nExamples\n\nsuld.b.1d.v4.b32.trap {s1,s2,s3,s4}, [surf_B, {x}];\n\nsuld.b.3d.v2.b64.trap {r1,r2}, [surf_A, {x,y,z,w}];\n\nsuld.b.a1d.v2.b32 {r0,r1}, [surf_C, {idx,x}];\n\nsuld.b.a2d.b32 r0, [surf_D, {idx,x,y,z}]; // z ignored\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suld" }; case "suq": return { "html": "For more information, visit suq .

Surface Instructions: suq

\n\n\n

Query a surface attribute.

\n

Syntax

\n
suq.query.b32   d, [a];\n\n.query = { .width, .height, .depth,\n           .channel_data_type, .channel_order,\n           .array_size, .memory_layout };\n
\n
\n

Description

\n

Query an attribute of a surface. Operand a is a .surfref variable or a .u64 register.

Query / Returns:

.width, .height, .depth

value in elements

.channel_data_type

Unsigned integer corresponding to source language’s channel data type enumeration. If the source language combines channel data type and channel order into a single enumeration type, that value is returned for both channel_data_type and channel_order queries.

.channel_order

Unsigned integer corresponding to source language’s channel order enumeration. If the source language combines channel data type and channel order into a single enumeration type, that value is returned for both channel_data_type and channel_order queries.

.array_size

For a surface array, number of surfaces in array, 0 otherwise.

.memory_layout

1 for surface with linear memory layout; 0 otherwise

\n

Indirect surface access

\n

Beginning with PTX ISA version 3.1, indirect surface access is supported for target architecture\nsm_20 or higher. In indirect access, operand a is a .u64 register holding the address of\na .surfref variable.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.5.

\n

Channel data type and channel order queries added in PTX ISA version 2.1.

\n

Indirect surface access introduced in PTX ISA version 3.1.

\n

The .array_size query was added in PTX ISA version 4.1.

\n

The .memory_layout query was added in PTX ISA version 4.2.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Indirect surface access requires sm_20 or higher.

\n

Examples

\n
suq.width.b32       %r1, [surf_A];\n
\n
\n
", "tooltip": "Query a surface attribute.\n\nSyntax\n\nsuq.query.b32 d, [a];\n\n.query = { .width, .height, .depth,\n\n .channel_data_type, .channel_order,\n\n .array_size, .memory_layout };\n\nDescription\n\nQuery an attribute of a surface. Operand a is a .surfref variable or a .u64 register.\n\n\n\nQuery\n\nReturns\n\n.width\n\n.height\n\n.depth\n\nvalue in elements\n\n.channel_data_type\n\nUnsigned integer corresponding to source language\u2019s channel data type enumeration. If the source language combines channel data type and channel order into a single enumeration type, that value is returned for both channel_data_type and channel_order queries.\n\n.channel_order\n\nUnsigned integer corresponding to source language\u2019s channel order enumeration. If the source language combines channel data type and channel order into a single enumeration type, that value is returned for both channel_data_type and channel_order queries.\n\n.array_size\n\nFor a surface array, number of surfaces in array, 0 otherwise.\n\n.memory_layout\n\n1 for surface with linear memory layout; 0 otherwise\n\nIndirect surface access\n\nBeginning with PTX ISA version 3.1, indirect surface access is supported for target architecture\n\nsm_20 or higher. In indirect access, operand a is a .u64 register holding the address of\n\na .surfref variable.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.5.\n\nChannel data type and channel order queries added in PTX ISA version 2.1.\n\nIndirect surface access introduced in PTX ISA version 3.1.\n\nThe .array_size query was added in PTX ISA version 4.1.\n\nThe .memory_layout query was added in PTX ISA version 4.2.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nIndirect surface access requires sm_20 or higher.\n\nExamples\n\nsuq.width.b32 %r1, [surf_A];\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suq" }; case "sured": return { "html": "For more information, visit sured .

Surface Instructions: sured

\n\n\n

Reduce surface memory.

\n

Syntax

\n
sured.b.op.geom.ctype.clamp  [a,b],c; // byte addressing\nsured.p.op.geom.ctype.clamp  [a,b],c; // sample addressing\n\n.op    = { .add, .min, .max, .and, .or };\n.geom  = { .1d, .2d, .3d };\n.ctype = { .u32, .u64, .s32, .b32, .s64 };  // for sured.b\n.ctype = { .b32, .b64 };                    // for sured.p\n.clamp = { .trap, .clamp, .zero };\n
\n
\n

Description

\n

Reduction to surface memory using a surface coordinate vector. The instruction performs a reduction\noperation with data from operand c to the surface named by operand a at coordinates given by\noperand b. Operand a is a .surfref variable or .u64 register. Operand b is a\nscalar or singleton tuple for 1d surfaces; is a two-element vector for 2d surfaces; and is a\nfour-element vector for 3d surfaces, where the fourth element is ignored. Coordinate elements are of\ntype .s32.

\n

sured.b performs an unformatted reduction on .u32, .s32, .b32, .u64, or .s64\ndata. The lowest dimension coordinate represents a byte offset into the surface and is not\nscaled. Operation add applies to .u32, .u64, and .s32 types; min and max\napply to .u32, .s32, .u64 and .s64 types; operations and and or apply to\n.b32 type.

\n

sured.p performs a reduction on sample-addressed data. The lowest dimension coordinate\nrepresents a sample offset rather than a byte offset. The instruction type .b64 is restricted to\nmin and max operations. For type .b32, the data is interpreted as .u32 or .s32\nbased on the surface sample format as follows: if the surface format contains UINT data, then\n.u32 is assumed; if the surface format contains SINT data, then .s32 is assumed. For\ntype .b64, if the surface format contains UINT data, then .u64 is assumed; if the\nsurface format contains SINT data, then .s64 is assumed.

\n

A surface base address is assumed to be aligned to a 16 byte boundary, and the address given by the\ncoordinate vector must be naturally aligned to a multiple of the access size. If an address is not\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.

\n

The .clamp field specifies how to handle out-of-bounds addresses:

\n
\n
.trap

causes an execution trap on out-of-bounds addresses

\n
\n
.clamp

stores data at the nearest surface location (sized appropriately)

\n
\n
.zero

drops stores to out-of-bounds addresses

\n
\n
\n

Indirect surface access

\n

Beginning with PTX ISA version 3.1, indirect surface access is supported for target architecture\nsm_20 or higher. In indirect access, operand a is a .u64 register holding the address of\na .surfref variable.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Indirect surface access introduced in PTX ISA version 3.1.

\n

.u64/.s64/.b64 types with .min/.max operations introduced in PTX ISA version\n8.1.

\n

Target ISA Notes

\n

sured requires sm_20 or higher.

\n

Indirect surface access requires sm_20 or higher.

\n

.u64/.s64/.b64 types with .min/.max operations requires sm_50 or higher.

\n

Examples

\n
sured.b.add.2d.u32.trap  [surf_A, {x,y}], r1;\nsured.p.min.1d.u32.trap  [surf_B, {x}], r1;\nsured.b.max.1d.u64.trap  [surf_C, {x}], r1;\nsured.p.min.1d.b64.trap  [surf_D, {x}], r1;\n
\n
\n
", "tooltip": "Reduce surface memory.\n\nSyntax\n\nsured.b.op.geom.ctype.clamp [a,b],c; // byte addressing\n\nsured.p.op.geom.ctype.clamp [a,b],c; // sample addressing\n\n.op = { .add, .min, .max, .and, .or };\n\n.geom = { .1d, .2d, .3d };\n\n.ctype = { .u32, .u64, .s32, .b32, .s64 }; // for sured.b\n\n.ctype = { .b32, .b64 }; // for sured.p\n\n.clamp = { .trap, .clamp, .zero };\n\nDescription\n\nReduction to surface memory using a surface coordinate vector. The instruction performs a reduction\n\noperation with data from operand c to the surface named by operand a at coordinates given by\n\noperand b. Operand a is a .surfref variable or .u64 register. Operand b is a\n\nscalar or singleton tuple for 1d surfaces; is a two-element vector for 2d surfaces; and is a\n\nfour-element vector for 3d surfaces, where the fourth element is ignored. Coordinate elements are of\n\ntype .s32.\n\nsured.b performs an unformatted reduction on .u32, .s32, .b32, .u64, or .s64\n\ndata. The lowest dimension coordinate represents a byte offset into the surface and is not\n\nscaled. Operation add applies to .u32, .u64, and .s32 types; min and max\n\napply to .u32, .s32, .u64 and .s64 types; operations and and or apply to\n\n.b32 type.\n\nsured.p performs a reduction on sample-addressed data. The lowest dimension coordinate\n\nrepresents a sample offset rather than a byte offset. The instruction type .b64 is restricted to\n\nmin and max operations. For type .b32, the data is interpreted as .u32 or .s32\n\nbased on the surface sample format as follows: if the surface format contains UINT data, then\n\n.u32 is assumed; if the surface format contains SINT data, then .s32 is assumed. For\n\ntype .b64, if the surface format contains UINT data, then .u64 is assumed; if the\n\nsurface format contains SINT data, then .s64 is assumed.\n\nA surface base address is assumed to be aligned to a 16 byte boundary, and the address given by the\n\ncoordinate vector must be naturally aligned to a multiple of the access size. If an address is not\n\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\n\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.\n\nThe .clamp field specifies how to handle out-of-bounds addresses:\n\n.trapcauses an execution trap on out-of-bounds addresses\n\n.clampstores data at the nearest surface location (sized appropriately)\n\n.zerodrops stores to out-of-bounds addresses\n\nIndirect surface access\n\nBeginning with PTX ISA version 3.1, indirect surface access is supported for target architecture\n\nsm_20 or higher. In indirect access, operand a is a .u64 register holding the address of\n\na .surfref variable.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nIndirect surface access introduced in PTX ISA version 3.1.\n\n.u64/.s64/.b64 types with .min/.max operations introduced in PTX ISA version\n\n8.1.\n\nTarget ISA Notes\n\nsured requires sm_20 or higher.\n\nIndirect surface access requires sm_20 or higher.\n\n.u64/.s64/.b64 types with .min/.max operations requires sm_50 or higher.\n\nExamples\n\nsured.b.add.2d.u32.trap [surf_A, {x,y}], r1;\n\nsured.p.min.1d.u32.trap [surf_B, {x}], r1;\n\nsured.b.max.1d.u64.trap [surf_C, {x}], r1;\n\nsured.p.min.1d.b64.trap [surf_D, {x}], r1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sured" }; case "sust": return { "html": "For more information, visit sust .

Surface Instructions: sust

\n\n\n

Store to surface memory.

\n

Syntax

\n
sust.b.{1d,2d,3d}{.cop}.vec.ctype.clamp  [a, b], c;  // unformatted\nsust.p.{1d,2d,3d}.vec.b32.clamp          [a, b], c;  // formatted\n\nsust.b.{a1d,a2d}{.cop}.vec.ctype.clamp   [a, b], c;  // unformatted\n\n.cop   = { .wb, .cg, .cs, .wt };                     // cache operation\n.vec   = { none, .v2, .v4 };\n.ctype = { .b8 , .b16, .b32, .b64 };\n.clamp = { .trap, .clamp, .zero };\n
\n
\n

Description

\n

sust.{1d,2d,3d}

\n

Store to surface memory using a surface coordinate vector. The instruction stores data from operand\nc to the surface named by operand a at coordinates given by operand b. Operand a is\na .surfref variable or .u64 register. Operand b is a scalar or singleton tuple for 1d\nsurfaces; is a two-element vector for 2d surfaces; and is a four-element vector for 3d surfaces,\nwhere the fourth element is ignored. Coordinate elements are of type .s32.

\n

sust.b performs an unformatted store of binary data. The lowest dimension coordinate represents\na byte offset into the surface and is not scaled. The size of the data transfer matches the size of\nsource operand c.

\n

sust.p performs a formatted store of a vector of 32-bit data values to a surface sample. The\nsource vector elements are interpreted left-to-right as R, G, B, and A surface\ncomponents. These elements are written to the corresponding surface sample components. Source\nelements that do not occur in the surface sample are ignored. Surface sample components that do not\noccur in the source vector will be written with an unpredictable value. The lowest dimension\ncoordinate represents a sample offset rather than a byte offset.

\n

The source data interpretation is based on the surface sample format as follows: If the surface\nformat contains UNORM, SNORM, or FLOAT data, then .f32 is assumed; if the surface\nformat contains UINT data, then .u32 is assumed; if the surface format contains SINT\ndata, then .s32 is assumed. The source data is then converted from this type to the surface\nsample format.

\n

sust.b.{a1d,a2d}

\n

Surface layer selection, followed by an unformatted store to the selected surface. The instruction\nfirst selects a surface layer from the surface array named by operand a using the index given by\nthe first element of the array coordinate vector b. The instruction then stores the data in\noperand c to the selected surface at coordinates given by the remaining elements of operand\nb. Operand a is a .surfref variable or .u64 register. Operand b is a bit-size type\nvector or tuple containing an index into the array of surfaces followed by coordinates within the\nselected surface, as follows:

\n
    \n
  • For 1d surface arrays, operand b has type .v2.b32. The first element is interpreted as an\nunsigned integer index (.u32) into the surface array, and the second element is interpreted as\na 1d surface coordinate of type .s32.

  • \n
  • For 2d surface arrays, operand b has type .v4.b32. The first element is interpreted as an\nunsigned integer index (.u32) into the surface array, and the next two elements are\ninterpreted as 2d surface coordinates of type .s32. The fourth element is ignored.

  • \n
\n

A surface base address is assumed to be aligned to a 16 byte boundary, and the address given by the\ncoordinate vector must be naturally aligned to a multiple of the access size. If an address is not\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.

\n

The .clamp field specifies how to handle out-of-bounds addresses:

\n
\n
.trap

causes an execution trap on out-of-bounds addresses

\n
\n
.clamp

stores data at the nearest surface location (sized appropriately)

\n
\n
.zero

drops stores to out-of-bounds addresses

\n
\n
\n

Indirect surface access

\n

Beginning with PTX ISA version 3.1, indirect surface access is supported for target architecture\nsm_20 or higher. In indirect access, operand a is a .u64 register holding the address of\na .surfref variable.

\n

PTX ISA Notes

\n

sust.b.trap introduced in PTX ISA version 1.5. sust.p, additional clamp modifiers, and cache operations introduced in PTX ISA version 2.0.

\n

sust.b.3d and sust.b.{a1d,a2d} introduced in PTX ISA version 3.0.

\n

Indirect surface access introduced in PTX ISA version 3.1.

\n

Target ISA Notes

\n

sust.b supported on all target architectures.

\n

sm_1x targets support only the .trap clamping modifier.

\n

sust.3d and sust.{a1d,a2d} require sm_20 or higher.

\n

sust.p requires sm_20 or higher.

\n

Indirect surface access requires sm_20 or higher.

\n

Cache operations require sm_20 or higher.

\n

Examples

\n
sust.p.1d.v4.b32.trap  [surf_B, {x}], {f1,f2,f3,f4};\nsust.b.3d.v2.b64.trap  [surf_A, {x,y,z,w}], {r1,r2};\nsust.b.a1d.v2.b64      [surf_C, {idx,x}], {r1,r2};\nsust.b.a2d.b32         [surf_D, {idx,x,y,z}], r0;  // z ignored\n
\n
\n
", "tooltip": "Store to surface memory.\n\nSyntax\n\nsust.b.{1d,2d,3d}{.cop}.vec.ctype.clamp [a, b], c; // unformatted\n\nsust.p.{1d,2d,3d}.vec.b32.clamp [a, b], c; // formatted\n\nsust.b.{a1d,a2d}{.cop}.vec.ctype.clamp [a, b], c; // unformatted\n\n.cop = { .wb, .cg, .cs, .wt }; // cache operation\n\n.vec = { none, .v2, .v4 };\n\n.ctype = { .b8 , .b16, .b32, .b64 };\n\n.clamp = { .trap, .clamp, .zero };\n\nDescription\n\nsust.{1d,2d,3d}\n\nStore to surface memory using a surface coordinate vector. The instruction stores data from operand\n\nc to the surface named by operand a at coordinates given by operand b. Operand a is\n\na .surfref variable or .u64 register. Operand b is a scalar or singleton tuple for 1d\n\nsurfaces; is a two-element vector for 2d surfaces; and is a four-element vector for 3d surfaces,\n\nwhere the fourth element is ignored. Coordinate elements are of type .s32.\n\nsust.b performs an unformatted store of binary data. The lowest dimension coordinate represents\n\na byte offset into the surface and is not scaled. The size of the data transfer matches the size of\n\nsource operand c.\n\nsust.p performs a formatted store of a vector of 32-bit data values to a surface sample. The\n\nsource vector elements are interpreted left-to-right as R, G, B, and A surface\n\ncomponents. These elements are written to the corresponding surface sample components. Source\n\nelements that do not occur in the surface sample are ignored. Surface sample components that do not\n\noccur in the source vector will be written with an unpredictable value. The lowest dimension\n\ncoordinate represents a sample offset rather than a byte offset.\n\nThe source data interpretation is based on the surface sample format as follows: If the surface\n\nformat contains UNORM, SNORM, or FLOAT data, then .f32 is assumed; if the surface\n\nformat contains UINT data, then .u32 is assumed; if the surface format contains SINT\n\ndata, then .s32 is assumed. The source data is then converted from this type to the surface\n\nsample format.\n\nsust.b.{a1d,a2d}\n\nSurface layer selection, followed by an unformatted store to the selected surface. The instruction\n\nfirst selects a surface layer from the surface array named by operand a using the index given by\n\nthe first element of the array coordinate vector b. The instruction then stores the data in\n\noperand c to the selected surface at coordinates given by the remaining elements of operand\n\nb. Operand a is a .surfref variable or .u64 register. Operand b is a bit-size type\n\nvector or tuple containing an index into the array of surfaces followed by coordinates within the\n\nselected surface, as follows:\n\nFor 1d surface arrays, operand b has type .v2.b32. The first element is interpreted as an\n\nunsigned integer index (.u32) into the surface array, and the second element is interpreted as\n\na 1d surface coordinate of type .s32.\n\nFor 2d surface arrays, operand b has type .v4.b32. The first element is interpreted as an\n\nunsigned integer index (.u32) into the surface array, and the next two elements are\n\ninterpreted as 2d surface coordinates of type .s32. The fourth element is ignored.\n\nA surface base address is assumed to be aligned to a 16 byte boundary, and the address given by the\n\ncoordinate vector must be naturally aligned to a multiple of the access size. 
If an address is not\n\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\n\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.\n\nThe .clamp field specifies how to handle out-of-bounds addresses:\n\n.trapcauses an execution trap on out-of-bounds addresses\n\n.clampstores data at the nearest surface location (sized appropriately)\n\n.zerodrops stores to out-of-bounds addresses\n\nIndirect surface access\n\nBeginning with PTX ISA version 3.1, indirect surface access is supported for target architecture\n\nsm_20 or higher. In indirect access, operand a is a .u64 register holding the address of\n\na .surfref variable.\n\nPTX ISA Notes\n\nsust.b.trap introduced in PTX ISA version 1.5.\u00a0 sust.p ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust" }; case "szext": return { "html": "For more information, visit szext(int) .

Integer Arithmetic Instructions: szext

\n\n\n

Sign-extend or Zero-extend.

\n

Syntax

\n
szext.mode.type  d, a, b;\n\n.mode = { .clamp, .wrap };\n.type = { .u32, .s32 };\n
\n
\n

Description

\n

Sign-extends or zero-extends an N-bit value from operand a where N is specified in operand\nb. The resulting value is stored in the destination operand d.

\n

For the .s32 instruction type, the value in a is treated as an N-bit signed value and the\nmost significant bit of this N-bit value is replicated up to bit 31. For the .u32 instruction\ntype, the value in a is treated as an N-bit unsigned number and is zero-extended to 32\nbits. Operand b is an unsigned 32-bit value.

\n

If the value of N is 0, then the result of szext is 0. If the value of N is 32 or higher, then\nthe result of szext depends upon the value of the .mode qualifier as follows:

\n
    \n
  • If .mode is .clamp, then the result is the same as the source operand a.

  • \n
  • If .mode is .wrap, then the result is computed using the wrapped value of N.

  • \n
\n

Semantics

\n
b1        = b & 0x1f;\ntoo_large = (b >= 32 && .mode == .clamp) ? true : false;\nmask      = too_large ? 0 : (~0) << b1;\nsign_pos  = (b1 - 1) & 0x1f;\n\nif (b1 == 0 || too_large || .type != .s32) {\n    sign_bit = false;\n} else {\n    sign_bit = (a >> sign_pos) & 1;\n}\nd = (a & ~mask) | (sign_bit ? mask : 0);\n
\n
\n
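
A worked illustration (an addition, not part of the ISA text): with .wrap, a width operand of 33 wraps to N = 1, so only bit 0 of the source is kept and, for .s32, replicated into bits 1..31; with .clamp, any N of 32 or higher leaves the source unchanged.

\n
szext.wrap.s32  rd, ra, 33;  // N wraps to 1: ra = 0x00000001 gives rd = 0xFFFFFFFF\nszext.clamp.s32 rd, ra, 33;  // N >= 32 clamps: rd = ra\n
\n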

PTX ISA Notes

\n

Introduced in PTX ISA version 7.6.

\n

Target ISA Notes

\n

szext requires sm_70 or higher.

\n

Examples

\n
szext.clamp.s32 rd, ra, rb;\nszext.wrap.u32  rd, 0xffffffff, 0; // Result is 0.\n
\n
\n
", "tooltip": "Sign-extend or Zero-extend.\n\nSyntax\n\nszext.mode.type d, a, b;\n\n.mode = { .clamp, .wrap };\n\n.type = { .u32, .s32 };\n\nDescription\n\nSign-extends or zero-extends an N-bit value from operand a where N is specified in operand\n\nb. The resulting value is stored in the destination operand d.\n\nFor the .s32 instruction type, the value in a is treated as an N-bit signed value and the\n\nmost significant bit of this N-bit value is replicated up to bit 31. For the .u32 instruction\n\ntype, the value in a is treated as an N-bit unsigned number and is zero-extended to 32\n\nbits. Operand b is an unsigned 32-bit value.\n\nIf the value of N is 0, then the result of szext is 0. If the value of N is 32 or higher, then\n\nthe result of szext depends upon the value of the .mode qualifier as follows:\n\nIf .mode is .clamp, then the result is the same as the source operand a.\n\nIf .mode is .wrap, then the result is computed using the wrapped value of N.\n\nSemantics\n\nb1 = b & 0x1f;\n\ntoo_large = (b >= 32 && .mode == .clamp) ? true : false;\n\nmask = too_large ? 0 : (~0) << b1;\n\nsign_pos = (b1 - 1) & 0x1f;\n\nif (b1 == 0 || too_large || .type != .s32) {\n\n sign_bit = false;\n\n} else {\n\n sign_bit = (a >> sign_pos) & 1;\n\n}\n\nd = (a & ~mask) | (sign_bit ? mask | 0);\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.6.\n\nTarget ISA Notes\n\nszext requires sm_70 or higher.\n\nExamples\n\nszext.clamp.s32 rd, ra, rb;\n\nszext.wrap.u32 rd, 0xffffffff, 0; // Result is 0.\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-szext" }; case "tanh": return { "html": "For more information, visit tanh(fp) , tanh(fp16) .

Floating Point Instructions: tanh

\n\n\n

Find the hyperbolic tangent of a value (in radians)

\n

Syntax

\n
tanh.approx.f32 d, a;\n
\n
\n

Description

\n

Take hyperbolic tangent value of a.

\n

The operands d and a are of type .f32.

\n

Semantics

\n
d = tanh(a);\n
\n
\n

Notes

\n

tanh.approx.f32 implements a fast approximation to FP32 hyperbolic-tangent.

\n

Results of tanh for various corner-case inputs are as follows:

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Input

Result

-Inf

-1.0

-subnormal

Same as input

-0.0

-0.0

+0.0

+0.0

+subnormal

Same as input

+Inf

1.0

NaN

NaN

\n

The subnormal numbers are supported.

\n
\n

Note

\n

The subnormal inputs get passed through to the output since the value of tanh(x) for small\nvalues of x is approximately the same as x.

\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

Target ISA Notes

\n

Requires sm_75 or higher.

\n

Examples

\n
tanh.approx.f32 sa, a;\n
\n
\n
\n

Half Precision Floating Point Instructions: tanh

\n\n\n

Find the hyperbolic tangent of a value (in radians)

\n

Syntax

\n
tanh.approx.type d, a;\n\n.type = {.f16, .f16x2, .bf16, .bf16x2}\n
\n
\n

Description

\n

Take hyperbolic tangent value of a.

\n

The type of operands d and a are as specified by .type.

\n

For .f16x2 or .bf16x2 instruction type, each of the half-word operands are operated in\nparallel and the results are packed appropriately into a .f16x2 or .bf16x2.

\n

Semantics

\n
if (.type == .f16 || .type == .bf16) {\n  d = tanh(a)\n} else if (.type == .f16x2 || .type == .bf16x2) {\n  fA[0] = a[0:15];\n  fA[1] = a[16:31];\n  d[0] = tanh(fA[0])\n  d[1] = tanh(fA[1])\n}\n
\n
\n

Notes

\n

tanh.approx.{f16, f16x2, bf16, bf16x2} implements an approximate hyperbolic tangent in the\ntarget format.

\n

Results of tanh for various corner-case inputs are as follows:

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Input

Result

-Inf

-1.0

-0.0

-0.0

+0.0

+0.0

+Inf

1.0

NaN

NaN

\n

The maximum absolute error for .f16 type is 2^-10.987. The maximum absolute error for .bf16\ntype is 2^-8.

\n

The subnormal numbers are supported.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 7.0.

\n

tanh.approx.{bf16/bf16x2} introduced in PTX ISA version 7.8.

\n

Target ISA Notes

\n

Requires sm_75 or higher.

\n

tanh.approx.{bf16/bf16x2} requires sm_90 or higher.

\n

Examples

\n
tanh.approx.f16    h1, h0;\ntanh.approx.f16x2  hd1, hd0;\ntanh.approx.bf16   b1, b0;\ntanh.approx.bf16x2 hb1, hb0;\n
\n
\n
", "tooltip": "=====Floating Point Instructions: tanh\n\n\n\nFind the hyperbolic tangent of a value (in radians)\n\nSyntax\n\ntanh.approx.f32 d, a;\n\nDescription\n\nTake hyperbolic tangent value of a.\n\nThe operands d and a are of type .f32.\n\nSemantics\n\nd = tanh(a);\n\nNotes\n\ntanh.approx.f32 implements a fast approximation to FP32 hyperbolic-tangent.\n\nResults of tanh for various corner-case inputs are as follows:\n\n\n\nInput\n\nResult\n\n-Inf\n\n-1.0\n\n-subnormal\n\nSame as input\n\n-0.0\n\n-0.0\n\n+0.0\n\n...\n\n=====Half Precision Floating Point Instructions: tanh\n\n\n\nFind the hyperbolic tangent of a value (in radians)\n\nSyntax\n\ntanh.approx.type d, a;\n\n.type = {.f16, .f16x2, .bf16, .bf16x2}\n\nDescription\n\nTake hyperbolic tangent value of a.\n\nThe type of operands d and a are as specified by .type.\n\nFor .f16x2 or .bf16x2 instruction type, each of the half-word operands are operated in\n\nparallel and the results are packed appropriately into a .f16x2 or .bf16x2.\n\nSemantics\n\nif... ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-tanh" }; case "target": return { "html": "For more information, visit target .

PTX Module Directives: .target

\n\n\n

Architecture and Platform target.

\n

Syntax

\n
.target stringlist         // comma separated list of target specifiers\nstring = { sm_90a, sm_90,               // sm_9x target architectures\n           sm_80, sm_86, sm_87, sm_89,  // sm_8x target architectures\n           sm_70, sm_72, sm_75,         // sm_7x target architectures\n           sm_60, sm_61, sm_62,         // sm_6x target architectures\n           sm_50, sm_52, sm_53,         // sm_5x target architectures\n           sm_30, sm_32, sm_35, sm_37,  // sm_3x target architectures\n           sm_20,                       // sm_2x target architectures\n           sm_10, sm_11, sm_12, sm_13,  // sm_1x target architectures\n           texmode_unified, texmode_independent,   // texturing mode\n           debug,                                  // platform option\n           map_f64_to_f32 };                       // platform option\n
\n
\n

Description

\n

Specifies the set of features in the target architecture for which the current PTX code was\ngenerated. In general, generations of SM architectures follow an onion layer model, where each\ngeneration adds new features and retains all features of previous generations. The onion layer model\nallows the PTX code generated for a given target to be run on later generation devices.

\n

Target architectures with suffix \u201ca\u201d, such as sm_90a, include architecture-accelerated\nfeatures that are supported on the specified architecture only, hence such targets do not follow the\nonion layer model. Therefore, PTX code generated for such targets cannot be run on later generation\ndevices. Architecture-accelerated features can only be used with targets that support these\nfeatures.

\n

Semantics

\n

Each PTX module must begin with a .version directive, immediately followed by a .target\ndirective containing a target architecture and optional platform options. A .target directive\nspecifies a single target architecture, but subsequent .target directives can be used to change\nthe set of target features allowed during parsing. A program with multiple .target directives\nwill compile and run only on devices that support all features of the highest-numbered architecture\nlisted in the program.

\n
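
For example, a minimal module prologue (a sketch; the body of the module is omitted):

\n
.version 8.0\n.target sm_90\n.address_size 64\n
\n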

PTX features are checked against the specified target architecture, and an error is generated if an\nunsupported feature is used.\u00a0 The following table summarizes the features in PTX that vary according\nto target architecture.

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Target

Description

sm_90

Baseline feature set for sm_90 architecture.

sm_90a

Adds support for sm_90a accelerated wgmma and setmaxnreg instructions.

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Target

Description

sm_80

Baseline feature set for sm_80 architecture.

sm_86

Adds support for .xorsign modifier on min and max instructions.

sm_87

Baseline feature set for sm_86 architecture.

sm_89

Baseline feature set for sm_86 architecture.

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Target

Description

sm_70

Baseline feature set for sm_70 architecture.

sm_72

Adds support for integer multiplicand and accumulator matrices in wmma instructions.

\n

Adds support for cvt.pack instruction.

\n

sm_75

Adds support for sub-byte integer and single-bit multiplicand matrices in wmma instructions.

\n

Adds support for ldmatrix instruction.

\n

Adds support for movmatrix instruction.

\n

Adds support for tanh instruction.

\n
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Target

Description

sm_60

Baseline feature set for sm_60 architecture.

sm_61

Adds support for dp2a and dp4a instructions.

sm_62

Baseline feature set for sm_61 architecture.

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Target

Description

sm_50

Baseline feature set for sm_50 architecture.

sm_52

Baseline feature set for sm_50 architecture.

sm_53

Adds support for arithmetic, comparison and texture instructions for .f16 and .f16x2 types.

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Target

Description

sm_30

Baseline feature set for sm_30 architecture.

sm_32

Adds 64-bit {atom,red}.{and,or,xor,min,max}\ninstructions.

\n

Adds shf instruction.

\n

Adds ld.global.nc instruction.

\n

sm_35

Adds support for CUDA Dynamic Parallelism.

sm_37

Baseline feature set for sm_35 architecture.

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Target

Description

sm_20

Baseline feature set for sm_20 architecture.

\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Target

Description

sm_10

Baseline feature set for sm_10 architecture.

\n

Requires map_f64_to_f32 if any .f64 instructions used.

\n

sm_11

Adds 64-bit {atom,red}.{and,or,xor,min,max} instructions.

\n

Requires map_f64_to_f32 if any .f64 instructions used.

\n

sm_12

Adds {atom,red}.shared, 64-bit {atom,red}.global, vote\ninstructions.

\n

Requires map_f64_to_f32 if any .f64 instructions used.

\n

sm_13

Adds double-precision support, including expanded rounding modifiers.

\n

Disallows use of map_f64_to_f32.

\n
\n

The texturing mode is specified for an entire module and cannot be changed within the module.

\n

The .target debug option declares that the PTX file contains DWARF debug information, and\nsubsequent compilation of PTX will retain information needed for source-level debugging. If the\ndebug option is declared, an error message is generated if no DWARF information is found in the\nfile. The debug option requires PTX ISA version 3.0 or later.

\n

map_f64_to_f32 indicates that all double-precision instructions map to single-precision\nregardless of the target architecture. This enables high-level language compilers to compile\nprograms containing type double to target devices that do not support double-precision\noperations. Note that .f64 storage remains as 64-bits, with only half being used by instructions\nconverted from .f64 to .f32.

\n
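
For instance (an illustrative sketch, not from the source text), a module compiled for an sm_1x device without double-precision hardware could declare:

\n
.target sm_11, map_f64_to_f32  // .f64 arithmetic is executed in .f32 precision\n
\n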

Notes

\n

Targets of the form compute_xx are also accepted as synonyms for sm_xx targets.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target strings sm_10 and sm_11 introduced in PTX ISA version 1.0.

\n

Target strings sm_12 and sm_13 introduced in PTX ISA version 1.2.

\n

Texturing mode introduced in PTX ISA version 1.5.

\n

Target string sm_20 introduced in PTX ISA version 2.0.

\n

Target string sm_30 introduced in PTX ISA version 3.0.

\n

Platform option debug introduced in PTX ISA version 3.0.

\n

Target string sm_35 introduced in PTX ISA version 3.1.

\n

Target strings sm_32 and sm_50 introduced in PTX ISA version 4.0.

\n

Target strings sm_37 and sm_52 introduced in PTX ISA version 4.1.

\n

Target string sm_53 introduced in PTX ISA version 4.2.

\n

Target string sm_60, sm_61, sm_62 introduced in PTX ISA version 5.0.

\n

Target string sm_70 introduced in PTX ISA version 6.0.

\n

Target string sm_72 introduced in PTX ISA version 6.1.

\n

Target string sm_75 introduced in PTX ISA version 6.3.

\n

Target string sm_80 introduced in PTX ISA version 7.0.

\n

Target string sm_86 introduced in PTX ISA version 7.1.

\n

Target string sm_87 introduced in PTX ISA version 7.4.

\n

Target string sm_89 introduced in PTX ISA version 7.8.

\n

Target string sm_90 introduced in PTX ISA version 7.8.

\n

Target string sm_90a introduced in PTX ISA version 8.0.

\n

Target ISA Notes

\n

The .target directive is supported on all target architectures.

\n

Examples

\n
.target sm_10       // baseline target architecture\n.target sm_13       // supports double-precision\n.target sm_20, texmode_independent\n.target sm_90       // baseline target architecture\n.target sm_90a      // PTX using arch accelerated features\n
\n
\n
", "tooltip": "Architecture and Platform target.\n\nSyntax\n\n.target stringlist // comma separated list of target specifiers\n\nstring = { sm_90a, sm_90, // sm_9x target architectures\n\n sm_80, sm_86, sm_87, sm_89, // sm_8x target architectures\n\n sm_70, sm_72, sm_75, // sm_7x target architectures\n\n sm_60, sm_61, sm_62, // sm_6x target architectures\n\n sm_50, sm_52, sm_53, // sm_5x target architectures\n\n sm_30, sm_32, sm_35, sm_37, // sm_3x target architectures\n\n sm_20, // sm_2x target architectures\n\n sm_10, sm_11, sm_12, sm_13, // sm_1x target architectures\n\n texmode_unified, texmode_independent, // texturing mode\n\n debug, // platform option\n\n map_f64_to_f32 }; // platform option\n\nDescription\n\nSpecifies the set of features in the target architecture for which the current PTX code was\n\ngenerated. In general, generations of SM architectures follow an onion layer model, where each\n\ngeneration adds new features and retains all features of previous generations. The onion layer model\n\nallows the PTX code generated for a given target to be run on later generation devices.\n\nTarget architectures with suffix \u201ca\u201d, such as sm_90a, include architecture-accelerated\n\nfeatures that are supported on the specified architecture only, hence such targets do not follow the\n\nonion layer model. Therefore, PTX code generated for such targets cannot be run on later generation\n\ndevices. Architecture-accelerated features can only be used with targets that support these\n\nfeatures.\n\nSemantics\n\nEach PTX module must begin with a .version directive, immediately followed by a .target\n\ndirective containing a target architecture and optional platform options. A .target directive\n\nspecifies a single target architecture, but subsequent .target directives can be used to change\n\nthe set of target features allowed during parsing. 
A program with multiple .target directives\n\nwill compile and run only on devices that support all features of the highest-numbered architecture\n\nlisted in the program.\n\nPTX features are checked against the specified target architecture, and an error is generated if an\n\nunsupported feature is used.\u00a0 The following table summarizes the features in PTX that vary according\n\nto target architecture.\n\n\n\nTarget\n\nDescription\n\nsm_90\n\nBaseline feature set for sm_90 architecture.\n\nsm_90a\n\nAdds support for sm_90a accelerated wgmma and setmaxnreg instructions.\n\n\n\n\n\nTarget\n\nDescription\n\nsm_80\n\nBaseline feature set for sm_80 architecture.\n\nsm_86\n\nAdds support for .xorsign modifier on min and max instructions.\n\nsm_87\n\nBaseline feature set for sm_86 architecture.\n\nsm_89\n\nBaseline feature set for sm_86 architecture.\n\n\n\n\n\nTarget\n\nDescription\n\nsm_70\n\nBaseline feature set for sm_70 architecture.\n\nsm_72\n\nAdds support for integer multiplicand and accumulator matrices in wmma instructions.\n\nAdds support for cvt.pack instruction.\n\nsm_75\n\nAdds support for sub-byte integer and single-bit multiplicand matrices in wmma instructions.\n\nAdds support for ldmatrix instruction.\n\nAdds support for movmatrix instruction.\n\nAdds support for tanh instruction.\n\n\n\n\n\nTarget\n\nDescription\n\nsm_60\n\nBaseline feature set for sm_60 architecture.\n\nsm_61\n\nAdds support for dp2a and dp4a instructions.\n\nsm_62\n\nBaseline feature set for sm_61 architecture.\n\n\n\n\n\nTarget\n\nDescription\n\nsm_50\n\nBaseline feature set for sm_50 architecture.\n\nsm_52\n\nBaseline feature set for sm_50 architecture.\n\nsm_53\n\nAdds support for arithmetic, comparison and texture instructions for .f16 and .f16x2 types.\n\n\n\n\n\nTarget\n\nDescription\n\nsm_30\n\nBaseline feature set for sm_30 architecture.\n\nsm_32\n\nAdds 64-bit {atom,red}.{and,or,xor,min,max}\n\ninstructions.\n\nAdds shf instruction.\n\nAdds ld.global.nc instruction.\n\nsm_35\n\nAdds support for CUDA Dynamic Parallelism.\n\nsm_37\n\nBaseline feature set for sm_35 architecture.\n\n\n\n\n\nTarget\n\nDescription\n\nsm_20\n\nBaseline feature set for sm_20 architecture.\n\n\n\n\n\nTarget\n\nDescription\n\nsm_10\n\nBaseline feature se ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#ptx-module-directives-target" }; case "testp": return { "html": "For more information, visit testp(fp) .

Floating Point Instructions: testp

\n\n\n

Test floating-point property.

\n

Syntax

\n
testp.op.type  p, a;  // result is .pred\n\n.op   = { .finite, .infinite,\n          .number, .notanumber,\n          .normal, .subnormal };\n.type = { .f32, .f64 };\n
\n
\n

Description

\n

testp tests common properties of floating-point numbers and returns a predicate value of 1\nif True and 0 if False.

\n
\n
testp.finite

True if the input is not infinite or NaN

\n
\n
testp.infinite

True if the input is positive or negative infinity

\n
\n
testp.number

True if the input is not NaN

\n
\n
testp.notanumber

True if the input is NaN

\n
\n
testp.normal

True if the input is a normal number (not NaN, not infinity)

\n
\n
testp.subnormal

True if the input is a subnormal number (not NaN, not infinity)

\n
\n
\n

As a special case, positive and negative zero are considered normal numbers.

\n
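
A small sketch of that special case (hypothetical register names, not from the source text):

\n
mov.f32             f0, 0f80000000;  // f0 = -0.0\ntestp.normal.f32    p, f0;           // p = 1: zeros count as normal numbers\ntestp.subnormal.f32 q, f0;           // q = 0\n
\n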

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

Requires sm_20 or higher.

\n

Examples

\n
testp.notanumber.f32  isnan, f0;\ntestp.infinite.f64    p, X;\n
\n
\n
", "tooltip": "Test floating-point property.\n\nSyntax\n\ntestp.op.type p, a; // result is .pred\n\n.op = { .finite, .infinite,\n\n .number, .notanumber,\n\n .normal, .subnormal };\n\n.type = { .f32, .f64 };\n\nDescription\n\ntestp tests common properties of floating-point numbers and returns a predicate value of 1\n\nif True and 0 if False.\n\ntestp.finiteTrue if the input is not infinite or NaN\n\ntestp.infiniteTrue if the input is positive or negative infinity\n\ntestp.numberTrue if the input is not NaN\n\ntestp.notanumberTrue if the input is NaN\n\ntestp.normalTrue if the input is a normal number (not NaN, not infinity)\n\ntestp.subnormalTrue if the input is a subnormal number (not NaN, not infinity)\n\nAs a special case, positive and negative zero are considered normal numbers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\nExamples\n\ntestp.notanumber.f32 isnan, f0;\n\ntestp.infinite.f64 p, X;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-testp" }; case "tex": return { "html": "For more information, visit tex .

Texture Instructions: tex

\n\n\n

Perform a texture memory lookup.

\n

Syntax

\n
tex.geom.v4.dtype.ctype  d, [a, c] {, e} {, f};\ntex.geom.v4.dtype.ctype  d[|p], [a, b, c] {, e} {, f};  // explicit sampler\n\ntex.geom.v2.f16x2.ctype  d[|p], [a, c] {, e} {, f};\ntex.geom.v2.f16x2.ctype  d[|p], [a, b, c] {, e} {, f};  // explicit sampler\n\n// mipmaps\ntex.base.geom.v4.dtype.ctype   d[|p], [a, {b,} c] {, e} {, f};\ntex.level.geom.v4.dtype.ctype  d[|p], [a, {b,} c], lod {, e} {, f};\ntex.grad.geom.v4.dtype.ctype   d[|p], [a, {b,} c], dPdx, dPdy {, e} {, f};\n\ntex.base.geom.v2.f16x2.ctype   d[|p], [a, {b,} c] {, e} {, f};\ntex.level.geom.v2.f16x2.ctype  d[|p], [a, {b,} c], lod {, e} {, f};\ntex.grad.geom.v2.f16x2.ctype   d[|p], [a, {b,} c], dPdx, dPdy {, e} {, f};\n\n.geom  = { .1d, .2d, .3d, .a1d, .a2d, .cube, .acube, .2dms, .a2dms };\n.dtype = { .u32, .s32, .f16,  .f32 };\n.ctype = {       .s32, .f32 };          // .cube, .acube require .f32\n                                        // .2dms, .a2dms require .s32\n
\n
\n

Description

\n

tex.{1d,2d,3d}

\n

Texture lookup using a texture coordinate vector. The instruction loads data from the texture named\nby operand a at coordinates given by operand c into destination d. Operand c is a\nscalar or singleton tuple for 1d textures; is a two-element vector for 2d textures; and is a\nfour-element vector for 3d textures, where the fourth element is ignored. An optional texture\nsampler b may be specified. If no sampler is specified, the sampler behavior is a property of\nthe named texture. The optional destination predicate p is set to True if data from texture\nat specified coordinates is resident in memory, False otherwise. When optional destination\npredicate p is set to False, data loaded will be all zeros. Memory residency of Texture Data\nat specified coordinates is dependent on execution environment setup using Driver API calls, prior\nto kernel launch. Refer to Driver API documentation for more details including any\nsystem/implementation specific behavior.

\n

An optional operand e may be specified. Operand e is a vector of .s32 values that\nspecifies coordinate offset. Offset is applied to coordinates before doing texture lookup. Offset\nvalue is in the range of -8 to +7. Operand e is a singleton tuple for 1d textures; is a two-element\nvector for 2d textures; and is a four-element vector for 3d textures, where the fourth element is\nignored.

\n

An optional operand f may be specified for depth textures. Depth textures are a special type\nof texture which holds data from the depth buffer. The depth buffer contains depth information of each\npixel. Operand f is .f32 scalar value that specifies depth compare value for depth\ntextures. Each element fetched from texture is compared against value given in f operand. If\ncomparison passes, result is 1.0; otherwise result is 0.0. These per-element comparison results are\nused for the filtering. When using depth compare operand, the elements in texture coordinate vector\nc have .f32 type.

\n

Depth compare operand is not supported for 3d textures.

\n

The instruction returns a two-element vector for destination type .f16x2. For all other\ndestination types, the instruction returns a four-element vector. Coordinates may be given in either\nsigned 32-bit integer or 32-bit floating point form.

\n

A texture base address is assumed to be aligned to a 16 byte boundary, and the address given by the\ncoordinate vector must be naturally aligned to a multiple of the access size. If an address is not\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.

\n

tex.{a1d,a2d}

\n

Texture array selection, followed by texture lookup. The instruction first selects a texture from\nthe texture array named by operand a using the index given by the first element of the array\ncoordinate vector c. The instruction then loads data from the selected texture at coordinates\ngiven by the remaining elements of operand c into destination d. Operand c is a bit-size\ntype vector or tuple containing an index into the array of textures followed by coordinates within\nthe selected texture, as follows:

\n
    \n
  • For 1d texture arrays, operand c has type .v2.b32. The first element is interpreted as an\nunsigned integer index (.u32) into the texture array, and the second element is interpreted as\na 1d texture coordinate of type .ctype.

  • \n
  • For 2d texture arrays, operand c has type .v4.b32. The first element is interpreted as an\nunsigned integer index (.u32) into the texture array, and the next two elements are\ninterpreted as 2d texture coordinates of type .ctype. The fourth element is ignored.

  • \n
\n

An optional texture sampler b may be specified. If no sampler is specified, the sampler behavior\nis a property of the named texture.

\n

An optional operand e may be specified. Operand e is a vector of .s32 values that\nspecifies coordinate offset. Offset is applied to coordinates before doing texture lookup. Offset\nvalue is in the range of -8 to +7. Operand e is a singleton tuple for 1d texture arrays; and is\na two-element vector for 2d texture arrays.

\n

An optional operand f may be specified for depth textures arrays. Operand f is .f32\nscalar value that specifies depth compare value for depth textures. When using depth compare\noperand, the coordinates in texture coordinate vector c have .f32 type.

\n

The instruction returns a two-element vector for destination type .f16x2. For all other\ndestination types, the instruction returns a four-element vector. The texture array index is a\n32-bit unsigned integer, and texture coordinate elements are 32-bit signed integer or floating point\nvalues.

\n

The optional destination predicate p is set to True if data from texture at specified\ncoordinates is resident in memory, False otherwise. When optional destination predicate p is\nset to False, data loaded will be all zeros. Memory residency of Texture Data at specified\ncoordinates is dependent on execution environment setup using Driver API calls, prior to kernel\nlaunch. Refer to Driver API documentation for more details including any system/implementation\nspecific behavior.

\n

tex.cube

\n

Cubemap texture lookup. The instruction loads data from the cubemap texture named by operand a\nat coordinates given by operand c into destination d. Cubemap textures are special\ntwo-dimensional layered textures consisting of six layers that represent the faces of a cube. All\nlayers in a cubemap are of the same size and are square (i.e., width equals height).

\n

When accessing a cubemap, the texture coordinate vector c has type .v4.f32, and comprises\nthree floating-point coordinates (s, t, r) and a fourth padding argument which is\nignored. Coordinates (s, t, r) are projected onto one of the six cube faces. The (s,\nt, r) coordinates can be thought of as a direction vector emanating from the center of the\ncube. Of the three coordinates (s, t, r), the coordinate of the largest magnitude (the\nmajor axis) selects the cube face. Then, the other two coordinates (the minor axes) are divided by\nthe absolute value of the major axis to produce a new (s, t) coordinate pair to lookup into\nthe selected cube face.

\n
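
As a worked illustration (added here, not from the source text): for (s, t, r) = (0.5, -2.0, 1.0), t has the largest magnitude, so the negative-t cube face is selected, and the two remaining coordinates are each divided by |t| = 2.0, giving in-face lookup values of magnitude 0.25 and 0.5.
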

An optional texture sampler b may be specified. If no sampler is specified, the sampler behavior\nis a property of the named texture.

\n

Offset vector operand e is not supported for cubemap textures.

\n

An optional operand f may be specified for cubemap depth textures. Operand f is .f32\nscalar value that specifies depth compare value for cubemap depth textures.

\n

The optional destination predicate p is set to True if data from texture at specified\ncoordinates is resident in memory, False otherwise. When optional destination predicate p is\nset to False, data loaded will be all zeros. Memory residency of Texture Data at specified\ncoordinates is dependent on execution environment setup using Driver API calls, prior to kernel\nlaunch. Refer to Driver API documentation for more details including any system/implementation\nspecific behavior.

\n

tex.acube

\n

Cubemap array selection, followed by cubemap lookup. The instruction first selects a cubemap texture\nfrom the cubemap array named by operand a using the index given by the first element of the\narray coordinate vector c. The instruction then loads data from the selected cubemap texture at\ncoordinates given by the remaining elements of operand c into destination d.

\n

Cubemap array textures consist of an array of cubemaps, i.e., the total number of layers is a\nmultiple of six. When accessing a cubemap array texture, the coordinate vector c has type\n.v4.b32. The first element is interpreted as an unsigned integer index (.u32) into the\ncubemap array, and the remaining three elements are interpreted as floating-point cubemap\ncoordinates (s, t, r), used to lookup in the selected cubemap as described above.

\n

An optional texture sampler b may be specified. If no sampler is specified, the sampler behavior\nis a property of the named texture.

\n

Offset vector operand e is not supported for cubemap texture arrays.

\n

An optional operand f may be specified for cubemap depth texture arrays. Operand f is\n.f32 scalar value that specifies depth compare value for cubemap depth textures.

\n

The optional destination predicate p is set to True if data from texture at specified\ncoordinates is resident in memory, False otherwise. When optional destination predicate p is\nset to False, data loaded will be all zeros. Memory residency of Texture Data at specified\ncoordinates is dependent on execution environment setup using Driver API calls, prior to kernel\nlaunch. Refer to Driver API documentation for more details including any system/implementation\nspecific behavior.

\n

tex.2dms

\n

Multi-sample texture lookup using a texture coordinate vector. Multi-sample textures consist of\nmultiple samples per data element. The instruction loads data from the texture named by operand\na from sample number given by first element of the operand c, at coordinates given by\nremaining elements of operand c into destination d. When accessing a multi-sample texture,\ntexture coordinate vector c has type .v4.b32. The first element in operand c is\ninterpreted as unsigned integer sample number (.u32), and the next two elements are interpreted\nas signed integer (.s32) 2d texture coordinates. The fourth element is ignored. An optional\ntexture sampler b may be specified. If no sampler is specified, the sampler behavior is a\nproperty of the named texture.

\n

An optional operand e may be specified. Operand e is a vector of type .v2.s32 that\nspecifies coordinate offset. Offset is applied to coordinates before doing texture lookup. Offset\nvalue is in the range of -8 to +7.

\n

Depth compare operand f is not supported for multi-sample textures.

\n

The optional destination predicate p is set to True if data from texture at specified\ncoordinates is resident in memory, False otherwise. When optional destination predicate p is\nset to False, data loaded will be all zeros. Memory residency of Texture Data at specified\ncoordinates is dependent on execution environment setup using Driver API calls, prior to kernel\nlaunch. Refer to Driver API documentation for more details including any system/implementation\nspecific behavior.

\n

tex.a2dms

\n

Multi-sample texture array selection, followed by multi-sample texture lookup. The instruction first\nselects a multi-sample texture from the multi-sample texture array named by operand a using the\nindex given by the first element of the array coordinate vector c. The instruction then loads\ndata from the selected multi-sample texture from sample number given by second element of the\noperand c, at coordinates given by remaining elements of operand c into destination\nd. When accessing a multi-sample texture array, texture coordinate vector c has type\n.v4.b32. The first element in operand c is interpreted as an unsigned integer index (.u32)\ninto the multi-sample texture array, the second element is interpreted as an unsigned integer\nsample number, and the next two elements are interpreted as signed integer (.s32) 2d texture\ncoordinates. An optional texture sampler b may be specified. If no sampler is specified, the\nsampler behavior is a property of the named texture.

\n

An optional operand e may be specified. Operand e is a vector of type .v2.s32 values\nthat specifies coordinate offset. Offset is applied to coordinates before doing texture\nlookup. Offset value is in the range of -8 to +7.

\n

Depth compare operand f is not supported for multi-sample texture arrays.

\n

The optional destination predicate p is set to True if data from texture at specified\ncoordinates is resident in memory, False otherwise. When optional destination predicate p is\nset to False, data loaded will be all zeros. Memory residency of Texture Data at specified\ncoordinates is dependent on execution environment setup using Driver API calls, prior to kernel\nlaunch. Refer to Driver API documentation for more details including any system/implementation\nspecific behavior.

\n

Mipmaps

\n
\n
.base (lod zero)

Pick level 0 (base level). This is the default if no mipmap mode is specified. No additional arguments.

\n
\n
.level (lod explicit)

Requires an additional 32-bit scalar argument, lod, which contains the LOD to fetch from. The\ntype of lod follows .ctype (either .s32 or .f32). Geometries .2dms and\n.a2dms are not supported in this mode.

\n
\n
.grad (lod gradient)

Requires two .f32 vectors, dPdx and dPdy, that specify the partials. The vectors are\nsingletons for 1d and a1d textures; are two-element vectors for 2d and a2d textures; and are\nfour-element vectors for 3d, cube and acube textures, where the fourth element is ignored for 3d\nand cube geometries. Geometries .2dms and .a2dms are not supported in this mode.

\n
\n
\n

For mipmap texture lookup, an optional operand e may be specified. Operand e is a vector of\n.s32 that specifies coordinate offset. Offset is applied to coordinates before doing texture\nlookup. Offset value is in the range of -8 to +7. Offset vector operand is not supported for cube\nand cubemap geometries.

\n

An optional operand f may be specified for mipmap textures. Operand f is .f32 scalar\nvalue that specifies depth compare value for depth textures. When using depth compare operand, the\ncoordinates in texture coordinate vector c have .f32 type.

\n

The optional destination predicate p is set to True if data from texture at specified\ncoordinates is resident in memory, False otherwise. When optional destination predicate p is\nset to False, data loaded will be all zeros. Memory residency of Texture Data at specified\ncoordinates is dependent on execution environment setup using Driver API calls, prior to kernel\nlaunch. Refer to Driver API documentation for more details including any system/implementation\nspecific behavior.

\n

Depth compare operand is not supported for 3d textures.

\n

Indirect texture access

\n

Beginning with PTX ISA version 3.1, indirect texture access is supported in unified mode for target\narchitecture sm_20 or higher. In indirect access, operand a is a .u64 register holding\nthe address of a .texref variable.

\n
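
A sketch of indirect access (hypothetical names; assumes unified mode and a .u64 kernel parameter holding the address of a .texref variable):

\n
ld.param.u64       %rd1, [tex_handle];               // hypothetical parameter\ntex.2d.v4.f32.f32  {f1,f2,f3,f4}, [%rd1, {f5,f6}];   // operand a is a .u64 register\n
\n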

Notes

\n

For compatibility with prior versions of PTX, the square brackets are not required and .v4\ncoordinate vectors are allowed for any geometry, with the extra elements being ignored.

\n

PTX ISA Notes

\n

Unified mode texturing introduced in PTX ISA version 1.0. Extension using opaque .texref and\n.samplerref types and independent mode texturing introduced in PTX ISA version 1.5.

\n

Texture arrays tex.{a1d,a2d} introduced in PTX ISA version 2.3.

\n

Cubemaps and cubemap arrays introduced in PTX ISA version 3.0.

\n

Support for mipmaps introduced in PTX ISA version 3.1.

\n

Indirect texture access introduced in PTX ISA version 3.1.

\n

Multi-sample textures and multi-sample texture arrays introduced in PTX ISA version 3.2.

\n

Support for textures returning .f16 and .f16x2 data introduced in PTX ISA version 4.2.

\n

Support for tex.grad.{cube, acube} introduced in PTX ISA version 4.3.

\n

Offset vector operand introduced in PTX ISA version 4.3.

\n

Depth compare operand introduced in PTX ISA version 4.3.

\n

Support for optional destination predicate introduced in PTX ISA version 7.1.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

The cubemap array geometry (.acube) requires sm_20 or higher.

\n

Mipmaps require sm_20 or higher.

\n

Indirect texture access requires sm_20 or higher.

\n

Multi-sample textures and multi-sample texture arrays require sm_30 or higher.

\n

Texture fetch returning .f16 and .f16x2 data require sm_53 or higher.

\n

tex.grad.{cube, acube} requires sm_20 or higher.

\n

Offset vector operand requires sm_30 or higher.

\n

Depth compare operand requires sm_30 or higher.

\n

Support for optional destination predicate requires sm_60 or higher.

\n

Examples

\n
 // Example of unified mode texturing\n // - f4 is required to pad four-element tuple and is ignored\n tex.3d.v4.s32.s32  {r1,r2,r3,r4}, [tex_a,{f1,f2,f3,f4}];\n\n // Example of independent mode texturing\n tex.1d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a,smpl_x,{f1}];\n\n // Example of 1D texture array, independent texturing mode\n tex.a1d.v4.s32.s32 {r1,r2,r3,r4}, [tex_a,smpl_x,{idx,s1}];\n\n // Example of 2D texture array, unified texturing mode\n // - f3 is required to pad four-element tuple and is ignored\n tex.a2d.v4.s32.f32 {r1,r2,r3,r4}, [tex_a,{idx,f1,f2,f3}];\n\n // Example of cubemap array, unified texturing mode\n tex.acube.v4.f32.f32 {r0,r1,r2,r3}, [tex_cuarray,{idx,f1,f2,f3}];\n\n // Example of multi-sample texture, unified texturing mode\n tex.2dms.v4.s32.s32 {r0,r1,r2,r3}, [tex_ms,{sample,r6,r7,r8}];\n\n // Example of multi-sample texture, independent texturing mode\n tex.2dms.v4.s32.s32 {r0,r1,r2,r3}, [tex_ms, smpl_x,{sample,r6,r7,r8}];\n\n // Example of multi-sample texture array, unified texturing mode\n tex.a2dms.v4.s32.s32 {r0,r1,r2,r3}, [tex_ams,{idx,sample,r6,r7}];\n\n // Example of texture returning .f16 data\n tex.1d.v4.f16.f32  {h1,h2,h3,h4}, [tex_a,smpl_x,{f1}];\n\n // Example of texture returning .f16x2 data\n tex.1d.v2.f16x2.f32  {h1,h2}, [tex_a,smpl_x,{f1}];\n\n // Example of 3d texture array access with tex.grad, unified texturing mode\n tex.grad.3d.v4.f32.f32 {%f4,%f5,%f6,%f7},[tex_3d,{%f0,%f0,%f0,%f0}],\n                 {fl0,fl1,fl2,fl3},{fl0,fl1,fl2,fl3};\n\n // Example of cube texture array access with tex.grad, unified texturing mode\n tex.grad.cube.v4.f32.f32 {%f4,%f5,%f6,%f7},[tex_cube,{%f0,%f0,%f0,%f0}],\n                 {fl0,fl1,fl2,fl3},{fl0,fl1,fl2,fl3};\n\n // Example of 1d texture lookup with offset, unified texturing mode\n tex.1d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a, {f1}], {r5};\n\n // Example of 2d texture array lookup with offset, unified texturing mode\n tex.a2d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a,{idx,f1,f2}], {f5,f6};\n\n // Example of 2d mipmap texture lookup with offset, unified texturing mode\n tex.level.2d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a,{f1,f2}],\n                          flvl, {r7, r8};\n\n // Example of 1d depth texture lookup with compare, unified texturing mode\n tex.1d.v4.f32.f32  {f1,f2,f3,f4}, [tex_a, {f1}], f0;\n\n // Example of depth 2d texture array lookup with offset, compare\n tex.a2d.v4.s32.f32  {f0,f1,f2,f3}, [tex_a,{idx,f4,f5}], {r5,r6}, f6;\n\n // Example of destination predicate use\n tex.3d.v4.s32.s32 {r1,r2,r3,r4}|p, [tex_a,{f1,f2,f3,f4}];\n
\n
\n
", "tooltip": "Perform a texture memory lookup.\n\nSyntax\n\ntex.geom.v4.dtype.ctype d, [a, c] {, e} {, f};\n\ntex.geom.v4.dtype.ctype d[|p], [a, b, c] {, e} {, f}; // explicit sampler\n\ntex.geom.v2.f16x2.ctype d[|p], [a, c] {, e} {, f};\n\ntex.geom.v2.f16x2.ctype d[|p], [a, b, c] {, e} {, f}; // explicit sampler\n\n// mipmaps\n\ntex.base.geom.v4.dtype.ctype d[|p], [a, {b,} c] {, e} {, f};\n\ntex.level.geom.v4.dtype.ctype d[|p], [a, {b,} c], lod {, e} {, f};\n\ntex.grad.geom.v4.dtype.ctype d[|p], [a, {b,} c], dPdx, dPdy {, e} {, f};\n\ntex.base.geom.v2.f16x2.ctype d[|p], [a, {b,} c] {, e} {, f};\n\ntex.level.geom.v2.f16x2.ctype d[|p], [a, {b,} c], lod {, e} {, f};\n\ntex.grad.geom.v2.f16x2.ctype d[|p], [a, {b,} c], dPdx, dPdy {, e} {, f};\n\n.geom = { .1d, .2d, .3d, .a1d, .a2d, .cube, .acube, .2dms, .a2dms };\n\n.dtype = { .u32, .s32, .f16, .f32 };\n\n.ctype = { .s32, .f32 }; // .cube, .acube require .f32\n\n // .2dms, .a2dms require .s32\n\nDescription\n\ntex.{1d,2d,3d}\n\nTexture lookup using a texture coordinate vector. The instruction loads data from the texture named\n\nby operand a at coordinates given by operand c into destination d. Operand c is a\n\nscalar or singleton tuple for 1d textures; is a two-element vector for 2d textures; and is a\n\nfour-element vector for 3d textures, where the fourth element is ignored. An optional texture\n\nsampler b may be specified. If no sampler is specified, the sampler behavior is a property of\n\nthe named texture. The optional destination predicate p is set to True if data from texture\n\nat specified coordinates is resident in memory, False otherwise. When optional destination\n\npredicate p is set to False, data loaded will be all zeros. Memory residency of Texture Data\n\nat specified coordinates is dependent on execution environment setup using Driver API calls, prior\n\nto kernel launch. Refer to Driver API documentation for more details including any\n\nsystem/implementation specific behavior.\n\nAn optional operand e may be specified. Operand e is a vector of .s32 values that\n\nspecifies coordinate offset. Offset is applied to coordinates before doing texture lookup. Offset\n\nvalue is in the range of -8 to +7. Operand e is a singleton tuple for 1d textures; is a two\n\nelement vector 2d textures; and is four-element vector for 3d textures, where the fourth element is\n\nignored.\n\nAn optional operand f may be specified for depth textures. Depth textures are special type\n\nof textures which hold data from the depth buffer. Depth buffer contains depth information of each\n\npixel. Operand f is .f32 scalar value that specifies depth compare value for depth\n\ntextures. Each element fetched from texture is compared against value given in f operand. If\n\ncomparison passes, result is 1.0; otherwise result is 0.0. These per-element comparison results are\n\nused for the filtering. When using depth compare operand, the elements in texture coordinate vector\n\nc have .f32 type.\n\nDepth compare operand is not supported for 3d textures.\n\nThe instruction returns a two-element vector for destination type .f16x2. For all other\n\ndestination types, the instruction returns a four-element vector. Coordinates may be given in either\n\nsigned 32-bit integer or 32-bit floating point form.\n\nA texture base address is assumed to be aligned to a 16 byte boundary, and the address given by the\n\ncoordinate vector must be naturally aligned to a multiple of the access size. 
If an address is not\n\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\n\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.\n\ntex.{a1d,a2d}\n\nTexture array selection, followed by texture lookup. The instruction first selects a texture from\n\nthe texture array named by operand a using the index given by the first element of the array\n\ncoordinate vector c. The instruction then loads data from the selected texture at coordinates\n\ngiven by the remaining elements of operand c into destination d. Operand c is a bit-size\n\ntype vect ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex" }; case "tid": return { "html": "For more information, visit tid .

Special Registers: %tid

\n\n\n

Thread identifier within a CTA.

\n

Syntax (predefined)

\n
.sreg .v4 .u32 %tid;                  // thread id vector\n.sreg .u32 %tid.x, %tid.y, %tid.z;    // thread id components\n
\n
\n

Description

\n

A predefined, read-only, per-thread special register initialized with the thread identifier within\nthe CTA. The %tid special register contains a 1D, 2D, or 3D vector to match the CTA shape; the\n%tid value in unused dimensions is 0. The fourth element is unused and always returns\nzero. The number of threads in each dimension is specified by the predefined special register\n%ntid.

\n

Every thread in the CTA has a unique %tid.

\n

%tid component values range from 0 through %ntid-1 in each CTA dimension.

\n

%tid.y == %tid.z == 0 in 1D CTAs. %tid.z == 0 in 2D CTAs.

\n

It is guaranteed that:

\n
0  <=  %tid.x <  %ntid.x\n0  <=  %tid.y <  %ntid.y\n0  <=  %tid.z <  %ntid.z\n
\n
\n
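
A common idiom built on these registers (a sketch, not part of the source text) computes a thread's global x index from %tid, %ntid, and %ctaid:

\n
mov.u32     %r0, %ctaid.x;          // CTA index in x\nmov.u32     %r1, %ntid.x;           // CTA width in x\nmov.u32     %r2, %tid.x;            // thread index within the CTA\nmad.lo.u32  %r3, %r0, %r1, %r2;     // global x = ctaid.x * ntid.x + tid.x\n
\n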

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0 with type .v4.u16.

\n

Redefined as type .v4.u32 in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\nmov and cvt instructions may be used to read the lower 16-bits of each component of\n%tid.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
mov.u32      %r1,%tid.x;  // move tid.x to %r1\n\n// legacy code accessing 16-bit components of %tid\nmov.u16      %rh,%tid.x;\ncvt.u32.u16  %r2,%tid.z;  // zero-extend tid.z to %r2\n
\n
\n
", "tooltip": "Thread identifier within a CTA.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %tid; // thread id vector\n\n.sreg .u32 %tid.x, %tid.y, %tid.z; // thread id components\n\nDescription\n\nA predefined, read-only, per-thread special register initialized with the thread identifier within\n\nthe CTA. The %tid special register contains a 1D, 2D, or 3D vector to match the CTA shape; the\n\n%tid value in unused dimensions is 0. The fourth element is unused and always returns\n\nzero. The number of threads in each dimension are specified by the predefined special register\n\n%ntid.\n\nEvery thread in the CTA has a unique %tid.\n\n%tid component values range from 0 through %ntid-1 in each CTA dimension.\n\n%tid.y == %tid.z == 0 in 1D CTAs. %tid.z == 0 in 2D CTAs.\n\nIt is guaranteed that:\n\n0 <= %tid.x < %ntid.x\n\n0 <= %tid.y < %ntid.y\n\n0 <= %tid.z < %ntid.z\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0 with type .v4.u16.\n\nRedefined as type .v4.u32 in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\n\nmov and cvt instructions may be used to read the lower 16-bits of each component of\n\n%tid.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u32 %r1,%tid.x; // move tid.x to %rh\n\n// legacy code accessing 16-bit components of %tid\n\nmov.u16 %rh,%tid.x;\n\ncvt.u32.u16 %r2,%tid.z; // zero-extend tid.z to %r2\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-tid" }; case "tld4": return { "html": "For more information, visit tld4 .

Texture Instructions: tld4

\n\n\n

Perform a texture fetch of the 4-texel bilerp footprint.

\n

Syntax

\n
tld4.comp.2d.v4.dtype.f32    d[|p], [a, c] {, e} {, f};\ntld4.comp.geom.v4.dtype.f32  d[|p], [a, b, c] {, e} {, f};  // explicit sampler\n\n.comp  = { .r, .g, .b, .a };\n.geom  = { .2d, .a2d, .cube, .acube };\n.dtype = { .u32, .s32, .f32 };\n
\n
\n

Description

\n

Texture fetch of the 4-texel bilerp footprint using a texture coordinate vector. The instruction\nloads the bilerp footprint from the texture named by operand a at coordinates given by operand\nc into vector destination d. The texture component fetched for each texel sample is\nspecified by .comp. The four texel samples are placed into destination vector d in\ncounter-clockwise order starting at lower left.

\n

An optional texture sampler b may be specified. If no sampler is specified, the sampler behavior\nis a property of the named texture.

\n

The optional destination predicate p is set to True if data from texture at specified\ncoordinates is resident in memory, False otherwise. When optional destination predicate p is\nset to False, data loaded will be all zeros. Memory residency of Texture Data at specified\ncoordinates is dependent on execution environment setup using Driver API calls, prior to kernel\nlaunch. Refer to Driver API documentation for more details including any system/implementation\nspecific behavior.

\n

An optional operand f may be specified for depth textures. Depth textures are a special type of\ntexture which holds data from the depth buffer. The depth buffer contains depth information of each\npixel. Operand f is .f32 scalar value that specifies depth compare value for depth\ntextures. Each element fetched from texture is compared against value given in f operand. If\ncomparison passes, result is 1.0; otherwise result is 0.0. These per-element comparison results are\nused for the filtering.

\n

A texture base address is assumed to be aligned to a 16 byte boundary, and the address given by the\ncoordinate vector must be naturally aligned to a multiple of the access size. If an address is not\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.

\n

tld4.2d

\n

For 2D textures, operand c specifies coordinates as a two-element, 32-bit floating-point vector.

\n

An optional operand e may be specified. Operand e is a vector of type .v2.s32 that\nspecifies coordinate offset. Offset is applied to coordinates before doing texture fetch. Offset\nvalue is in the range of -8 to +7.

\n

tld4.a2d

\n

Texture array selection, followed by tld4 texture fetch of 2d texture. For 2d texture arrays\noperand c is a four element, 32-bit vector. The first element in operand c is interpreted as an\nunsigned integer index (.u32) into the texture array, and the next two elements are interpreted\nas 32-bit floating point coordinates of 2d texture. The fourth element is ignored.

\n

An optional operand e may be specified. Operand e is a vector of type .v2.s32 that\nspecifies coordinate offset. Offset is applied to coordinates before doing texture fetch. Offset\nvalue is in the range of -8 to +7.

\n

tld4.cube

\n

For cubemap textures, operand c specifies four-element vector which comprises three\nfloating-point coordinates (s, t, r) and a fourth padding argument which is ignored.

\n

Cubemap textures are special two-dimensional layered textures consisting of six layers that\nrepresent the faces of a cube. All layers in a cubemap are of the same size and are square (i.e.,\nwidth equals height).

\n

Coordinates (s, t, r) are projected onto one of the six cube faces. The (s, t, r) coordinates can be\nthought of as a direction vector emanating from the center of the cube. Of the three coordinates (s,\nt, r), the coordinate of the largest magnitude (the major axis) selects the cube face. Then, the\nother two coordinates (the minor axes) are divided by the absolute value of the major axis to\nproduce a new (s, t) coordinate pair to lookup into the selected cube face.

\n

Offset vector operand e is not supported for cubemap textures.

\n

tld4.acube

\n

Cubemap array selection, followed by tld4 texture fetch of cubemap texture. The first element in\noperand c is interpreted as an unsigned integer index (.u32) into the cubemap texture array,\nand the remaining three elements are interpreted as floating-point cubemap coordinates (s, t, r),\nused to lookup in the selected cubemap.

\n

Offset vector operand e is not supported for cubemap texture arrays.

\n

Indirect texture access

\n

Beginning with PTX ISA version 3.1, indirect texture access is supported in unified mode for target\narchitecture sm_20 or higher. In indirect access, operand a is a .u64 register holding\nthe address of a .texref variable.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.2.

\n

Indirect texture access introduced in PTX ISA version 3.1.

\n

tld4.{a2d,cube,acube} introduced in PTX ISA version 4.3.

\n

Offset vector operand introduced in PTX ISA version 4.3.

\n

Depth compare operand introduced in PTX ISA version 4.3.

\n

Support for optional destination predicate introduced in PTX ISA version 7.1.

\n

Target ISA Notes

\n

tld4 requires sm_20 or higher.

\n

Indirect texture access requires sm_20 or higher.

\n

tld4.{a2d,cube,acube} requires sm_30 or higher.

\n

Offset vector operand requires sm_30 or higher.

\n

Depth compare operand requires sm_30 or higher.

\n

Support for optional destination predicate requires sm_60 or higher.

\n

Examples

\n
//Example of unified mode texturing\ntld4.r.2d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a,{f1,f2}];\n\n// Example of independent mode texturing\ntld4.r.2d.v4.u32.f32  {u1,u2,u3,u4}, [tex_a,smpl_x,{f1,f2}];\n\n// Example of unified mode texturing using offset\ntld4.r.2d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a,{f1,f2}], {r5, r6};\n\n// Example of unified mode texturing using compare\ntld4.r.2d.v4.f32.f32  {f1,f2,f3,f4}, [tex_a,{f5,f6}], f7;\n\n// Example of optional destination predicate\ntld4.r.2d.v4.f32.f32 {f1,f2,f3,f4}|p, [tex_a,{f5,f6}], f7;\n
\n
\n
", "tooltip": "Perform a texture fetch of the 4-texel bilerp footprint.\n\nSyntax\n\ntld4.comp.2d.v4.dtype.f32 d[|p], [a, c] {, e} {, f};\n\ntld4.comp.geom.v4.dtype.f32 d[|p], [a, b, c] {, e} {, f}; // explicit sampler\n\n.comp = { .r, .g, .b, .a };\n\n.geom = { .2d, .a2d, .cube, .acube };\n\n.dtype = { .u32, .s32, .f32 };\n\nDescription\n\nTexture fetch of the 4-texel bilerp footprint using a texture coordinate vector. The instruction\n\nloads the bilerp footprint from the texture named by operand a at coordinates given by operand\n\nc into vector destination d. The texture component fetched for each texel sample is\n\nspecified by .comp. The four texel samples are placed into destination vector d in\n\ncounter-clockwise order starting at lower left.\n\nAn optional texture sampler b may be specified. If no sampler is specified, the sampler behavior\n\nis a property of the named texture.\n\nThe optional destination predicate p is set to True if data from texture at specified\n\ncoordinates is resident in memory, False otherwise. When optional destination predicate p is\n\nset to False, data loaded will be all zeros. Memory residency of Texture Data at specified\n\ncoordinates is dependent on execution environment setup using Driver API calls, prior to kernel\n\nlaunch. Refer to Driver API documentation for more details including any system/implementation\n\nspecific behavior.\n\nAn optional operand f may be specified for depth textures. Depth textures are special type of\n\ntextures which hold data from the depth buffer. Depth buffer contains depth information of each\n\npixel. Operand f is .f32 scalar value that specifies depth compare value for depth\n\ntextures. Each element fetched from texture is compared against value given in f operand. If\n\ncomparison passes, result is 1.0; otherwise result is 0.0. These per-element comparison results are\n\nused for the filtering.\n\nA texture base address is assumed to be aligned to a 16 byte boundary, and the address given by the\n\ncoordinate vector must be naturally aligned to a multiple of the access size. If an address is not\n\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\n\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.\n\ntld4.2d\n\nFor 2D textures, operand c specifies coordinates as a two-element, 32-bit floating-point vector.\n\nAn optional operand e may be specified. Operand e is a vector of type .v2.s32 that\n\nspecifies coordinate offset. Offset is applied to coordinates before doing texture fetch. Offset\n\nvalue is in the range of -8 to +7.\n\ntld4.a2d\n\nTexture array selection, followed by tld4 texture fetch of 2d texture. For 2d texture arrays\n\noperand c is a four element, 32-bit vector. The first element in operand c is interpreted as an\n\nunsigned integer index (.u32) into the texture array, and the next two elements are interpreted\n\nas 32-bit floating point coordinates of 2d texture. The fourth element is ignored.\n\nAn optional operand e may be specified. Operand e is a vector of type .v2.s32 that\n\nspecifies coordinate offset. Offset is applied to coordinates before doing texture fetch. 
Offset\n\nvalue is in the range of -8 to +7.\n\ntld4.cube\n\nFor cubemap textures, operand c specifies four-element vector which comprises three\n\nfloating-point coordinates (s, t, r) and a fourth padding argument which is ignored.\n\nCubemap textures are special two-dimensional layered textures consisting of six layers that\n\nrepresent the faces of a cube. All layers in a cubemap are of the same size and are square (i.e.,\n\nwidth equals height).\n\nCoordinates (s, t, r) are projected onto one of the six cube faces. The (s, t, r) coordinates can be\n\nthought of as a direction vector emanating from the center of the cube. Of the three coordinates (s,\n\nt, r), the coordinate of the largest magnitude (the major axis) selects the cube face. Then, the\n\nother two coordinates (the minor axes) are divided by the absolute value of the major axis to\n\nproduce a new (s, t) coordinate pair to lookup into the selected cube face.\n\nOffset vector opera ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tld4" }; case "total_smem_size": return { "html": "For more information, visit total_smem_size .

Special Registers: %total_smem_size

\n\n\n

Total size of shared memory used by a CTA of a kernel.

\n

Syntax (predefined)

\n
.sreg .u32 %total_smem_size;\n
\n
\n

Description

\n

A predefined, read-only special register initialized with the total size of shared memory allocated\n(statically and dynamically, excluding shared memory reserved for NVIDIA system software use)\nfor the CTA of a kernel at launch time.

\n

Size is returned in multiples of shared memory allocation unit size supported by target\narchitecture.

\n

Allocation unit values are as follows:

Target architecture            Shared memory allocation unit size
sm_2x                          128 bytes
sm_3x, sm_5x, sm_6x, sm_7x     256 bytes
sm_8x, sm_9x                   128 bytes

PTX ISA Notes

\n

Introduced in PTX ISA version 4.1.

\n

Target ISA Notes

\n

Requires sm_20 or higher.

\n

Examples

\n
mov.u32  %r, %total_smem_size;\n
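// Hedged sketch: branch if the CTA received less than 4 KB of shared memory\n// (%r1, %p1 and SMALL_SMEM are illustrative names).\nmov.u32      %r1, %total_smem_size;\nsetp.lt.u32  %p1, %r1, 4096;\n@%p1 bra     SMALL_SMEM;\n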
\n
\n
", "tooltip": "Total size of shared memory used by a CTA of a kernel.\n\nSyntax (predefined)\n\n.sreg .u32 %total_smem_size;\n\nDescription\n\nA predefined, read-only special register initialized with total size of shared memory allocated\n\n(statically and dynamically, excluding the shared memory reserved for the NVIDIA system software\n\nuse) for the CTA of a kernel at launch time.\n\nSize is returned in multiples of shared memory allocation unit size supported by target\n\narchitecture.\n\nAllocation unit values are as follows:\n\n\n\nTarget architecture\n\nShared memory allocation unit size\n\nsm_2x\n\n128 bytes\n\nsm_3x, sm_5x, sm_6x, sm_7x\n\n256 bytes\n\nsm_8x, sm_9x\n\n128 bytes\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 4.1.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\nExamples\n\nmov.u32 %r, %total_smem_size;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-total-smem-size" }; case "trap": return { "html": "For more information, visit trap .

Miscellaneous Instructions: trap

\n\n\n

Perform trap operation.

\n

Syntax

\n
trap;\n
\n
\n

Description

\n

Abort execution and generate an interrupt to the host CPU.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
    trap;\n@p  trap;\n
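// Hedged sketch: guard the trap with a bounds check (p, ridx, rlimit are illustrative).\nsetp.ge.u32  p, ridx, rlimit;\n@p  trap;\n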
\n
\n
", "tooltip": "Perform trap operation.\n\nSyntax\n\ntrap;\n\nDescription\n\nAbort execution and generate an interrupt to the host CPU.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n trap;\n\n@p trap;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap" }; case "txq": return { "html": "For more information, visit txq .

Texture Instructions: txq

\n\n\n

Query texture and sampler attributes.

\n

Syntax

\n
txq.tquery.b32         d, [a];       // texture attributes\ntxq.level.tlquery.b32  d, [a], lod;  // texture attributes\ntxq.squery.b32         d, [a];       // sampler attributes\n\n.tquery  = { .width, .height, .depth,\n             .channel_data_type, .channel_order,\n             .normalized_coords, .array_size,\n             .num_mipmap_levels, .num_samples };\n\n.tlquery = { .width, .height, .depth };\n\n.squery  = { .force_unnormalized_coords, .filter_mode,\n             .addr_mode_0, .addr_mode_1, .addr_mode_2 };\n
\n
\n

Description

\n

Query an attribute of a texture or sampler. Operand a is either a .texref or .samplerref variable, or a .u64 register.

Query                        Returns
.width, .height, .depth      Value in elements.
.channel_data_type           Unsigned integer corresponding to the source language\u2019s channel data type enumeration. If the source language combines channel data type and channel order into a single enumeration type, that value is returned for both channel_data_type and channel_order queries.
.channel_order               Unsigned integer corresponding to the source language\u2019s channel order enumeration. If the source language combines channel data type and channel order into a single enumeration type, that value is returned for both channel_data_type and channel_order queries.
.normalized_coords           1 (True) or 0 (False).
.force_unnormalized_coords   1 (True) or 0 (False). Defined only for .samplerref variables in independent texture mode. Overrides the normalized_coords field of a .texref variable used with a .samplerref in a tex instruction.
.filter_mode                 Integer from enum { nearest, linear }.
.addr_mode_0, .addr_mode_1, .addr_mode_2
                             Integer from enum { wrap, mirror, clamp_ogl, clamp_to_edge, clamp_to_border }.
.array_size                  For a texture array, number of textures in array, 0 otherwise.
.num_mipmap_levels           For a mipmapped texture, number of levels of detail (LOD), 0 otherwise.
.num_samples                 For a multi-sample texture, number of samples, 0 otherwise.

Texture attributes are queried by supplying a .texref argument to txq. In unified mode,\nsampler attributes are also accessed via a .texref argument, and in independent mode sampler\nattributes are accessed via a separate .samplerref argument.

\n

txq.level

\n

txq.level requires an additional 32-bit integer argument, lod, which specifies the LOD and\nqueries the requested attribute for that LOD.

\n

Indirect texture access

\n

Beginning with PTX ISA version 3.1, indirect texture access is supported in unified mode for target\narchitecture sm_20 or higher. In indirect access, operand a is a .u64 register holding\nthe address of a .texref variable.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.5.

\n

Channel data type and channel order queries were added in PTX ISA version 2.1.

\n

The .force_unnormalized_coords query was added in PTX ISA version 2.2.

\n

Indirect texture access introduced in PTX ISA version 3.1.

\n

.array_size, .num_mipmap_levels, and .num_samples queries were added in PTX ISA\nversion 4.1.

\n

txq.level introduced in PTX ISA version 4.3.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Indirect texture access requires sm_20 or higher.

\n

Querying the number of mipmap levels requires sm_20 or higher.

\n

Querying the number of samples requires sm_30 or higher.

\n

txq.level requires sm_30 or higher.

\n

Examples

\n
txq.width.b32       %r1, [tex_A];\ntxq.filter_mode.b32 %r1, [tex_A];   // unified mode\ntxq.addr_mode_0.b32 %r1, [smpl_B];  // independent mode\ntxq.level.width.b32 %r1, [tex_A], %r_lod;\n
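// Hedged sketch: clamp an integer x coordinate to [0, width-1] using the queried width\n// (%r_x is illustrative).\ntxq.width.b32  %r1, [tex_A];\nsub.u32        %r1, %r1, 1;\nmin.u32        %r_x, %r_x, %r1;\n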
\n
\n
", "tooltip": "Query texture and sampler attributes.\n\nSyntax\n\ntxq.tquery.b32 d, [a]; // texture attributes\n\ntxq.level.tlquery.b32 d, [a], lod; // texture attributes\n\ntxq.squery.b32 d, [a]; // sampler attributes\n\n.tquery = { .width, .height, .depth,\n\n .channel_data_type, .channel_order,\n\n .normalized_coords, .array_size,\n\n .num_mipmap_levels, .num_samples};\n\n.tlquery = { .width, .height, .depth };\n\n.squery = { .force_unnormalized_coords, .filter_mode,\n\n .addr_mode_0, addr_mode_1, addr_mode_2 };\n\nDescription\n\nQuery an attribute of a texture or sampler. Operand a is either a .texref or .samplerref variable, or a .u64 register.\n\n\n\nQuery\n\nReturns\n\n.width\n\n.height\n\n.depth\n\nvalue in elements\n\n.channel_data_type\n\nUnsigned integer corresponding to source language\u2019s channel data type enumeration. If the source language combines channel data type and channel order into a single enumeration type, that value is returned for both channel_data_type and channel_order queries.\n\n.channel_order\n\nUnsigned integer corresponding to source language\u2019s channel order enumeration. If the source language combines channel data type and channel order into a single enumeration type, that value is returned for both channel_data_type and channel_order queries.\n\n.normalized_coords\n\n1 (True) or 0 (False).\n\n.force_unnormalized_coords\n\n1 (True) or 0 (False). Defined only for .samplerref variables in independent texture mode. Overrides the normalized_coords field of a .texref variable used with a .samplerref in a tex instruction.\n\n.filter_mode\n\nInteger from enum { nearest, linear }\n\n.addr_mode_0\n\n.addr_mode_1\n\n.addr_mode_2\n\nInteger from enum { wrap, mirror, clamp_ogl, clamp_to_edge, clamp_to_border }\n\n.array_size\n\nFor a texture array, number of textures in array, 0 otherwise.\n\n.num_mipmap_levels\n\nFor a mipmapped texture, number of levels of details (LOD), 0 otherwise.\n\n.num_samples\n\nFor a multi-sample texture, number of samples, 0 otherwise.\n\nTexture attributes are queried by supplying a .texref argument to txq. In unified mode,\n\nsampler attributes are also accessed via a .texref argument, and in independent mode sampler\n\nattributes are accessed via a separate .samplerref argument.\n\ntxq.level\n\ntxq.level requires an additional 32bit integer argument, lod, which specifies LOD and\n\nqueries requested attribute for the specified LOD.\n\nIndirect texture access\n\nBeginning with PTX ISA version 3.1, indirect texture access is supported in unified mode for target\n\narchitecture sm_20 or higher. 
In indirect access, operand a is a .u64 register holding\n\nthe address of a .texref variable.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.5.\n\nChannel data type and channel order queries were added in PTX ISA version 2.1.\n\nThe .force_unnormalized_coords query was added in PTX ISA version 2.2.\n\nIndirect texture access introduced in PTX ISA version 3.1.\n\n.array_size, .num_mipmap_levels, .num_samples samples queries were added in PTX ISA\n\nversion 4.1.\n\ntxq.level introduced in PTX ISA version 4.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nIndirect texture access requires sm_20 or higher.\n\nQuerying the number of mipmap levels requires sm_20 or higher.\n\nQuerying the number of samples requires sm_30 or higher.\n\ntxq.level requires sm_30 or higher.\n\nExamples\n\ntxq.width.b32 %r1, [tex_A];\n\ntxq.filter_mode.b32 %r1, [tex_A]; // unified mode\n\ntxq.addr_mode_0.b32 %r1, [smpl_B]; // independent mode\n\ntxq.level.width.b32 %r1, [tex_A], %r_lod;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-txq" }; case "vabsdiff": return { "html": "For more information, visit vabsdiff .

Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax

\n\n\n

Integer byte/half-word/word addition/subtraction.

\n

vabsdiff

\n

Integer byte/half-word/word absolute value of difference.

\n

vmin, vmax

\n

Integer byte/half-word/word minimum/maximum.

\n

Syntax

\n
// 32-bit scalar operation, with optional secondary operation\nvop.dtype.atype.btype{.sat}       d, a{.asel}, b{.bsel};\nvop.dtype.atype.btype{.sat}.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\nvop.dtype.atype.btype{.sat}  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vadd, vsub, vabsdiff, vmin, vmax };\n.dtype = .atype = .btype = { .u32, .s32 };\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.op2   = { .add, .min, .max };\n
\n
\n

Description

\n

Perform a scalar arithmetic operation with optional saturation, and an optional secondary arithmetic operation or subword data merge.

\n

Semantics

\n
// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = partSelectSignExtend( a, atype, asel );\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n    case vadd:     tmp = ta + tb;\n    case vsub:     tmp = ta - tb;\n    case vabsdiff: tmp = | ta - tb |;\n    case vmin:     tmp = MIN( ta, tb );\n    case vmax:     tmp = MAX( ta, tb );\n}\n// saturate, taking into account destination type and merge operations\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

vadd, vsub, vabsdiff, vmin, vmax require sm_20 or higher.

\n

Examples

\n
vadd.s32.u32.s32.sat      r1, r2.b0, r3.h0;\nvsub.s32.s32.u32.sat      r1, r2.h1, r3.h1;\nvabsdiff.s32.s32.s32.sat  r1.h0, r2.b0, r3.b2, c;\nvmin.s32.s32.s32.sat.add  r1, r2, r3, c;\n
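// Hedged SAD sketch: accumulate |a - b| over two byte lanes via the secondary .add operation\n// (illustrative registers; without .sat the running sum wraps on overflow).\nvabsdiff.u32.u32.u32.add  r1, r2.b0, r3.b0, r1;\nvabsdiff.u32.u32.u32.add  r1, r2.b1, r3.b1, r1;\n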
\n
\n
", "tooltip": "Integer byte/half-word/word addition/subtraction.\n\nvabsdiff\n\nInteger byte/half-word/word absolute value of difference.\n\nvmin, vmax\n\nInteger byte/half-word/word minimum/maximum.\n\nSyntax\n\n// 32-bit scalar operation, with optional secondary operation\n\nvop.dtype.atype.btype{.sat} d, a{.asel}, b{.bsel};\n\nvop.dtype.atype.btype{.sat}.op2 d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\n\nvop.dtype.atype.btype{.sat} d.dsel, a{.asel}, b{.bsel}, c;\n\n vop = { vadd, vsub, vabsdiff, vmin, vmax };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.dsel = .asel = .bsel = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.op2 = { .add, .min, .max };\n\nDescription\n\nPerform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a, atype, asel );\n\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n\n case vadd: tmp = ta + tb;\n\n case vsub: tmp = ta - tb;\n\n case vabsdiff: tmp = | ta - tb |;\n\n case vmin: tmp = MIN( ta, tb );\n\n case vmax: tmp = MAX( ta, tb );\n\n}\n\n// saturate, taking into account destination type and merge operations\n\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\n\nd = optSecondaryOp( op2, tmp, c ); // optional secondary operation\n\nd = optMerge( dsel, tmp, c ); // optional merge with c operand\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvadd, vsub, vabsdiff, vmin, vmax require sm_20 or higher.\n\nExamples\n\nvadd.s32.u32.s32.sat r1, r2.b0, r3.h0;\n\nvsub.s32.s32.u32.sat r1, r2.h1, r3.h1;\n\nvabsdiff.s32.s32.s32.sat r1.h0, r2.b0, r3.b2, c;\n\nvmin.s32.s32.s32.sat.add r1, r2, r3, c;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax" }; case "vabsdiff2": return { "html": "For more information, visit vabsdiff2 .

SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2

\n\n\n

Integer dual half-word SIMD addition/subtraction.

\n

vavrg2

\n

Integer dual half-word SIMD average.

\n

vabsdiff2

\n

Integer dual half-word SIMD absolute value of difference.

\n

vmin2, vmax2

\n

Integer dual half-word SIMD minimum/maximum.

\n

Syntax

\n
// SIMD instruction with secondary SIMD merge operation\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n   .asel defaults to .h10\n   .bsel defaults to .h32\n
\n
\n

Description

\n

Two-way SIMD parallel arithmetic operation with secondary operation.

\n

Elements of each dual half-word source to the operation are selected from any of the four half-words\nin the two source operands a and b using the asel and bsel modifiers.

\n

The selected half-words are then operated on in parallel.

\n

The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.

\n

For instructions with a secondary SIMD merge operation:

\n
    \n
  • For half-word positions indicated in mask, the selected half-word results are copied into\ndestination d. For all other positions, the corresponding half-word from source operand c\nis copied to d.

  • \n
\n

For instructions with a secondary accumulate operation:

\n
    \n
  • For half-word positions indicated in mask, the selected half-word results are added to operand\nc, producing a result in d.

  • \n
\n

Semantics

\n
// extract pairs of half-words and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_2( a, b, .asel, .atype );\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n    switch ( vop2 ) {\n       case vadd2:             t[i] = Va[i] + Vb[i];\n       case vsub2:             t[i] = Va[i] - Vb[i];\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) >= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n                               }\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i<2; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i<2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.0.

\n

Target ISA Notes

\n

vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.

\n

Examples

\n
vadd2.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n
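// Hedged sketch: dual half-word SAD; with the default .h10 mask both lane differences\n// are added into r1 (illustrative registers; .sat cannot be combined with .add).\nvabsdiff2.u32.u32.u32.add  r1, r2, r3, r1;\n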
\n
\n
", "tooltip": "Integer dual half-word SIMD addition/subtraction.\n\nvavrg2\n\nInteger dual half-word SIMD average.\n\nvabsdiff2\n\nInteger dual half-word SIMD absolute value of difference.\n\nvmin2, vmax2\n\nInteger dual half-word SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop2.dtype.atype.btype{.sat} d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop2.dtype.atype.btype.add d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2 = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask = { .h0, .h1, .h10 }; // defaults to .h10\n\n.asel = .bsel = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n\n .asel defaults to .h10\n\n .bsel defaults to .h32\n\nDescription\n\nTwo-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each dual half-word source to the operation are selected from any of the four half-words\n\nin the two source operands a and b using the asel and bsel modifiers.\n\nThe selected half-words are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor half-word positions indicated in mask, the selected half-word results are copied into\n\ndestination d. For all other positions, the corresponding half-word from source operand c\n\nis copied to d.\n\nFor instructions with a secondary accumulate operation:\n\nFor half-word positions indicated in mask, the selected half-word results are added to operand\n\nc, producing a result in d.\n\nSemantics\n\n// extract pairs of half-words and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_2( a, b, .asel, .atype );\n\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n\n switch ( vop2 ) {\n\n case vadd2: t[i] = Va[i] + Vb[i];\n\n case vsub2: t[i] = Va[i] - Vb[i];\n\n case vavrg2: if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n } else {\n\n t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n }\n\n case vabsdiff2: t[i] = | Va[i] - Vb[i] |;\n\n case vmin2: t[i] = MIN( Va[i], Vb[i] );\n\n case vmax2: t[i] = MAX( Va[i], Vb[i] );\n\n }\n\n if (.sat) {\n\n if ( .dtype == .s32 ) t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n\n else t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n\n }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n d = c;\n\n for (i=0; i<2; i++) { d += mask[i] ? t[i] : 0; }\n\n} else {\n\n d = 0;\n\n for (i=0; i<2; i++) { d |= mask[i] ? t[i] : Vc[i]; }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd2, vsub2, varvg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.\n\nExamples\n\nvadd2.s32.s32.u32.sat r1, r2, r3, r1;\n\nvsub2.s32.s32.s32.sat r1.h0, r2.h10, r3.h32, r1;\n\nvmin2.s32.u32.u32.add r1.h10, r2.h00, r3.h22, r1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2" }; case "vabsdiff4": return { "html": "For more information, visit vabsdiff4 .

SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4

\n\n\n

Integer quad byte SIMD addition/subtraction.

\n

vavrg4

\n

Integer quad byte SIMD average.

\n

vabsdiff4

\n

Integer quad byte SIMD absolute value of difference.

\n

vmin4, vmax4

\n

Integer quad byte SIMD minimum/maximum.

\n

Syntax

\n
// SIMD instruction with secondary SIMD merge operation\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\nvop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .b0,\n           .b1, .b10,\n           .b2, .b20, .b21, .b210,\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n    defaults to .b3210\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n   .asel defaults to .b3210\n   .bsel defaults to .b7654\n
\n
\n

Description

\n

Four-way SIMD parallel arithmetic operation with secondary operation.

\n

Elements of each quad byte source to the operation are selected from any of the eight bytes in the\ntwo source operands a and b using the asel and bsel modifiers.

\n

The selected bytes are then operated on in parallel.

\n

The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.

\n

For instructions with a secondary SIMD merge operation:

\n
    \n
  • For byte positions indicated in mask, the selected byte results are copied into destination\nd. For all other positions, the corresponding byte from source operand c is copied to\nd.

  • \n
\n

For instructions with a secondary accumulate operation:

\n
    \n
  • For byte positions indicated in mask, the selected byte results are added to operand c,\nproducing a result in d.

  • \n
\n

Semantics

\n
// extract quads of bytes and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_4( a, b, .asel, .atype );\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\nVc = extractAndSignExt_4( c );\nfor (i=0; i<4; i++) {\n    switch ( vop4 ) {\n        case vadd4:            t[i] = Va[i] + Vb[i];\n        case vsub4:            t[i] = Va[i] - Vb[i];\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) >= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n                               }\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i<4; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i<4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.0.

\n

Target ISA Notes

\n

vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.

\n

Examples

\n
vadd4.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n
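// Hedged sketch: quad-byte SAD accumulate, a common block-matching building block\n// (illustrative registers; all four byte lanes are summed into r1).\nvabsdiff4.u32.u32.u32.add  r1, r2, r3, r1;\n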
\n
\n
", "tooltip": "Integer quad byte SIMD addition/subtraction.\n\nvavrg4\n\nInteger quad byte SIMD average.\n\nvabsdiff4\n\nInteger quad byte SIMD absolute value of difference.\n\nvmin4, vmax4\n\nInteger quad byte SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop4.dtype.atype.btype{.sat} d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop4.dtype.atype.btype.add d{.mask}, a{.asel}, b{.bsel}, c;\n\nvop4 = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask = { .b0,\n\n .b1, .b10\n\n .b2, .b20, .b21, .b210,\n\n .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n\n defaults to .b3210\n\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n\n .asel defaults to .b3210\n\n .bsel defaults to .b7654\n\nDescription\n\nFour-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each quad byte source to the operation are selected from any of the eight bytes in the\n\ntwo source operands a and b using the asel and bsel modifiers.\n\nThe selected bytes are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor byte positions indicated in mask, the selected byte results are copied into destination\n\nd. For all other positions, the corresponding byte from source operand c is copied to\n\nd.\n\nFor instructions with a secondary accumulate operation:\n\nFor byte positions indicated in mask, the selected byte results are added to operand c,\n\nproducing a result in d.\n\nSemantics\n\n// extract quads of bytes and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_4( a, b, .asel, .atype );\n\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_4( c );\n\nfor (i=0; i<4; i++) {\n\n switch ( vop4 ) {\n\n case vadd4: t[i] = Va[i] + Vb[i];\n\n case vsub4: t[i] = Va[i] - Vb[i];\n\n case vavrg4: if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n } else {\n\n t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n }\n\n case vabsdiff4: t[i] = | Va[i] - Vb[i] |;\n\n case vmin4: t[i] = MIN( Va[i], Vb[i] );\n\n case vmax4: t[i] = MAX( Va[i], Vb[i] );\n\n }\n\n if (.sat) {\n\n if ( .dtype == .s32 ) t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n\n else t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n\n }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n d = c;\n\n for (i=0; i<4; i++) { d += mask[i] ? t[i] : 0; }\n\n} else {\n\n d = 0;\n\n for (i=0; i<4; i++) { d |= mask[i] ? t[i] : Vc[i]; }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd4, vsub4, varvg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.\n\nExamples\n\nvadd4.s32.s32.u32.sat r1, r2, r3, r1;\n\nvsub4.s32.s32.s32.sat r1.b0, r2.b3210, r3.b7654, r1;\n\nvmin4.s32.u32.u32.add r1.b00, r2.b0000, r3.b2222, r1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4" }; case "vadd": return { "html": "For more information, visit vadd .

Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax

\n\n\n

Integer byte/half-word/word addition/subtraction.

\n

vabsdiff

\n

Integer byte/half-word/word absolute value of difference.

\n

vmin, vmax

\n

Integer byte/half-word/word minimum/maximum.

\n

Syntax

\n
// 32-bit scalar operation, with optional secondary operation\nvop.dtype.atype.btype{.sat}       d, a{.asel}, b{.bsel};\nvop.dtype.atype.btype{.sat}.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\nvop.dtype.atype.btype{.sat}  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vadd, vsub, vabsdiff, vmin, vmax };\n.dtype = .atype = .btype = { .u32, .s32 };\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.op2   = { .add, .min, .max };\n
\n
\n

Description

\n

Perform a scalar arithmetic operation with optional saturation, and an optional secondary arithmetic operation or subword data merge.

\n

Semantics

\n
// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = partSelectSignExtend( a, atype, asel );\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n    case vadd:     tmp = ta + tb;\n    case vsub:     tmp = ta - tb;\n    case vabsdiff: tmp = | ta - tb |;\n    case vmin:     tmp = MIN( ta, tb );\n    case vmax:     tmp = MAX( ta, tb );\n}\n// saturate, taking into account destination type and merge operations\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

vadd, vsub, vabsdiff, vmin, vmax require sm_20 or higher.

\n

Examples

\n
vadd.s32.u32.s32.sat      r1, r2.b0, r3.h0;\nvsub.s32.s32.u32.sat      r1, r2.h1, r3.h1;\nvabsdiff.s32.s32.s32.sat  r1.h0, r2.b0, r3.b2, c;\nvmin.s32.s32.s32.sat.add  r1, r2, r3, c;\n
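// Hedged sketch of the merge form: saturating signed add of the low half-words, written to\n// the low half of r1; the high half of r1 comes from operand c (illustrative registers).\nvadd.s32.s32.s32.sat  r1.h0, r2.h0, r3.h0, c;\n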
\n
\n
", "tooltip": "Integer byte/half-word/word addition/subtraction.\n\nvabsdiff\n\nInteger byte/half-word/word absolute value of difference.\n\nvmin, vmax\n\nInteger byte/half-word/word minimum/maximum.\n\nSyntax\n\n// 32-bit scalar operation, with optional secondary operation\n\nvop.dtype.atype.btype{.sat} d, a{.asel}, b{.bsel};\n\nvop.dtype.atype.btype{.sat}.op2 d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\n\nvop.dtype.atype.btype{.sat} d.dsel, a{.asel}, b{.bsel}, c;\n\n vop = { vadd, vsub, vabsdiff, vmin, vmax };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.dsel = .asel = .bsel = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.op2 = { .add, .min, .max };\n\nDescription\n\nPerform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a, atype, asel );\n\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n\n case vadd: tmp = ta + tb;\n\n case vsub: tmp = ta - tb;\n\n case vabsdiff: tmp = | ta - tb |;\n\n case vmin: tmp = MIN( ta, tb );\n\n case vmax: tmp = MAX( ta, tb );\n\n}\n\n// saturate, taking into account destination type and merge operations\n\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\n\nd = optSecondaryOp( op2, tmp, c ); // optional secondary operation\n\nd = optMerge( dsel, tmp, c ); // optional merge with c operand\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvadd, vsub, vabsdiff, vmin, vmax require sm_20 or higher.\n\nExamples\n\nvadd.s32.u32.s32.sat r1, r2.b0, r3.h0;\n\nvsub.s32.s32.u32.sat r1, r2.h1, r3.h1;\n\nvabsdiff.s32.s32.s32.sat r1.h0, r2.b0, r3.b2, c;\n\nvmin.s32.s32.s32.sat.add r1, r2, r3, c;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax" }; case "vadd2": return { "html": "For more information, visit vadd2 .

SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2

\n\n\n

Integer dual half-word SIMD addition/subtraction.

\n

vavrg2

\n

Integer dual half-word SIMD average.

\n

vabsdiff2

\n

Integer dual half-word SIMD absolute value of difference.

\n

vmin2, vmax2

\n

Integer dual half-word SIMD minimum/maximum.

\n

Syntax

\n
// SIMD instruction with secondary SIMD merge operation\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n   .asel defaults to .h10\n   .bsel defaults to .h32\n
\n
\n

Description

\n

Two-way SIMD parallel arithmetic operation with secondary operation.

\n

Elements of each dual half-word source to the operation are selected from any of the four half-words\nin the two source operands a and b using the asel and bsel modifiers.

\n

The selected half-words are then operated on in parallel.

\n

The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.

\n

For instructions with a secondary SIMD merge operation:

\n
    \n
  • For half-word positions indicated in mask, the selected half-word results are copied into\ndestination d. For all other positions, the corresponding half-word from source operand c\nis copied to d.

  • \n
\n

For instructions with a secondary accumulate operation:

\n
    \n
  • For half-word positions indicated in mask, the selected half-word results are added to operand\nc, producing a result in d.

  • \n
\n

Semantics

\n
// extract pairs of half-words and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_2( a, b, .asel, .atype );\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n    switch ( vop2 ) {\n       case vadd2:             t[i] = Va[i] + Vb[i];\n       case vsub2:             t[i] = Va[i] - Vb[i];\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) >= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n                               }\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i<2; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i<2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.0.

\n

Target ISA Notes

\n

vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.

\n

Examples

\n
vadd2.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n
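// Hedged sketch: lane-wise half-word add with both lane results accumulated into r1 via the\n// secondary .add operation (illustrative registers; .sat is not allowed with .add).\nvadd2.u32.u32.u32.add  r1, r2, r3, r1;\n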
\n
\n
", "tooltip": "Integer dual half-word SIMD addition/subtraction.\n\nvavrg2\n\nInteger dual half-word SIMD average.\n\nvabsdiff2\n\nInteger dual half-word SIMD absolute value of difference.\n\nvmin2, vmax2\n\nInteger dual half-word SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop2.dtype.atype.btype{.sat} d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop2.dtype.atype.btype.add d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2 = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask = { .h0, .h1, .h10 }; // defaults to .h10\n\n.asel = .bsel = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n\n .asel defaults to .h10\n\n .bsel defaults to .h32\n\nDescription\n\nTwo-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each dual half-word source to the operation are selected from any of the four half-words\n\nin the two source operands a and b using the asel and bsel modifiers.\n\nThe selected half-words are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor half-word positions indicated in mask, the selected half-word results are copied into\n\ndestination d. For all other positions, the corresponding half-word from source operand c\n\nis copied to d.\n\nFor instructions with a secondary accumulate operation:\n\nFor half-word positions indicated in mask, the selected half-word results are added to operand\n\nc, producing a result in d.\n\nSemantics\n\n// extract pairs of half-words and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_2( a, b, .asel, .atype );\n\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n\n switch ( vop2 ) {\n\n case vadd2: t[i] = Va[i] + Vb[i];\n\n case vsub2: t[i] = Va[i] - Vb[i];\n\n case vavrg2: if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n } else {\n\n t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n }\n\n case vabsdiff2: t[i] = | Va[i] - Vb[i] |;\n\n case vmin2: t[i] = MIN( Va[i], Vb[i] );\n\n case vmax2: t[i] = MAX( Va[i], Vb[i] );\n\n }\n\n if (.sat) {\n\n if ( .dtype == .s32 ) t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n\n else t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n\n }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n d = c;\n\n for (i=0; i<2; i++) { d += mask[i] ? t[i] : 0; }\n\n} else {\n\n d = 0;\n\n for (i=0; i<2; i++) { d |= mask[i] ? t[i] : Vc[i]; }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd2, vsub2, varvg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.\n\nExamples\n\nvadd2.s32.s32.u32.sat r1, r2, r3, r1;\n\nvsub2.s32.s32.s32.sat r1.h0, r2.h10, r3.h32, r1;\n\nvmin2.s32.u32.u32.add r1.h10, r2.h00, r3.h22, r1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2" }; case "vadd4": return { "html": "For more information, visit vadd4 .

SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4

\n\n\n

Integer quad byte SIMD addition/subtraction.

\n

vavrg4

\n

Integer quad byte SIMD average.

\n

vabsdiff4

\n

Integer quad byte SIMD absolute value of difference.

\n

vmin4, vmax4

\n

Integer quad byte SIMD minimum/maximum.

\n

Syntax

\n
// SIMD instruction with secondary SIMD merge operation\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\nvop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .b0,\n           .b1, .b10,\n           .b2, .b20, .b21, .b210,\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n    defaults to .b3210\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n   .asel defaults to .b3210\n   .bsel defaults to .b7654\n
\n
\n

Description

\n

Four-way SIMD parallel arithmetic operation with secondary operation.

\n

Elements of each quad byte source to the operation are selected from any of the eight bytes in the\ntwo source operands a and b using the asel and bsel modifiers.

\n

The selected bytes are then operated on in parallel.

\n

The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.

\n

For instructions with a secondary SIMD merge operation:

\n
    \n
  • For byte positions indicated in mask, the selected byte results are copied into destination\nd. For all other positions, the corresponding byte from source operand c is copied to\nd.

  • \n
\n

For instructions with a secondary accumulate operation:

\n
    \n
  • For byte positions indicated in mask, the selected byte results are added to operand c,\nproducing a result in d.

  • \n
\n

Semantics

\n
// extract quads of bytes and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_4( a, b, .asel, .atype );\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\nVc = extractAndSignExt_4( c );\nfor (i=0; i<4; i++) {\n    switch ( vop4 ) {\n        case vadd4:            t[i] = Va[i] + Vb[i];\n        case vsub4:            t[i] = Va[i] - Vb[i];\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) >= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n                               }\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i<4; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i<4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.0.

\n

Target ISA Notes

\n

vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.

\n

Examples

\n
vadd4.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n
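// Hedged sketch: per-byte saturating add writing only bytes 1 and 0 of r1; bytes 3 and 2\n// are kept from operand c, here also r1 (illustrative registers).\nvadd4.u32.u32.u32.sat  r1.b10, r2, r3, r1;\n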
\n
\n
", "tooltip": "Integer quad byte SIMD addition/subtraction.\n\nvavrg4\n\nInteger quad byte SIMD average.\n\nvabsdiff4\n\nInteger quad byte SIMD absolute value of difference.\n\nvmin4, vmax4\n\nInteger quad byte SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop4.dtype.atype.btype{.sat} d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop4.dtype.atype.btype.add d{.mask}, a{.asel}, b{.bsel}, c;\n\nvop4 = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask = { .b0,\n\n .b1, .b10\n\n .b2, .b20, .b21, .b210,\n\n .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n\n defaults to .b3210\n\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n\n .asel defaults to .b3210\n\n .bsel defaults to .b7654\n\nDescription\n\nFour-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each quad byte source to the operation are selected from any of the eight bytes in the\n\ntwo source operands a and b using the asel and bsel modifiers.\n\nThe selected bytes are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor byte positions indicated in mask, the selected byte results are copied into destination\n\nd. For all other positions, the corresponding byte from source operand c is copied to\n\nd.\n\nFor instructions with a secondary accumulate operation:\n\nFor byte positions indicated in mask, the selected byte results are added to operand c,\n\nproducing a result in d.\n\nSemantics\n\n// extract quads of bytes and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_4( a, b, .asel, .atype );\n\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_4( c );\n\nfor (i=0; i<4; i++) {\n\n switch ( vop4 ) {\n\n case vadd4: t[i] = Va[i] + Vb[i];\n\n case vsub4: t[i] = Va[i] - Vb[i];\n\n case vavrg4: if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n } else {\n\n t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n }\n\n case vabsdiff4: t[i] = | Va[i] - Vb[i] |;\n\n case vmin4: t[i] = MIN( Va[i], Vb[i] );\n\n case vmax4: t[i] = MAX( Va[i], Vb[i] );\n\n }\n\n if (.sat) {\n\n if ( .dtype == .s32 ) t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n\n else t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n\n }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n d = c;\n\n for (i=0; i<4; i++) { d += mask[i] ? t[i] : 0; }\n\n} else {\n\n d = 0;\n\n for (i=0; i<4; i++) { d |= mask[i] ? t[i] : Vc[i]; }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd4, vsub4, varvg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.\n\nExamples\n\nvadd4.s32.s32.u32.sat r1, r2, r3, r1;\n\nvsub4.s32.s32.s32.sat r1.b0, r2.b3210, r3.b7654, r1;\n\nvmin4.s32.u32.u32.add r1.b00, r2.b0000, r3.b2222, r1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4" }; case "vavrg2": return { "html": "For more information, visit vavrg2 .

SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2

\n\n\n

Integer dual half-word SIMD addition/subtraction.

\n

vavrg2

\n

Integer dual half-word SIMD average.

\n

vabsdiff2

\n

Integer dual half-word SIMD absolute value of difference.

\n

vmin2, vmax2

\n

Integer dual half-word SIMD minimum/maximum.

\n

Syntax

\n
// SIMD instruction with secondary SIMD merge operation\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n   .asel defaults to .h10\n   .bsel defaults to .h32\n
\n
\n

Description

\n

Two-way SIMD parallel arithmetic operation with secondary operation.

\n

Elements of each dual half-word source to the operation are selected from any of the four half-words\nin the two source operands a and b using the asel and bsel modifiers.

\n

The selected half-words are then operated on in parallel.

\n

The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.

\n

For instructions with a secondary SIMD merge operation:

\n
    \n
  • For half-word positions indicated in mask, the selected half-word results are copied into\ndestination d. For all other positions, the corresponding half-word from source operand c\nis copied to d.

  • \n
\n

For instructions with a secondary accumulate operation:

\n
    \n
  • For half-word positions indicated in mask, the selected half-word results are added to operand\nc, producing a result in d.

  • \n
\n

Semantics

\n
// extract pairs of half-words and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_2( a, b, .asel, .atype );\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n    switch ( vop2 ) {\n       case vadd2:             t[i] = Va[i] + Vb[i];\n       case vsub2:             t[i] = Va[i] - Vb[i];\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) >= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n                               }\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i<2; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i<2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.0.

\n

Target ISA Notes

\n

vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.

\n

Examples

\n
vadd2.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n
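// Hedged sketch: rounded per-lane average of two packed half-word pairs, e.g. for blending\n// two 2x16-bit samples (illustrative registers; c only matters for masked-out lanes).\nvavrg2.u32.u32.u32  r1, r2, r3, r1;\n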
\n
\n
", "tooltip": "Integer dual half-word SIMD addition/subtraction.\n\nvavrg2\n\nInteger dual half-word SIMD average.\n\nvabsdiff2\n\nInteger dual half-word SIMD absolute value of difference.\n\nvmin2, vmax2\n\nInteger dual half-word SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop2.dtype.atype.btype{.sat} d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop2.dtype.atype.btype.add d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2 = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask = { .h0, .h1, .h10 }; // defaults to .h10\n\n.asel = .bsel = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n\n .asel defaults to .h10\n\n .bsel defaults to .h32\n\nDescription\n\nTwo-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each dual half-word source to the operation are selected from any of the four half-words\n\nin the two source operands a and b using the asel and bsel modifiers.\n\nThe selected half-words are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor half-word positions indicated in mask, the selected half-word results are copied into\n\ndestination d. For all other positions, the corresponding half-word from source operand c\n\nis copied to d.\n\nFor instructions with a secondary accumulate operation:\n\nFor half-word positions indicated in mask, the selected half-word results are added to operand\n\nc, producing a result in d.\n\nSemantics\n\n// extract pairs of half-words and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_2( a, b, .asel, .atype );\n\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n\n switch ( vop2 ) {\n\n case vadd2: t[i] = Va[i] + Vb[i];\n\n case vsub2: t[i] = Va[i] - Vb[i];\n\n case vavrg2: if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n } else {\n\n t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n }\n\n case vabsdiff2: t[i] = | Va[i] - Vb[i] |;\n\n case vmin2: t[i] = MIN( Va[i], Vb[i] );\n\n case vmax2: t[i] = MAX( Va[i], Vb[i] );\n\n }\n\n if (.sat) {\n\n if ( .dtype == .s32 ) t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n\n else t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n\n }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n d = c;\n\n for (i=0; i<2; i++) { d += mask[i] ? t[i] : 0; }\n\n} else {\n\n d = 0;\n\n for (i=0; i<2; i++) { d |= mask[i] ? t[i] : Vc[i]; }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd2, vsub2, varvg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.\n\nExamples\n\nvadd2.s32.s32.u32.sat r1, r2, r3, r1;\n\nvsub2.s32.s32.s32.sat r1.h0, r2.h10, r3.h32, r1;\n\nvmin2.s32.u32.u32.add r1.h10, r2.h00, r3.h22, r1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2" }; case "vavrg4": return { "html": "For more information, visit vavrg4 .

SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4

\n\n\n

Integer quad byte SIMD addition/subtraction.

\n

vavrg4

\n

Integer quad byte SIMD average.

\n

vabsdiff4

\n

Integer quad byte SIMD absolute value of difference.

\n

vmin4, vmax4

\n

Integer quad byte SIMD minimum/maximum.

\n

Syntax

\n
// SIMD instruction with secondary SIMD merge operation\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\nvop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .b0,\n           .b1, .b10,\n           .b2, .b20, .b21, .b210,\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n    defaults to .b3210\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n   .asel defaults to .b3210\n   .bsel defaults to .b7654\n
\n
\n

Description

\n

Four-way SIMD parallel arithmetic operation with secondary operation.

\n

Elements of each quad byte source to the operation are selected from any of the eight bytes in the\ntwo source operands a and b using the asel and bsel modifiers.

\n

The selected bytes are then operated on in parallel.

\n

The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.

\n

For instructions with a secondary SIMD merge operation:

\n
    \n
  • For byte positions indicated in mask, the selected byte results are copied into destination\nd. For all other positions, the corresponding byte from source operand c is copied to\nd.

  • \n
\n

For instructions with a secondary accumulate operation:

\n
    \n
  • For byte positions indicated in mask, the selected byte results are added to operand c,\nproducing a result in d.

  • \n
\n

Semantics

\n
// extract quads of bytes and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_4( a, b, .asel, .atype );\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\nVc = extractAndSignExt_4( c );\nfor (i=0; i<4; i++) {\n    switch ( vop4 ) {\n        case vadd4:            t[i] = Va[i] + Vb[i];\n        case vsub4:            t[i] = Va[i] - Vb[i];\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) >= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n                               }\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i<4; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i<4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.0.

\n

Target ISA Notes

\n

vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.

\n

Examples

\n
vadd4.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n
\n
\n
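The rounding rule in the semantics above is easy to misread, so here is a minimal CUDA sketch of the per-byte signed average (an illustrative emulation assuming the default selectors .b3210/.b7654 and the full .b3210 merge mask; it is not the hardware instruction itself):

// Hypothetical helper emulating vavrg4 on signed bytes: add 1 before the
// shift only when the lane sum is non-negative, exactly as in the
// semantics above. Assumes arithmetic right shift of negative ints,
// as on NVCC targets.
__host__ __device__ unsigned int emulate_vavrg4(unsigned int a, unsigned int b)
{
    unsigned int d = 0;
    for (int i = 0; i < 4; ++i) {
        int va = (signed char)(a >> (8 * i));   // sign-extend byte lane i
        int vb = (signed char)(b >> (8 * i));
        int sum = va + vb;
        int t = (sum >= 0) ? ((sum + 1) >> 1) : (sum >> 1);
        d |= ((unsigned int)t & 0xFFu) << (8 * i);
    }
    return d;
}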
", "tooltip": "Integer quad byte SIMD addition/subtraction.\n\nvavrg4\n\nInteger quad byte SIMD average.\n\nvabsdiff4\n\nInteger quad byte SIMD absolute value of difference.\n\nvmin4, vmax4\n\nInteger quad byte SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop4.dtype.atype.btype{.sat} d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop4.dtype.atype.btype.add d{.mask}, a{.asel}, b{.bsel}, c;\n\nvop4 = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask = { .b0,\n\n .b1, .b10\n\n .b2, .b20, .b21, .b210,\n\n .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n\n defaults to .b3210\n\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n\n .asel defaults to .b3210\n\n .bsel defaults to .b7654\n\nDescription\n\nFour-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each quad byte source to the operation are selected from any of the eight bytes in the\n\ntwo source operands a and b using the asel and bsel modifiers.\n\nThe selected bytes are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor byte positions indicated in mask, the selected byte results are copied into destination\n\nd. For all other positions, the corresponding byte from source operand c is copied to\n\nd.\n\nFor instructions with a secondary accumulate operation:\n\nFor byte positions indicated in mask, the selected byte results are added to operand c,\n\nproducing a result in d.\n\nSemantics\n\n// extract quads of bytes and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_4( a, b, .asel, .atype );\n\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_4( c );\n\nfor (i=0; i<4; i++) {\n\n switch ( vop4 ) {\n\n case vadd4: t[i] = Va[i] + Vb[i];\n\n case vsub4: t[i] = Va[i] - Vb[i];\n\n case vavrg4: if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n } else {\n\n t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n }\n\n case vabsdiff4: t[i] = | Va[i] - Vb[i] |;\n\n case vmin4: t[i] = MIN( Va[i], Vb[i] );\n\n case vmax4: t[i] = MAX( Va[i], Vb[i] );\n\n }\n\n if (.sat) {\n\n if ( .dtype == .s32 ) t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n\n else t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n\n }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n d = c;\n\n for (i=0; i<4; i++) { d += mask[i] ? t[i] : 0; }\n\n} else {\n\n d = 0;\n\n for (i=0; i<4; i++) { d |= mask[i] ? t[i] : Vc[i]; }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd4, vsub4, varvg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.\n\nExamples\n\nvadd4.s32.s32.u32.sat r1, r2, r3, r1;\n\nvsub4.s32.s32.s32.sat r1.b0, r2.b3210, r3.b7654, r1;\n\nvmin4.s32.u32.u32.add r1.b00, r2.b0000, r3.b2222, r1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4" }; case "version": return { "html": "For more information, visit version .

PTX Module Directives: .version

\n\n\n

PTX ISA version number.

\n

Syntax

\n
.version  major.minor    // major, minor are integers\n
\n
\n

Description

\n

Specifies the PTX language version number.

\n

The major number is incremented when there are incompatible changes to the PTX language, such as\nchanges to the syntax or semantics. The version major number is used by the PTX compiler to ensure\ncorrect execution of legacy PTX code.

\n

The minor number is incremented when new features are added to PTX.

\n

Semantics

\n

Indicates that this module must be compiled with tools that support an equal or greater version\nnumber.

\n

Each PTX module must begin with a .version directive, and no other .version directive is\nallowed anywhere else within the module.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
.version 3.1\n.version 3.0\n.version 2.3\n
\n
\n
", "tooltip": "PTX ISA version number.\n\nSyntax\n\n.version major.minor // major, minor are integers\n\nDescription\n\nSpecifies the PTX language version number.\n\nThe major number is incremented when there are incompatible changes to the PTX language, such as\n\nchanges to the syntax or semantics. The version major number is used by the PTX compiler to ensure\n\ncorrect execution of legacy PTX code.\n\nThe minor number is incremented when new features are added to PTX.\n\nSemantics\n\nIndicates that this module must be compiled with tools that support an equal or greater version\n\nnumber.\n\nEach PTX module must begin with a .version directive, and no other .version directive is\n\nallowed anywhere else within the module.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.version 3.1\n\n.version 3.0\n\n.version 2.3\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#ptx-module-directives-version" }; case "visible": return { "html": "For more information, visit visible .

Linking Directives: .visible

\n\n\n

Visible (externally) symbol declaration.

\n

Syntax

\n
.visible identifier\n
\n
\n

Description

\n

Declares identifier to be globally visible. Unlike C, where identifiers are globally visible unless\ndeclared static, PTX identifiers are visible only within the current module unless declared\n.visible outside the current module.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
.visible .global .b32 foo;  // foo will be externally visible\n
\n
\n
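For context, CUDA C++ kernels and file-scope device variables are normally emitted as externally visible PTX symbols, so a definition like the sketch below typically produces .visible declarations in the generated PTX (illustrative only; exact output depends on the compiler and its options):

// A __global__ kernel is typically lowered to a .visible .entry symbol.
__global__ void vis_demo(int *out) { out[threadIdx.x] = threadIdx.x; }

// A file-scope __device__ variable similarly becomes a .visible .global
// symbol unless it is given module-local (static) linkage.
__device__ int counter;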
", "tooltip": "Visible (externally) symbol declaration.\n\nSyntax\n\n.visible identifier\n\nDescription\n\nDeclares identifier to be globally visible. Unlike C, where identifiers are globally visible unless\n\ndeclared static, PTX identifiers are visible only within the current module unless declared\n\n.visible outside the current.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.visible .global .b32 foo; // foo will be externally visible\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#linking-directives-visible" }; case "vmad": return { "html": "For more information, visit vmad .

Scalar Video Instructions: vmad

\n\n\n

Integer byte/half-word/word multiply-accumulate.

\n

Syntax

\n
// 32-bit scalar operation\nvmad.dtype.atype.btype{.sat}{.scale}     d, {-}a{.asel}, {-}b{.bsel},\n                                         {-}c;\nvmad.dtype.atype.btype.po{.sat}{.scale}  d, a{.asel}, b{.bsel}, c;\n\n.dtype = .atype = .btype = { .u32, .s32 };\n.asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.scale = { .shr7, .shr15 };\n
\n
\n

Description

\n

Calculate (a*b) + c, with optional operand negates, plus one mode, and scaling.

\n

The source operands support optional negation with some restrictions. Although PTX syntax allows\nseparate negation of the a and b operands, internally this is represented as negation of the\nproduct (a*b). That is, (a*b) is negated if and only if exactly one of a or b is\nnegated. PTX allows negation of either (a*b) or c.

\n

The plus one mode (.po) computes (a*b) + c + 1, which is used in computing averages. Source\noperands may not be negated in .po mode.

\n

The intermediate result of (a*b) is unsigned if atype and btype are unsigned and the product\n(a*b) is not negated; otherwise, the intermediate result is signed. Input c has the same\nsign as the intermediate result.

\n

The final result is unsigned if the intermediate result is unsigned and c is not negated.

\n

Depending on the sign of the a and b operands, and the operand negates, the following\ncombinations of operands are supported for VMAD:

\n
 (u32 * u32) + u32  // intermediate unsigned; final unsigned\n-(u32 * u32) + s32  // intermediate   signed; final   signed\n (u32 * u32) - u32  // intermediate unsigned; final   signed\n (u32 * s32) + s32  // intermediate   signed; final   signed\n-(u32 * s32) + s32  // intermediate   signed; final   signed\n (u32 * s32) - s32  // intermediate   signed; final   signed\n (s32 * u32) + s32  // intermediate   signed; final   signed\n-(s32 * u32) + s32  // intermediate   signed; final   signed\n (s32 * u32) - s32  // intermediate   signed; final   signed\n (s32 * s32) + s32  // intermediate   signed; final   signed\n-(s32 * s32) + s32  // intermediate   signed; final   signed\n (s32 * s32) - s32  // intermediate   signed; final   signed\n
\n
\n

The intermediate result is optionally scaled via right-shift; this result is sign-extended if the\nfinal result is signed, and zero-extended otherwise.

\n

The final result is optionally saturated to the appropriate 32-bit range based on the type (signed\nor unsigned) of the final result.

\n

Semantics

\n
// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = partSelectSignExtend( a, atype, asel );\ntb = partSelectSignExtend( b, btype, bsel );\nsignedFinal = isSigned(atype) || isSigned(btype) ||\n                                 (a.negate ^ b.negate) || c.negate;\ntmp[127:0] = ta * tb;\n\nlsb = 0;\nif ( .po )                  {              lsb = 1; } else\nif ( a.negate ^ b.negate )  { tmp = ~tmp;  lsb = 1; } else\nif ( c.negate )             { c   = ~c;    lsb = 1; }\n\nc128[127:0] = (signedFinal) ? sext32( c ) : zext( c );\ntmp = tmp + c128 + lsb;\nswitch( scale ) {\n   case .shr7:   result = (tmp >>  7) & 0xffffffffffffffff;\n   case .shr15:  result = (tmp >> 15) & 0xffffffffffffffff;\n}\nif ( .sat ) {\n     if (signedFinal) result = CLAMP(result, S32_MAX, S32_MIN);\n     else             result = CLAMP(result, U32_MAX, U32_MIN);\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

vmad requires sm_20 or higher.

\n

Examples

\n
vmad.s32.s32.u32.sat    r0, r1, r2, -r3;\nvmad.u32.u32.u32.shr15  r0, r1.h0, r2.h0, r3;\n
\n
\n
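To make the widening and scaling steps concrete, here is a small CUDA sketch covering just the unsigned, non-negated vmad.u32.u32.u32.shr15 form from the example above (an illustrative emulation, not the full semantics with negation, .po, or .sat):

// Hypothetical emulation of vmad.u32.u32.u32.shr15 d, a, b, c:
// widen to 64 bits, multiply-accumulate, then apply the .shr15 scale.
__host__ __device__ unsigned int emulate_vmad_u32_shr15(unsigned int a,
                                                        unsigned int b,
                                                        unsigned int c)
{
    unsigned long long tmp = (unsigned long long)a * b + c;  // 64-bit intermediate
    return (unsigned int)(tmp >> 15);  // scale, then truncate to the 32-bit dest
}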
", "tooltip": "Integer byte/half-word/word multiply-accumulate.\n\nSyntax\n\n// 32-bit scalar operation\n\nvmad.dtype.atype.btype{.sat}{.scale} d, {-}a{.asel}, {-}b{.bsel},\n\n {-}c;\n\nvmad.dtype.atype.btype.po{.sat}{.scale} d, a{.asel}, b{.bsel}, c;\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.asel = .bsel = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.scale = { .shr7, .shr15 };\n\nDescription\n\nCalculate (a*b) + c, with optional operand negates, plus one mode, and scaling.\n\nThe source operands support optional negation with some restrictions. Although PTX syntax allows\n\nseparate negation of the a and b operands, internally this is represented as negation of the\n\nproduct (a*b). That is, (a*b) is negated if and only if exactly one of a or b is\n\nnegated. PTX allows negation of either (a*b) or c.\n\nThe plus one mode (.po) computes (a*b) + c + 1, which is used in computing averages. Source\n\noperands may not be negated in .po mode.\n\nThe intermediate result of (a*b) is unsigned if atype and btype are unsigned and the product\n\n(a*b) is not negated; otherwise, the intermediate result is signed. Input c has the same\n\nsign as the intermediate result.\n\nThe final result is unsigned if the intermediate result is unsigned and c is not negated.\n\nDepending on the sign of the a and b operands, and the operand negates, the following\n\ncombinations of operands are supported for VMAD:\n\n (u32 * u32) + u32 // intermediate unsigned; final unsigned\n\n-(u32 * u32) + s32 // intermediate signed; final signed\n\n (u32 * u32) - u32 // intermediate unsigned; final signed\n\n (u32 * s32) + s32 // intermediate signed; final signed\n\n-(u32 * s32) + s32 // intermediate signed; final signed\n\n (u32 * s32) - s32 // intermediate signed; final signed\n\n (s32 * u32) + s32 // intermediate signed; final signed\n\n-(s32 * u32) + s32 // intermediate signed; final signed\n\n (s32 * u32) - s32 // intermediate signed; final signed\n\n (s32 * s32) + s32 // intermediate signed; final signed\n\n-(s32 * s32) + s32 // intermediate signed; final signed\n\n (s32 * s32) - s32 // intermediate signed; final signed\n\nThe intermediate result is optionally scaled via right-shift; this result is sign-extended if the\n\nfinal result is signed, and zero-extended otherwise.\n\nThe final result is optionally saturated to the appropriate 32-bit range based on the type (signed\n\nor unsigned) of the final result.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a, atype, asel );\n\ntb = partSelectSignExtend( b, btype, bsel );\n\nsignedFinal = isSigned(atype) || isSigned(btype) ||\n\n (a.negate ^ b.negate) || c.negate;\n\ntmp[127:0] = ta * tb;\n\nlsb = 0;\n\nif ( .po ) { lsb = 1; } else\n\nif ( a.negate ^ b.negate ) { tmp = ~tmp; lsb = 1; } else\n\nif ( c.negate ) { c = ~c; lsb = 1; }\n\nc128[127:0] = (signedFinal) sext32( c ) : zext ( c );\n\ntmp = tmp + c128 + lsb;\n\nswitch( scale ) {\n\n case .shr7: result = (tmp >> 7) & 0xffffffffffffffff;\n\n case .shr15: result = (tmp >> 15) & 0xffffffffffffffff;\n\n}\n\nif ( .sat ) {\n\n if (signedFinal) result = CLAMP(result, S32_MAX, S32_MIN);\n\n else result = CLAMP(result, U32_MAX, U32_MIN);\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvmad requires sm_20 or higher.\n\nExamples\n\nvmad.s32.s32.u32.sat r0, r1, r2, -r3;\n\nvmad.u32.u32.u32.shr15 r0, r1.h0, r2.h0, r3;\n\n ...", "url": 
"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vmad" }; case "vmax": return { "html": "For more information, visit vmax .

Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax

\n\n\n

Integer byte/half-word/word addition/subtraction.

\n

vabsdiff

\n

Integer byte/half-word/word absolute value of difference.

\n

vmin, vmax

\n

Integer byte/half-word/word minimum/maximum.

\n

Syntax

\n
// 32-bit scalar operation, with optional secondary operation\nvop.dtype.atype.btype{.sat}       d, a{.asel}, b{.bsel};\nvop.dtype.atype.btype{.sat}.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\nvop.dtype.atype.btype{.sat}  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vadd, vsub, vabsdiff, vmin, vmax };\n.dtype = .atype = .btype = { .u32, .s32 };\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.op2   = { .add, .min, .max };\n
\n
\n

Description

\n

Perform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.

\n

Semantics

\n
// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = partSelectSignExtend( a, atype, asel );\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n    case vadd:     tmp = ta + tb;\n    case vsub:     tmp = ta - tb;\n    case vabsdiff: tmp = | ta - tb |;\n    case vmin:     tmp = MIN( ta, tb );\n    case vmax:     tmp = MAX( ta, tb );\n}\n// saturate, taking into account destination type and merge operations\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

vadd, vsub, vabsdiff, vmin, vmax require sm_20 or higher.

\n

Examples

\n
vadd.s32.u32.s32.sat      r1, r2.b0, r3.h0;\nvsub.s32.s32.u32.sat      r1, r2.h1, r3.h1;\nvabsdiff.s32.s32.s32.sat  r1.h0, r2.b0, r3.b2, c;\nvmin.s32.s32.s32.sat.add  r1, r2, r3, c;\n
\n
\n
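As a worked illustration of the optional secondary operation, a CUDA sketch of the vmin.s32.s32.s32.sat.add example above (a simplified emulation: selectors default to the full word, so the part-select step is omitted):

// Hypothetical emulation of vmin.s32.s32.s32.sat.add d, a, b, c:
// tmp = MIN(a, b), saturated to s32 (a no-op for full-word inputs,
// shown for completeness), then the .add secondary op accumulates c.
__host__ __device__ int emulate_vmin_sat_add(int a, int b, int c)
{
    long long tmp = (a < b) ? a : b;                 // MIN on full-word selections
    if (tmp > 2147483647LL)  tmp = 2147483647LL;     // s32 saturation bounds
    if (tmp < -2147483648LL) tmp = -2147483648LL;
    return (int)(tmp + c);  // secondary accumulate, modular per d = tmp + c
}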
", "tooltip": "Integer byte/half-word/word addition/subtraction.\n\nvabsdiff\n\nInteger byte/half-word/word absolute value of difference.\n\nvmin, vmax\n\nInteger byte/half-word/word minimum/maximum.\n\nSyntax\n\n// 32-bit scalar operation, with optional secondary operation\n\nvop.dtype.atype.btype{.sat} d, a{.asel}, b{.bsel};\n\nvop.dtype.atype.btype{.sat}.op2 d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\n\nvop.dtype.atype.btype{.sat} d.dsel, a{.asel}, b{.bsel}, c;\n\n vop = { vadd, vsub, vabsdiff, vmin, vmax };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.dsel = .asel = .bsel = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.op2 = { .add, .min, .max };\n\nDescription\n\nPerform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a, atype, asel );\n\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n\n case vadd: tmp = ta + tb;\n\n case vsub: tmp = ta - tb;\n\n case vabsdiff: tmp = | ta - tb |;\n\n case vmin: tmp = MIN( ta, tb );\n\n case vmax: tmp = MAX( ta, tb );\n\n}\n\n// saturate, taking into account destination type and merge operations\n\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\n\nd = optSecondaryOp( op2, tmp, c ); // optional secondary operation\n\nd = optMerge( dsel, tmp, c ); // optional merge with c operand\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvadd, vsub, vabsdiff, vmin, vmax require sm_20 or higher.\n\nExamples\n\nvadd.s32.u32.s32.sat r1, r2.b0, r3.h0;\n\nvsub.s32.s32.u32.sat r1, r2.h1, r3.h1;\n\nvabsdiff.s32.s32.s32.sat r1.h0, r2.b0, r3.b2, c;\n\nvmin.s32.s32.s32.sat.add r1, r2, r3, c;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax" }; case "vmax2": return { "html": "For more information, visit vmax2 .

SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2

\n\n\n

Integer dual half-word SIMD addition/subtraction.

\n

vavrg2

\n

Integer dual half-word SIMD average.

\n

vabsdiff2

\n

Integer dual half-word SIMD absolute value of difference.

\n

vmin2, vmax2

\n

Integer dual half-word SIMD minimum/maximum.

\n

Syntax

\n
// SIMD instruction with secondary SIMD merge operation\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n   .asel defaults to .h10\n   .bsel defaults to .h32\n
\n
\n

Description

\n

Two-way SIMD parallel arithmetic operation with secondary operation.

\n

Elements of each dual half-word source to the operation are selected from any of the four half-words\nin the two source operands a and b using the asel and bsel modifiers.

\n

The selected half-words are then operated on in parallel.

\n

The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.

\n

For instructions with a secondary SIMD merge operation:

\n
    \n
  • For half-word positions indicated in mask, the selected half-word results are copied into\ndestination d. For all other positions, the corresponding half-word from source operand c\nis copied to d.

  • \n
\n

For instructions with a secondary accumulate operation:

\n
    \n
  • For half-word positions indicated in mask, the selected half-word results are added to operand\nc, producing a result in d.

  • \n
\n

Semantics

\n
// extract pairs of half-words and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_2( a, b, .asel, .atype );\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n    switch ( vop2 ) {\n       case vadd2:             t[i] = Va[i] + Vb[i];\n       case vsub2:             t[i] = Va[i] - Vb[i];\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) >= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n                               }\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i<2; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i<2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.0.

\n

Target ISA Notes

\n

vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.

\n

Examples

\n
vadd2.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n
\n
\n
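The .asel/.bsel indexing above treats the four half-words of a and b as one pool (0 and 1 from a, 2 and 3 from b, which is why .asel defaults to .h10 and .bsel to .h32). A CUDA sketch of that selection step, as a hypothetical helper mirroring extractAndSignExt_2 for signed inputs:

// Hypothetical helper: half-words 0-1 come from a, 2-3 from b; sel_hi and
// sel_lo are the two digits of an .hxy selector (x = result lane 1,
// y = result lane 0).
__host__ __device__ void extract_halves_s32(unsigned int a, unsigned int b,
                                            int sel_hi, int sel_lo, int v[2])
{
    unsigned long long pool = ((unsigned long long)b << 32) | a;
    v[0] = (short)(pool >> (16 * sel_lo));   // sign-extend selected half-word
    v[1] = (short)(pool >> (16 * sel_hi));
}

// Default selection .h10 corresponds to extract_halves_s32(a, b, 1, 0, v).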
", "tooltip": "Integer dual half-word SIMD addition/subtraction.\n\nvavrg2\n\nInteger dual half-word SIMD average.\n\nvabsdiff2\n\nInteger dual half-word SIMD absolute value of difference.\n\nvmin2, vmax2\n\nInteger dual half-word SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop2.dtype.atype.btype{.sat} d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop2.dtype.atype.btype.add d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2 = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask = { .h0, .h1, .h10 }; // defaults to .h10\n\n.asel = .bsel = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n\n .asel defaults to .h10\n\n .bsel defaults to .h32\n\nDescription\n\nTwo-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each dual half-word source to the operation are selected from any of the four half-words\n\nin the two source operands a and b using the asel and bsel modifiers.\n\nThe selected half-words are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor half-word positions indicated in mask, the selected half-word results are copied into\n\ndestination d. For all other positions, the corresponding half-word from source operand c\n\nis copied to d.\n\nFor instructions with a secondary accumulate operation:\n\nFor half-word positions indicated in mask, the selected half-word results are added to operand\n\nc, producing a result in d.\n\nSemantics\n\n// extract pairs of half-words and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_2( a, b, .asel, .atype );\n\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n\n switch ( vop2 ) {\n\n case vadd2: t[i] = Va[i] + Vb[i];\n\n case vsub2: t[i] = Va[i] - Vb[i];\n\n case vavrg2: if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n } else {\n\n t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n }\n\n case vabsdiff2: t[i] = | Va[i] - Vb[i] |;\n\n case vmin2: t[i] = MIN( Va[i], Vb[i] );\n\n case vmax2: t[i] = MAX( Va[i], Vb[i] );\n\n }\n\n if (.sat) {\n\n if ( .dtype == .s32 ) t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n\n else t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n\n }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n d = c;\n\n for (i=0; i<2; i++) { d += mask[i] ? t[i] : 0; }\n\n} else {\n\n d = 0;\n\n for (i=0; i<2; i++) { d |= mask[i] ? t[i] : Vc[i]; }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd2, vsub2, varvg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.\n\nExamples\n\nvadd2.s32.s32.u32.sat r1, r2, r3, r1;\n\nvsub2.s32.s32.s32.sat r1.h0, r2.h10, r3.h32, r1;\n\nvmin2.s32.u32.u32.add r1.h10, r2.h00, r3.h22, r1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2" }; case "vmax4": return { "html": "For more information, visit vmax4 .

SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4

\n\n\n

Integer quad byte SIMD addition/subtraction.

\n

vavrg4

\n

Integer quad byte SIMD average.

\n

vabsdiff4

\n

Integer quad byte SIMD absolute value of difference.

\n

vmin4, vmax4

\n

Integer quad byte SIMD minimum/maximum.

\n

Syntax

\n
// SIMD instruction with secondary SIMD merge operation\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\nvop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .b0,\n           .b1, .b10\n           .b2, .b20, .b21, .b210,\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n    defaults to .b3210\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n   .asel defaults to .b3210\n   .bsel defaults to .b7654\n
\n
\n

Description

\n

Four-way SIMD parallel arithmetic operation with secondary operation.

\n

Elements of each quad byte source to the operation are selected from any of the eight bytes in the\ntwo source operands a and b using the asel and bsel modifiers.

\n

The selected bytes are then operated on in parallel.

\n

The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.

\n

For instructions with a secondary SIMD merge operation:

\n
    \n
  • For byte positions indicated in mask, the selected byte results are copied into destination\nd. For all other positions, the corresponding byte from source operand c is copied to\nd.

  • \n
\n

For instructions with a secondary accumulate operation:

\n
    \n
  • For byte positions indicated in mask, the selected byte results are added to operand c,\nproducing a result in d.

  • \n
\n

Semantics

\n
// extract quads of bytes and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_4( a, b, .asel, .atype );\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\nVc = extractAndSignExt_4( c );\nfor (i=0; i<4; i++) {\n    switch ( vop4 ) {\n        case vadd4:            t[i] = Va[i] + Vb[i];\n        case vsub4:            t[i] = Va[i] - Vb[i];\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) >= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n                               }\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i<4; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i<4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.0.

\n

Target ISA Notes

\n

vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.

\n

Examples

\n
vadd4.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n
\n
\n
", "tooltip": "Integer quad byte SIMD addition/subtraction.\n\nvavrg4\n\nInteger quad byte SIMD average.\n\nvabsdiff4\n\nInteger quad byte SIMD absolute value of difference.\n\nvmin4, vmax4\n\nInteger quad byte SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop4.dtype.atype.btype{.sat} d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop4.dtype.atype.btype.add d{.mask}, a{.asel}, b{.bsel}, c;\n\nvop4 = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask = { .b0,\n\n .b1, .b10\n\n .b2, .b20, .b21, .b210,\n\n .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n\n defaults to .b3210\n\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n\n .asel defaults to .b3210\n\n .bsel defaults to .b7654\n\nDescription\n\nFour-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each quad byte source to the operation are selected from any of the eight bytes in the\n\ntwo source operands a and b using the asel and bsel modifiers.\n\nThe selected bytes are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor byte positions indicated in mask, the selected byte results are copied into destination\n\nd. For all other positions, the corresponding byte from source operand c is copied to\n\nd.\n\nFor instructions with a secondary accumulate operation:\n\nFor byte positions indicated in mask, the selected byte results are added to operand c,\n\nproducing a result in d.\n\nSemantics\n\n// extract quads of bytes and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_4( a, b, .asel, .atype );\n\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_4( c );\n\nfor (i=0; i<4; i++) {\n\n switch ( vop4 ) {\n\n case vadd4: t[i] = Va[i] + Vb[i];\n\n case vsub4: t[i] = Va[i] - Vb[i];\n\n case vavrg4: if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n } else {\n\n t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n }\n\n case vabsdiff4: t[i] = | Va[i] - Vb[i] |;\n\n case vmin4: t[i] = MIN( Va[i], Vb[i] );\n\n case vmax4: t[i] = MAX( Va[i], Vb[i] );\n\n }\n\n if (.sat) {\n\n if ( .dtype == .s32 ) t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n\n else t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n\n }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n d = c;\n\n for (i=0; i<4; i++) { d += mask[i] ? t[i] : 0; }\n\n} else {\n\n d = 0;\n\n for (i=0; i<4; i++) { d |= mask[i] ? t[i] : Vc[i]; }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd4, vsub4, varvg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.\n\nExamples\n\nvadd4.s32.s32.u32.sat r1, r2, r3, r1;\n\nvsub4.s32.s32.s32.sat r1.b0, r2.b3210, r3.b7654, r1;\n\nvmin4.s32.u32.u32.add r1.b00, r2.b0000, r3.b2222, r1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4" }; case "vmin": return { "html": "For more information, visit vmin .

Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax

\n\n\n

Integer byte/half-word/word addition/subtraction.

\n

vabsdiff

\n

Integer byte/half-word/word absolute value of difference.

\n

vmin, vmax

\n

Integer byte/half-word/word minimum/maximum.

\n

Syntax

\n
// 32-bit scalar operation, with optional secondary operation\nvop.dtype.atype.btype{.sat}       d, a{.asel}, b{.bsel};\nvop.dtype.atype.btype{.sat}.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\nvop.dtype.atype.btype{.sat}  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vadd, vsub, vabsdiff, vmin, vmax };\n.dtype = .atype = .btype = { .u32, .s32 };\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.op2   = { .add, .min, .max };\n
\n
\n

Description

\n

Perform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.

\n

Semantics

\n
// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = partSelectSignExtend( a, atype, asel );\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n    case vadd:     tmp = ta + tb;\n    case vsub:     tmp = ta - tb;\n    case vabsdiff: tmp = | ta - tb |;\n    case vmin:     tmp = MIN( ta, tb );\n    case vmax:     tmp = MAX( ta, tb );\n}\n// saturate, taking into account destination type and merge operations\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

vadd, vsub, vabsdiff, vmin, vmax require sm_20 or higher.

\n

Examples

\n
vadd.s32.u32.s32.sat      r1, r2.b0, r3.h0;\nvsub.s32.s32.u32.sat      r1, r2.h1, r3.h1;\nvabsdiff.s32.s32.s32.sat  r1.h0, r2.b0, r3.b2, c;\nvmin.s32.s32.s32.sat.add  r1, r2, r3, c;\n
\n
\n
", "tooltip": "Integer byte/half-word/word addition/subtraction.\n\nvabsdiff\n\nInteger byte/half-word/word absolute value of difference.\n\nvmin, vmax\n\nInteger byte/half-word/word minimum/maximum.\n\nSyntax\n\n// 32-bit scalar operation, with optional secondary operation\n\nvop.dtype.atype.btype{.sat} d, a{.asel}, b{.bsel};\n\nvop.dtype.atype.btype{.sat}.op2 d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\n\nvop.dtype.atype.btype{.sat} d.dsel, a{.asel}, b{.bsel}, c;\n\n vop = { vadd, vsub, vabsdiff, vmin, vmax };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.dsel = .asel = .bsel = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.op2 = { .add, .min, .max };\n\nDescription\n\nPerform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a, atype, asel );\n\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n\n case vadd: tmp = ta + tb;\n\n case vsub: tmp = ta - tb;\n\n case vabsdiff: tmp = | ta - tb |;\n\n case vmin: tmp = MIN( ta, tb );\n\n case vmax: tmp = MAX( ta, tb );\n\n}\n\n// saturate, taking into account destination type and merge operations\n\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\n\nd = optSecondaryOp( op2, tmp, c ); // optional secondary operation\n\nd = optMerge( dsel, tmp, c ); // optional merge with c operand\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvadd, vsub, vabsdiff, vmin, vmax require sm_20 or higher.\n\nExamples\n\nvadd.s32.u32.s32.sat r1, r2.b0, r3.h0;\n\nvsub.s32.s32.u32.sat r1, r2.h1, r3.h1;\n\nvabsdiff.s32.s32.s32.sat r1.h0, r2.b0, r3.b2, c;\n\nvmin.s32.s32.s32.sat.add r1, r2, r3, c;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax" }; case "vmin2": return { "html": "For more information, visit vmin2 .

SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2

\n\n\n

Integer dual half-word SIMD addition/subtraction.

\n

vavrg2

\n

Integer dual half-word SIMD average.

\n

vabsdiff2

\n

Integer dual half-word SIMD absolute value of difference.

\n

vmin2, vmax2

\n

Integer dual half-word SIMD minimum/maximum.

\n

Syntax

\n
// SIMD instruction with secondary SIMD merge operation\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n   .asel defaults to .h10\n   .bsel defaults to .h32\n
\n
\n

Description

\n

Two-way SIMD parallel arithmetic operation with secondary operation.

\n

Elements of each dual half-word source to the operation are selected from any of the four half-words\nin the two source operands a and b using the asel and bsel modifiers.

\n

The selected half-words are then operated on in parallel.

\n

The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.

\n

For instructions with a secondary SIMD merge operation:

\n
    \n
  • For half-word positions indicated in mask, the selected half-word results are copied into\ndestination d. For all other positions, the corresponding half-word from source operand c\nis copied to d.

  • \n
\n

For instructions with a secondary accumulate operation:

\n
    \n
  • For half-word positions indicated in mask, the selected half-word results are added to operand\nc, producing a result in d.

  • \n
\n

Semantics

\n
// extract pairs of half-words and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_2( a, b, .asel, .atype );\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n    switch ( vop2 ) {\n       case vadd2:             t[i] = Va[i] + Vb[i];\n       case vsub2:             t[i] = Va[i] - Vb[i];\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) >= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n                               }\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i<2; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i<2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.0.

\n

Target ISA Notes

\n

vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.

\n

Examples

\n
vadd2.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n
\n
\n
", "tooltip": "Integer dual half-word SIMD addition/subtraction.\n\nvavrg2\n\nInteger dual half-word SIMD average.\n\nvabsdiff2\n\nInteger dual half-word SIMD absolute value of difference.\n\nvmin2, vmax2\n\nInteger dual half-word SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop2.dtype.atype.btype{.sat} d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop2.dtype.atype.btype.add d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2 = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask = { .h0, .h1, .h10 }; // defaults to .h10\n\n.asel = .bsel = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n\n .asel defaults to .h10\n\n .bsel defaults to .h32\n\nDescription\n\nTwo-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each dual half-word source to the operation are selected from any of the four half-words\n\nin the two source operands a and b using the asel and bsel modifiers.\n\nThe selected half-words are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor half-word positions indicated in mask, the selected half-word results are copied into\n\ndestination d. For all other positions, the corresponding half-word from source operand c\n\nis copied to d.\n\nFor instructions with a secondary accumulate operation:\n\nFor half-word positions indicated in mask, the selected half-word results are added to operand\n\nc, producing a result in d.\n\nSemantics\n\n// extract pairs of half-words and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_2( a, b, .asel, .atype );\n\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n\n switch ( vop2 ) {\n\n case vadd2: t[i] = Va[i] + Vb[i];\n\n case vsub2: t[i] = Va[i] - Vb[i];\n\n case vavrg2: if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n } else {\n\n t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n }\n\n case vabsdiff2: t[i] = | Va[i] - Vb[i] |;\n\n case vmin2: t[i] = MIN( Va[i], Vb[i] );\n\n case vmax2: t[i] = MAX( Va[i], Vb[i] );\n\n }\n\n if (.sat) {\n\n if ( .dtype == .s32 ) t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n\n else t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n\n }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n d = c;\n\n for (i=0; i<2; i++) { d += mask[i] ? t[i] : 0; }\n\n} else {\n\n d = 0;\n\n for (i=0; i<2; i++) { d |= mask[i] ? t[i] : Vc[i]; }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd2, vsub2, varvg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.\n\nExamples\n\nvadd2.s32.s32.u32.sat r1, r2, r3, r1;\n\nvsub2.s32.s32.s32.sat r1.h0, r2.h10, r3.h32, r1;\n\nvmin2.s32.u32.u32.add r1.h10, r2.h00, r3.h22, r1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2" }; case "vmin4": return { "html": "For more information, visit vmin4 .

SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4

\n\n\n

Integer quad byte SIMD addition/subtraction.

\n

vavrg4

\n

Integer quad byte SIMD average.

\n

vabsdiff4

\n

Integer quad byte SIMD absolute value of difference.

\n

vmin4, vmax4

\n

Integer quad byte SIMD minimum/maximum.

\n

Syntax

\n
// SIMD instruction with secondary SIMD merge operation\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\nvop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .b0,\n           .b1, .b10,\n           .b2, .b20, .b21, .b210,\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n    // defaults to .b3210\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n   .asel defaults to .b3210\n   .bsel defaults to .b7654\n
\n
\n

Description

\n

Four-way SIMD parallel arithmetic operation with secondary operation.

\n

Elements of each quad byte source to the operation are selected from any of the eight bytes in the\ntwo source operands a and b using the asel and bsel modifiers.

\n

The selected bytes are then operated on in parallel.

\n

The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.

\n

For instructions with a secondary SIMD merge operation:

\n
    \n
  • For byte positions indicated in mask, the selected byte results are copied into destination\nd. For all other positions, the corresponding byte from source operand c is copied to\nd.

  • \n
\n

For instructions with a secondary accumulate operation:

\n
    \n
  • For byte positions indicated in mask, the selected byte results are added to operand c,\nproducing a result in d.

  • \n
\n

Semantics

\n
// extract quads of bytes and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_4( a, b, .asel, .atype );\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\nVc = extractAndSignExt_4( c );\nfor (i=0; i<4; i++) {\n    switch ( vop4 ) {\n        case vadd4:            t[i] = Va[i] + Vb[i];\n        case vsub4:            t[i] = Va[i] - Vb[i];\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) >= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n                               }\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i<4; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i<4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.0.

\n

Target ISA Notes

\n

vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.

\n

Examples

\n
vadd4.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n
\n
\n
", "tooltip": "Integer quad byte SIMD addition/subtraction.\n\nvavrg4\n\nInteger quad byte SIMD average.\n\nvabsdiff4\n\nInteger quad byte SIMD absolute value of difference.\n\nvmin4, vmax4\n\nInteger quad byte SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop4.dtype.atype.btype{.sat} d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop4.dtype.atype.btype.add d{.mask}, a{.asel}, b{.bsel}, c;\n\nvop4 = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask = { .b0,\n\n .b1, .b10\n\n .b2, .b20, .b21, .b210,\n\n .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n\n defaults to .b3210\n\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n\n .asel defaults to .b3210\n\n .bsel defaults to .b7654\n\nDescription\n\nFour-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each quad byte source to the operation are selected from any of the eight bytes in the\n\ntwo source operands a and b using the asel and bsel modifiers.\n\nThe selected bytes are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor byte positions indicated in mask, the selected byte results are copied into destination\n\nd. For all other positions, the corresponding byte from source operand c is copied to\n\nd.\n\nFor instructions with a secondary accumulate operation:\n\nFor byte positions indicated in mask, the selected byte results are added to operand c,\n\nproducing a result in d.\n\nSemantics\n\n// extract quads of bytes and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_4( a, b, .asel, .atype );\n\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_4( c );\n\nfor (i=0; i<4; i++) {\n\n switch ( vop4 ) {\n\n case vadd4: t[i] = Va[i] + Vb[i];\n\n case vsub4: t[i] = Va[i] - Vb[i];\n\n case vavrg4: if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n } else {\n\n t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n }\n\n case vabsdiff4: t[i] = | Va[i] - Vb[i] |;\n\n case vmin4: t[i] = MIN( Va[i], Vb[i] );\n\n case vmax4: t[i] = MAX( Va[i], Vb[i] );\n\n }\n\n if (.sat) {\n\n if ( .dtype == .s32 ) t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n\n else t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n\n }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n d = c;\n\n for (i=0; i<4; i++) { d += mask[i] ? t[i] : 0; }\n\n} else {\n\n d = 0;\n\n for (i=0; i<4; i++) { d |= mask[i] ? t[i] : Vc[i]; }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd4, vsub4, varvg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.\n\nExamples\n\nvadd4.s32.s32.u32.sat r1, r2, r3, r1;\n\nvsub4.s32.s32.s32.sat r1.b0, r2.b3210, r3.b7654, r1;\n\nvmin4.s32.u32.u32.add r1.b00, r2.b0000, r3.b2222, r1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4" }; case "vote": return { "html": "For more information, visit vote.sync .

Parallel Synchronization and Communication Instructions: vote.sync

\n\n\n

Vote across thread group.

\n

Syntax

\n
vote.sync.mode.pred  d, {!}a, membermask;\nvote.sync.ballot.b32 d, {!}a, membermask;  // 'ballot' form, returns bitmask\n\n.mode = { .all, .any, .uni };\n
\n
\n

Description

\n

vote.sync will cause the executing thread to wait until all non-exited threads corresponding to\nmembermask have executed vote.sync with the same qualifiers and same membermask value\nbefore resuming execution.

\n

Operand membermask specifies a 32-bit integer which is a mask indicating threads participating\nin this instruction where the bit position corresponds to the thread\u2019s laneid. Operand a is a\npredicate register.

\n

In the mode form, vote.sync performs a reduction of the source predicate across all non-exited\nthreads in membermask. The destination operand d is a predicate register and its value is\nthe same across all threads in membermask.

\n

The reduction modes are:

\n
\n
.all

True if source predicate is True for all non-exited threads in membermask. Negate the\nsource predicate to compute .none.

\n
\n
.any

True if source predicate is True for some thread in membermask. Negate the source\npredicate to compute .not_all.

\n
\n
.uni

True if source predicate has the same value in all non-exited threads in\nmembermask. Negating the source predicate also computes .uni.

\n
\n
\n

In the ballot form, the destination operand d is a .b32 register. In this form,\nvote.sync.ballot.b32 simply copies the predicate from each thread in membermask into the\ncorresponding bit position of destination register d, where the bit position corresponds to the\nthread\u2019s lane id.

\n

A thread not specified in membermask will contribute a 0 for its entry in\nvote.sync.ballot.b32.

\n

The behavior of vote.sync is undefined if the executing thread is not in the membermask.

\n
\n

Note

\n

For .target sm_6x or below, all threads in membermask must execute the same vote.sync\ninstruction in convergence, and only threads belonging to some membermask can be active when\nthe vote.sync instruction is executed. Otherwise, the behavior is undefined.

\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 6.0.

\n

Target ISA Notes

\n

Requires sm_30 or higher.

\n

Examples

\n
vote.sync.all.pred    p,q,0xffffffff;\nvote.sync.ballot.b32  r1,p,0xffffffff;  // get 'ballot' across warp\n
\n
\n
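In CUDA C++, these forms are exposed through the warp-vote intrinsics, so a kernel like the sketch below typically compiles to the vote.sync instructions above (illustrative; the exact mapping depends on the compiler):

__global__ void vote_demo(int *out)
{
    unsigned int mask = 0xffffffffu;           // all 32 lanes participate
    bool p = (threadIdx.x % 2) == 0;           // per-thread predicate
    int all = __all_sync(mask, p);             // ~ vote.sync.all.pred
    int any = __any_sync(mask, p);             // ~ vote.sync.any.pred
    unsigned int bal = __ballot_sync(mask, p); // ~ vote.sync.ballot.b32
    if (threadIdx.x == 0) { out[0] = all; out[1] = any; out[2] = (int)bal; }
}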
", "tooltip": "Vote across thread group.\n\nSyntax\n\nvote.sync.mode.pred d, {!}a, membermask;\n\nvote.sync.ballot.b32 d, {!}a, membermask; // 'ballot' form, returns bitmask\n\n.mode = { .all, .any, .uni };\n\nDescription\n\nvote.sync will cause executing thread to wait until all non-exited threads corresponding to\n\nmembermask have executed vote.sync with the same qualifiers and same membermask value\n\nbefore resuming execution.\n\nOperand membermask specifies a 32-bit integer which is a mask indicating threads participating\n\nin this instruction where the bit position corresponds to thread\u2019s laneid. Operand a is a\n\npredicate register.\n\nIn the mode form, vote.sync performs a reduction of the source predicate across all non-exited\n\nthreads in membermask. The destination operand d is a predicate register and its value is\n\nthe same across all threads in membermask.\n\nThe reduction modes are:\n\n.allTrue if source predicate is True for all non-exited threads in membermask. Negate the\n\nsource predicate to compute .none.\n\n.anyTrue if source predicate is True for some thread in membermask. Negate the source\n\npredicate to compute .not_all.\n\n.uniTrue if source predicate has the same value in all non-exited threads in\n\nmembermask. Negating the source predicate also computes .uni.\n\nIn the ballot form, the destination operand d is a .b32 register. In this form,\n\nvote.sync.ballot.b32 simply copies the predicate from each thread in membermask into the\n\ncorresponding bit position of destination register d, where the bit position corresponds to the\n\nthread\u2019s lane id.\n\nA thread not specified in membermask will contribute a 0 for its entry in\n\nvote.sync.ballot.b32.\n\nThe behavior of vote.sync is undefined if the executing thread is not in the membermask.\n\nNote\n\nFor .target sm_6x or below, all threads in membermask must execute the same vote.sync\n\ninstruction in convergence, and only threads belonging to some membermask can be active when\n\nthe vote.sync instruction is executed. Otherwise, the behavior is undefined.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.0.\n\nTarget ISA Notes\n\nRequires sm_30 or higher.\n\nExamples\n\nvote.sync.all.pred p,q,0xffffffff;\n\nvote.sync.ballot.b32 r1,p,0xffffffff; // get 'ballot' across warp\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync" }; case "vset": return { "html": "For more information, visit vset .

Scalar Video Instructions: vset

\n\n\n

Integer byte/half-word/word comparison.

\n

Syntax

\n
// 32-bit scalar operation, with optional secondary operation\nvset.atype.btype.cmp       d, a{.asel}, b{.bsel};\nvset.atype.btype.cmp.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\nvset.atype.btype.cmp  d.dsel, a{.asel}, b{.bsel}, c;\n\n.atype = .btype = { .u32, .s32 };\n.cmp   = { .eq, .ne, .lt, .le, .gt, .ge };\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.op2   = { .add, .min, .max };\n
\n
\n

Description

\n

Compare input values using specified comparison, with optional secondary arithmetic operation or\nsubword data merge.

\n

The intermediate result of the comparison is always unsigned, and therefore destination d and\noperand c are also unsigned.

\n

Semantics

\n
// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = partSelectSignExtend( a, atype, asel );\ntb = partSelectSignExtend( b, btype, bsel );\ntmp = compare( ta, tb, cmp ) ? 1 : 0;\nd = optSecondaryOp( op2, tmp, c );    // optional secondary operation\nd = optMerge( dsel, tmp, c );         // optional merge with c operand\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

vset requires sm_20 or higher.

\n

Examples

\n
vset.s32.u32.lt    r1, r2, r3;\nvset.u32.u32.ne    r1, r2, r3.h1;\n
\n
\n
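Because the operand types can differ in signedness, the comparison is performed after sign- or zero-extension. A CUDA sketch of the vset.s32.u32.lt example above (a hypothetical emulation assuming full-word selectors):

// Hypothetical emulation of vset.s32.u32.lt d, a, b: a is treated as s32,
// b as u32; both are widened to 64 bits so the mixed-sign comparison is
// well defined.
__host__ __device__ unsigned int emulate_vset_s32_u32_lt(unsigned int a,
                                                         unsigned int b)
{
    long long ta = (int)a;           // sign-extend the .s32 operand
    long long tb = (unsigned int)b;  // zero-extend the .u32 operand
    return (ta < tb) ? 1u : 0u;      // unsigned 0/1 result
}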
", "tooltip": "Integer byte/half-word/word comparison.\n\nSyntax\n\n// 32-bit scalar operation, with optional secondary operation\n\nvset.atype.btype.cmp d, a{.asel}, b{.bsel};\n\nvset.atype.btype.cmp.op2 d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\n\nvset.atype.btype.cmp d.dsel, a{.asel}, b{.bsel}, c;\n\n.atype = .btype = { .u32, .s32 };\n\n.cmp = { .eq, .ne, .lt, .le, .gt, .ge };\n\n.dsel = .asel = .bsel = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.op2 = { .add, .min, .max };\n\nDescription\n\nCompare input values using specified comparison, with optional secondary arithmetic operation or\n\nsubword data merge.\n\nThe intermediate result of the comparison is always unsigned, and therefore destination d and\n\noperand c are also unsigned.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a, atype, asel );\n\ntb = partSelectSignExtend( b, btype, bsel );\n\ntmp = compare( ta, tb, cmp ) ? 1 : 0;\n\nd = optSecondaryOp( op2, tmp, c ); // optional secondary operation\n\nd = optMerge( dsel, tmp, c ); // optional merge with c operand\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvset requires sm_20 or higher.\n\nExamples\n\nvset.s32.u32.lt r1, r2, r3;\n\nvset.u32.u32.ne r1, r2, r3.h1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vset" }; case "vset2": return { "html": "For more information, visit vset2 .

SIMD Video Instructions: vset2

\n\n\n

Integer dual half-word SIMD comparison.

\n

Syntax

\n
// SIMD instruction with secondary SIMD merge operation\nvset2.atype.btype.cmp  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvset2.atype.btype.cmp.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n.atype = .btype = { .u32, .s32 };\n.cmp   = { .eq, .ne, .lt, .le, .gt, .ge };\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n   .asel defaults to .h10\n   .bsel defaults to .h32\n
\n
\n

Description

\n

Two-way SIMD parallel comparison with secondary operation.

\n

Elements of each dual half-word source to the operation are selected from any of the four half-words\nin the two source operands a and b using the asel and bsel modifiers.

\n

The selected half-words are then compared in parallel.

\n

The intermediate result of the comparison is always unsigned, and therefore the half-words of\ndestination d and operand c are also unsigned.

\n

For instructions with a secondary SIMD merge operation:

\n
    \n
  • For half-word positions indicated in mask, the selected half-word results are copied into\ndestination d. For all other positions, the corresponding half-word from source operand c\nis copied to d.

  • \n
\n

For instructions with a secondary accumulate operation:

\n
    \n
  • For half-word positions indicated in mask, the selected half-word results are added to operand\nc, producing a result in d.

  • \n
\n

Semantics

\n
// extract pairs of half-words and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_2( a, b, .asel, .atype );\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\nVc = extractAndSignExt_2( c );\nfor (i=0; i<2; i++) {\n    t[i] = compare( Va[i], Vb[i], .cmp ) ? 1 : 0;\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i<2; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i<2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.0.

\n

Target ISA Notes

\n

vset2 requires sm_30 or higher.

\n

Examples

\n
vset2.s32.u32.lt      r1, r2, r3, r0;\nvset2.u32.u32.ne.add  r1, r2, r3, r0;\n
\n
\n
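A sketch under the default selectors (.asel=.h10, .bsel=.h32; registers are placeholders): the accumulate form counts how many half-word comparisons hold:

vset2.u32.u32.ge.add  r1, r2, r3, r0;   // r1 = r0 + (r2.h0 >= r3.h2 ? 1 : 0) + (r2.h1 >= r3.h3 ? 1 : 0)\n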
", "tooltip": "Integer dual half-word SIMD comparison.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvset2.atype.btype.cmp d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvset2.atype.btype.cmp.add d{.mask}, a{.asel}, b{.bsel}, c;\n\n.atype = .btype = { .u32, .s32 };\n\n.cmp = { .eq, .ne, .lt, .le, .gt, .ge };\n\n.mask = { .h0, .h1, .h10 }; // defaults to .h10\n\n.asel = .bsel = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n\n .asel defaults to .h10\n\n .bsel defaults to .h32\n\nDescription\n\nTwo-way SIMD parallel comparison with secondary operation.\n\nElements of each dual half-word source to the operation are selected from any of the four half-words\n\nin the two source operands a and b using the asel and bsel modifiers.\n\nThe selected half-words are then compared in parallel.\n\nThe intermediate result of the comparison is always unsigned, and therefore the half-words of\n\ndestination d and operand c are also unsigned.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor half-word positions indicated in mask, the selected half-word results are copied into\n\ndestination d. For all other positions, the corresponding half-word from source operand b\n\nis copied to d.\n\nFor instructions with a secondary accumulate operation:\n\nFor half-word positions indicated in mask, the selected half-word results are added to operand\n\nc, producing a result in d.\n\nSemantics\n\n// extract pairs of half-words and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_2( a, b, .asel, .atype );\n\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n\n t[i] = compare( Va[i], Vb[i], .cmp ) ? 1 : 0;\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n d = c;\n\n for (i=0; i<2; i++) { d += mask[i] ? t[i] : 0; }\n\n} else {\n\n d = 0;\n\n for (i=0; i<2; i++) { d |= mask[i] ? t[i] : Vc[i]; }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvset2 requires sm_30 or higher.\n\nExamples\n\nvset2.s32.u32.lt r1, r2, r3, r0;\n\nvset2.u32.u32.ne.add r1, r2, r3, r0;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset2" }; case "vset4": return { "html": "For more information, visit vset4 .

SIMD Video Instructions: vset4

\n\n\n

Integer quad byte SIMD comparison.

\n

Syntax

\n
// SIMD instruction with secondary SIMD merge operation\nvset4.atype.btype.cmp  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvset4.atype.btype.cmp.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n.atype = .btype = { .u32, .s32 };\n.cmp   = { .eq, .ne, .lt, .le, .gt, .ge };\n.mask  = { .b0,\n           .b1, .b10,\n           .b2, .b20, .b21, .b210,\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n    defaults to .b3210\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n   .asel defaults to .b3210\n   .bsel defaults to .b7654\n
\n
\n

Description

\n

Four-way SIMD parallel comparison with secondary operation.

\n

Elements of each quad byte source to the operation are selected from any of the eight bytes in the\ntwo source operands a and b using the asel and bsel modifiers.

\n

The selected bytes are then compared in parallel.

\n

The intermediate result of the comparison is always unsigned, and therefore the bytes of destination\nd and operand c are also unsigned.

\n

For instructions with a secondary SIMD merge operation:

\n
    \n
  • For byte positions indicated in mask, the selected byte results are copied into destination\nd. For all other positions, the corresponding byte from source operand c is copied to\nd.

  • \n
\n

For instructions with a secondary accumulate operation:

\n
    \n
  • For byte positions indicated in mask, the selected byte results are added to operand c,\nproducing a result in d.

  • \n
\n

Semantics

\n
// extract quads of bytes and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_4( a, b, .asel, .atype );\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\nVc = extractAndSignExt_4( c );\nfor (i=0; i<4; i++) {\n    t[i] = compare( Va[i], Vb[i], cmp ) ? 1 : 0;\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i<4; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i<4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.0.

\n

Target ISA Notes

\n

vset4 requires sm_30 or higher.

\n

Examples

\n
vset4.s32.u32.lt      r1, r2, r3, r0;\nvset4.u32.u32.ne.max  r1, r2, r3, r0;\n
\n
\n
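A sketch with all defaults (registers are placeholders): the accumulate form counts matching bytes:

vset4.u32.u32.eq.add  r1, r2, r3, r0;   // r1 = r0 + number of byte positions where r2 and r3 are equal\n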
", "tooltip": "Integer quad byte SIMD comparison.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvset4.atype.btype.cmp d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvset4.atype.btype.cmp.add d{.mask}, a{.asel}, b{.bsel}, c;\n\n.atype = .btype = { .u32, .s32 };\n\n.cmp = { .eq, .ne, .lt, .le, .gt, .ge };\n\n.mask = { .b0,\n\n .b1, .b10\n\n .b2, .b20, .b21, .b210,\n\n .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n\n defaults to .b3210\n\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n\n .asel defaults to .b3210\n\n .bsel defaults to .b7654\n\nDescription\n\nFour-way SIMD parallel comparison with secondary operation.\n\nElements of each quad byte source to the operation are selected from any of the eight bytes in the\n\ntwo source operands a and b using the asel and bsel modifiers.\n\nThe selected bytes are then compared in parallel.\n\nThe intermediate result of the comparison is always unsigned, and therefore the bytes of destination\n\nd and operand c are also unsigned.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor byte positions indicated in mask, the selected byte results are copied into destination\n\nd. For all other positions, the corresponding byte from source operand b is copied to\n\nd.\n\nFor instructions with a secondary accumulate operation:\n\nFor byte positions indicated in mask, the selected byte results are added to operand c,\n\nproducing a result in d.\n\nSemantics\n\n// extract quads of bytes and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_4( a, b, .asel, .atype );\n\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_4( c );\n\nfor (i=0; i<4; i++) {\n\n t[i] = compare( Va[i], Vb[i], cmp ) ? 1 : 0;\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n d = c;\n\n for (i=0; i<4; i++) { d += mask[i] ? t[i] : 0; }\n\n} else {\n\n d = 0;\n\n for (i=0; i<4; i++) { d |= mask[i] ? t[i] : Vc[i]; }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvset4 requires sm_30 or higher.\n\nExamples\n\nvset4.s32.u32.lt r1, r2, r3, r0;\n\nvset4.u32.u32.ne.max r1, r2, r3, r0;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset4" }; case "vshl": return { "html": "For more information, visit vshl .

Scalar Video Instructions: vshl, vshr

\n\n\n

Integer byte/half-word/word left/right shift.

\n

Syntax

\n
// 32-bit scalar operation, with optional secondary operation\nvop.dtype.atype.u32{.sat}.mode       d, a{.asel}, b{.bsel};\nvop.dtype.atype.u32{.sat}.mode.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\nvop.dtype.atype.u32{.sat}.mode  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vshl, vshr };\n.dtype = .atype = { .u32, .s32 };\n.mode  = { .clamp, .wrap };\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.op2   = { .add, .min, .max };\n
\n
\n

Description

\n
\n
vshl

Shift a left by unsigned amount in b with optional saturate, and optional secondary\narithmetic operation or subword data merge. Left shift fills with zero.

\n
\n
vshr

Shift a right by unsigned amount in b with optional saturate, and optional secondary\narithmetic operation or subword data merge. Signed shift fills with the sign bit, unsigned shift\nfills with zero.

\n
\n
\n

Semantics

\n
// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = partSelectSignExtend( a, atype, asel );\ntb = partSelectSignExtend( b, .u32, bsel );\nif ( mode == .clamp && tb > 32 )  tb = 32;\nif ( mode == .wrap )              tb = tb & 0x1f;\nswitch ( vop ) {\n   case vshl:  tmp = ta << tb;\n   case vshr:  tmp = ta >> tb;\n}\n// saturate, taking into account destination type and merge operations\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

vshl, vshr require sm_20 or higher.

\n

Examples

\n
vshl.s32.u32.u32.clamp  r1, r2, r3;\nvshr.u32.u32.u32.wrap   r1, r2, r3.h1;\n
\n
\n
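A hedged sketch of the two shift modes (register names are placeholders):

vshl.u32.u32.u32.wrap.max  r1, r2, r3, r4;  // r1 = max(r2 << (r3 & 0x1f), r4)\nvshl.u32.u32.u32.clamp     r1, r2, r3;      // shift amounts greater than 32 are clamped to 32\n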
", "tooltip": "Integer byte/half-word/word left/right shift.\n\nSyntax\n\n// 32-bit scalar operation, with optional secondary operation\n\nvop.dtype.atype.u32{.sat}.mode d, a{.asel}, b{.bsel};\n\nvop.dtype.atype.u32{.sat}.mode.op2 d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\n\nvop.dtype.atype.u32{.sat}.mode d.dsel, a{.asel}, b{.bsel}, c;\n\n vop = { vshl, vshr };\n\n.dtype = .atype = { .u32, .s32 };\n\n.mode = { .clamp, .wrap };\n\n.dsel = .asel = .bsel = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.op2 = { .add, .min, .max };\n\nDescription\n\nvshlShift a left by unsigned amount in b with optional saturate, and optional secondary\n\narithmetic operation or subword data merge. Left shift fills with zero.\n\nvshrShift a right by unsigned amount in b with optional saturate, and optional secondary\n\narithmetic operation or subword data merge. Signed shift fills with the sign bit, unsigned shift\n\nfills with zero.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a,atype, asel );\n\ntb = partSelectSignExtend( b, .u32, bsel );\n\nif ( mode == .clamp && tb > 32 ) tb = 32;\n\nif ( mode == .wrap ) tb = tb & 0x1f;\n\nswitch ( vop ){\n\n case vshl: tmp = ta << tb;\n\n case vshr: tmp = ta >> tb;\n\n}\n\n// saturate, taking into account destination type and merge operations\n\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\n\nd = optSecondaryOp( op2, tmp, c ); // optional secondary operation\n\nd = optMerge( dsel, tmp, c ); // optional merge with c operand\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvshl, vshr require sm_20 or higher.\n\nExamples\n\nvshl.s32.u32.u32.clamp r1, r2, r3;\n\nvshr.u32.u32.u32.wrap r1, r2, r3.h1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr" }; case "vshr": return { "html": "For more information, visit vshr .

Scalar Video Instructions: vshl, vshr

\n\n\n

Integer byte/half-word/word left/right shift.

\n

Syntax

\n
// 32-bit scalar operation, with optional secondary operation\nvop.dtype.atype.u32{.sat}.mode       d, a{.asel}, b{.bsel};\nvop.dtype.atype.u32{.sat}.mode.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\nvop.dtype.atype.u32{.sat}.mode  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vshl, vshr };\n.dtype = .atype = { .u32, .s32 };\n.mode  = { .clamp, .wrap };\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.op2   = { .add, .min, .max };\n
\n
\n

Description

\n
\n
vshl

Shift a left by unsigned amount in b with optional saturate, and optional secondary\narithmetic operation or subword data merge. Left shift fills with zero.

\n
\n
vshr

Shift a right by unsigned amount in b with optional saturate, and optional secondary\narithmetic operation or subword data merge. Signed shift fills with the sign bit, unsigned shift\nfills with zero.

\n
\n
\n

Semantics

\n
// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = partSelectSignExtend( a, atype, asel );\ntb = partSelectSignExtend( b, .u32, bsel );\nif ( mode == .clamp && tb > 32 )  tb = 32;\nif ( mode == .wrap )              tb = tb & 0x1f;\nswitch ( vop ) {\n   case vshl:  tmp = ta << tb;\n   case vshr:  tmp = ta >> tb;\n}\n// saturate, taking into account destination type and merge operations\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

vshl, vshr require sm_20 or higher.

\n

Examples

\n
vshl.s32.u32.u32.clamp  r1, r2, r3;\nvshr.u32.u32.u32.wrap   r1, r2, r3.h1;\n
\n
\n
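A sketch of the sign-fill behavior of the right shift (placeholder registers):

vshr.s32.s32.u32.clamp  r1, r2.h1, r3;  // arithmetic shift of sign-extended half-word 1; the sign bit fills vacated positions\n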
", "tooltip": "Integer byte/half-word/word left/right shift.\n\nSyntax\n\n// 32-bit scalar operation, with optional secondary operation\n\nvop.dtype.atype.u32{.sat}.mode d, a{.asel}, b{.bsel};\n\nvop.dtype.atype.u32{.sat}.mode.op2 d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\n\nvop.dtype.atype.u32{.sat}.mode d.dsel, a{.asel}, b{.bsel}, c;\n\n vop = { vshl, vshr };\n\n.dtype = .atype = { .u32, .s32 };\n\n.mode = { .clamp, .wrap };\n\n.dsel = .asel = .bsel = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.op2 = { .add, .min, .max };\n\nDescription\n\nvshlShift a left by unsigned amount in b with optional saturate, and optional secondary\n\narithmetic operation or subword data merge. Left shift fills with zero.\n\nvshrShift a right by unsigned amount in b with optional saturate, and optional secondary\n\narithmetic operation or subword data merge. Signed shift fills with the sign bit, unsigned shift\n\nfills with zero.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a,atype, asel );\n\ntb = partSelectSignExtend( b, .u32, bsel );\n\nif ( mode == .clamp && tb > 32 ) tb = 32;\n\nif ( mode == .wrap ) tb = tb & 0x1f;\n\nswitch ( vop ){\n\n case vshl: tmp = ta << tb;\n\n case vshr: tmp = ta >> tb;\n\n}\n\n// saturate, taking into account destination type and merge operations\n\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\n\nd = optSecondaryOp( op2, tmp, c ); // optional secondary operation\n\nd = optMerge( dsel, tmp, c ); // optional merge with c operand\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvshl, vshr require sm_20 or higher.\n\nExamples\n\nvshl.s32.u32.u32.clamp r1, r2, r3;\n\nvshr.u32.u32.u32.wrap r1, r2, r3.h1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr" }; case "vsub": return { "html": "For more information, visit vsub .

Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax

\n\n\n

Integer byte/half-word/word addition/subtraction.

\n

vabsdiff

\n

Integer byte/half-word/word absolute value of difference.

\n

vmin, vmax

\n

Integer byte/half-word/word minimum/maximum.

\n

Syntax

\n
// 32-bit scalar operation, with optional secondary operation\nvop.dtype.atype.btype{.sat}       d, a{.asel}, b{.bsel};\nvop.dtype.atype.btype{.sat}.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\nvop.dtype.atype.btype{.sat}  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vadd, vsub, vabsdiff, vmin, vmax };\n.dtype = .atype = .btype = { .u32, .s32 };\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.op2   = { .add, .min, .max };\n
\n
\n

Description

\n

Perform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.

\n

Semantics

\n
// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = partSelectSignExtend( a, atype, asel );\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n    case vadd:     tmp = ta + tb;\n    case vsub:     tmp = ta - tb;\n    case vabsdiff: tmp = | ta - tb |;\n    case vmin:     tmp = MIN( ta, tb );\n    case vmax:     tmp = MAX( ta, tb );\n}\n// saturate, taking into account destination type and merge operations\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 2.0.

\n

Target ISA Notes

\n

vadd, vsub, vabsdiff, vmin, vmax require sm_20 or higher.

\n

Examples

\n
vadd.s32.u32.s32.sat      r1, r2.b0, r3.h0;\nvsub.s32.s32.u32.sat      r1, r2.h1, r3.h1;\nvabsdiff.s32.s32.s32.sat  r1.h0, r2.b0, r3.b2, c;\nvmin.s32.s32.s32.sat.add  r1, r2, r3, c;\n
\n
\n
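The vabsdiff form with a secondary .add is the classic sum-of-absolute-differences building block; a sketch with placeholder registers:

vabsdiff.u32.u32.u32.add  r1, r2.b0, r3.b0, r4;  // r1 = |r2.b0 - r3.b0| + r4\n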
", "tooltip": "Integer byte/half-word/word addition/subtraction.\n\nvabsdiff\n\nInteger byte/half-word/word absolute value of difference.\n\nvmin, vmax\n\nInteger byte/half-word/word minimum/maximum.\n\nSyntax\n\n// 32-bit scalar operation, with optional secondary operation\n\nvop.dtype.atype.btype{.sat} d, a{.asel}, b{.bsel};\n\nvop.dtype.atype.btype{.sat}.op2 d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\n\nvop.dtype.atype.btype{.sat} d.dsel, a{.asel}, b{.bsel}, c;\n\n vop = { vadd, vsub, vabsdiff, vmin, vmax };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.dsel = .asel = .bsel = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.op2 = { .add, .min, .max };\n\nDescription\n\nPerform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a, atype, asel );\n\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n\n case vadd: tmp = ta + tb;\n\n case vsub: tmp = ta - tb;\n\n case vabsdiff: tmp = | ta - tb |;\n\n case vmin: tmp = MIN( ta, tb );\n\n case vmax: tmp = MAX( ta, tb );\n\n}\n\n// saturate, taking into account destination type and merge operations\n\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\n\nd = optSecondaryOp( op2, tmp, c ); // optional secondary operation\n\nd = optMerge( dsel, tmp, c ); // optional merge with c operand\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvadd, vsub, vabsdiff, vmin, vmax require sm_20 or higher.\n\nExamples\n\nvadd.s32.u32.s32.sat r1, r2.b0, r3.h0;\n\nvsub.s32.s32.u32.sat r1, r2.h1, r3.h1;\n\nvabsdiff.s32.s32.s32.sat r1.h0, r2.b0, r3.b2, c;\n\nvmin.s32.s32.s32.sat.add r1, r2, r3, c;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax" }; case "vsub2": return { "html": "For more information, visit vsub2 .

SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2

\n\n\n

Integer dual half-word SIMD addition/subtraction.

\n

vavrg2

\n

Integer dual half-word SIMD average.

\n

vabsdiff2

\n

Integer dual half-word SIMD absolute value of difference.

\n

vmin2, vmax2

\n

Integer dual half-word SIMD minimum/maximum.

\n

Syntax

\n
// SIMD instruction with secondary SIMD merge operation\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n   .asel defaults to .h10\n   .bsel defaults to .h32\n
\n
\n

Description

\n

Two-way SIMD parallel arithmetic operation with secondary operation.

\n

Elements of each dual half-word source to the operation are selected from any of the four half-words\nin the two source operands a and b using the asel and bsel modifiers.

\n

The selected half-words are then operated on in parallel.

\n

The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.

\n

For instructions with a secondary SIMD merge operation:

\n
    \n
  • For half-word positions indicated in mask, the selected half-word results are copied into\ndestination d. For all other positions, the corresponding half-word from source operand c\nis copied to d.

  • \n
\n

For instructions with a secondary accumulate operation:

\n
    \n
  • For half-word positions indicated in mask, the selected half-word results are added to operand\nc, producing a result in d.

  • \n
\n

Semantics

\n
// extract pairs of half-words and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_2( a, b, .asel, .atype );\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n    switch ( vop2 ) {\n       case vadd2:             t[i] = Va[i] + Vb[i];\n       case vsub2:             t[i] = Va[i] - Vb[i];\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) >= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n                               }\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i<2; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i<2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.0.

\n

Target ISA Notes

\n

vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.

\n

Examples

\n
vadd2.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n
\n
\n
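A hedged SAD-style sketch (placeholder registers; the default selectors compare a's half-words against b's):

vabsdiff2.u32.u32.u32.add  r1, r2, r3, r4;  // r1 = r4 + |r2.h0 - r3.h2| + |r2.h1 - r3.h3|\n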
", "tooltip": "Integer dual half-word SIMD addition/subtraction.\n\nvavrg2\n\nInteger dual half-word SIMD average.\n\nvabsdiff2\n\nInteger dual half-word SIMD absolute value of difference.\n\nvmin2, vmax2\n\nInteger dual half-word SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop2.dtype.atype.btype{.sat} d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop2.dtype.atype.btype.add d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2 = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask = { .h0, .h1, .h10 }; // defaults to .h10\n\n.asel = .bsel = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n\n .asel defaults to .h10\n\n .bsel defaults to .h32\n\nDescription\n\nTwo-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each dual half-word source to the operation are selected from any of the four half-words\n\nin the two source operands a and b using the asel and bsel modifiers.\n\nThe selected half-words are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor half-word positions indicated in mask, the selected half-word results are copied into\n\ndestination d. For all other positions, the corresponding half-word from source operand c\n\nis copied to d.\n\nFor instructions with a secondary accumulate operation:\n\nFor half-word positions indicated in mask, the selected half-word results are added to operand\n\nc, producing a result in d.\n\nSemantics\n\n// extract pairs of half-words and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_2( a, b, .asel, .atype );\n\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n\n switch ( vop2 ) {\n\n case vadd2: t[i] = Va[i] + Vb[i];\n\n case vsub2: t[i] = Va[i] - Vb[i];\n\n case vavrg2: if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n } else {\n\n t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n }\n\n case vabsdiff2: t[i] = | Va[i] - Vb[i] |;\n\n case vmin2: t[i] = MIN( Va[i], Vb[i] );\n\n case vmax2: t[i] = MAX( Va[i], Vb[i] );\n\n }\n\n if (.sat) {\n\n if ( .dtype == .s32 ) t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n\n else t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n\n }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n d = c;\n\n for (i=0; i<2; i++) { d += mask[i] ? t[i] : 0; }\n\n} else {\n\n d = 0;\n\n for (i=0; i<2; i++) { d |= mask[i] ? t[i] : Vc[i]; }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd2, vsub2, varvg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.\n\nExamples\n\nvadd2.s32.s32.u32.sat r1, r2, r3, r1;\n\nvsub2.s32.s32.s32.sat r1.h0, r2.h10, r3.h32, r1;\n\nvmin2.s32.u32.u32.add r1.h10, r2.h00, r3.h22, r1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2" }; case "vsub4": return { "html": "For more information, visit vsub4 .

SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4

\n\n\n

Integer quad byte SIMD addition/subtraction.

\n

vavrg4

\n

Integer quad byte SIMD average.

\n

vabsdiff4

\n

Integer quad byte SIMD absolute value of difference.

\n

vmin4, vmax4

\n

Integer quad byte SIMD minimum/maximum.

\n

Syntax

\n
// SIMD instruction with secondary SIMD merge operation\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .b0,\n           .b1, .b10,\n           .b2, .b20, .b21, .b210,\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n    defaults to .b3210\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n   .asel defaults to .b3210\n   .bsel defaults to .b7654\n
\n
\n

Description

\n

Four-way SIMD parallel arithmetic operation with secondary operation.

\n

Elements of each quad byte source to the operation are selected from any of the eight bytes in the\ntwo source operands a and b using the asel and bsel modifiers.

\n

The selected bytes are then operated on in parallel.

\n

The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.

\n

For instructions with a secondary SIMD merge operation:

\n
    \n
  • For byte positions indicated in mask, the selected byte results are copied into destination\nd. For all other positions, the corresponding byte from source operand c is copied to\nd.

  • \n
\n

For instructions with a secondary accumulate operation:

\n
    \n
  • For byte positions indicated in mask, the selected byte results are added to operand c,\nproducing a result in d.

  • \n
\n

Semantics

\n
// extract quads of bytes and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_4( a, b, .asel, .atype );\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\nVc = extractAndSignExt_4( c );\nfor (i=0; i<4; i++) {\n    switch ( vop4 ) {\n        case vadd4:            t[i] = Va[i] + Vb[i];\n        case vsub4:            t[i] = Va[i] - Vb[i];\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) >= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n                               }\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i<4; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i<4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n
\n
\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.0.

\n

Target ISA Notes

\n

vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.

\n

Examples

\n
vadd4.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n
\n
\n
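A hedged four-byte SAD sketch (placeholder registers, default selectors):

vabsdiff4.u32.u32.u32.add  r1, r2, r3, r4;  // r1 = r4 + |r2.b0 - r3.b0| + |r2.b1 - r3.b1| + |r2.b2 - r3.b2| + |r2.b3 - r3.b3|\n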
", "tooltip": "Integer quad byte SIMD addition/subtraction.\n\nvavrg4\n\nInteger quad byte SIMD average.\n\nvabsdiff4\n\nInteger quad byte SIMD absolute value of difference.\n\nvmin4, vmax4\n\nInteger quad byte SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop4.dtype.atype.btype{.sat} d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop4.dtype.atype.btype.add d{.mask}, a{.asel}, b{.bsel}, c;\n\nvop4 = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask = { .b0,\n\n .b1, .b10\n\n .b2, .b20, .b21, .b210,\n\n .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n\n defaults to .b3210\n\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n\n .asel defaults to .b3210\n\n .bsel defaults to .b7654\n\nDescription\n\nFour-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each quad byte source to the operation are selected from any of the eight bytes in the\n\ntwo source operands a and b using the asel and bsel modifiers.\n\nThe selected bytes are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor byte positions indicated in mask, the selected byte results are copied into destination\n\nd. For all other positions, the corresponding byte from source operand c is copied to\n\nd.\n\nFor instructions with a secondary accumulate operation:\n\nFor byte positions indicated in mask, the selected byte results are added to operand c,\n\nproducing a result in d.\n\nSemantics\n\n// extract quads of bytes and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_4( a, b, .asel, .atype );\n\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_4( c );\n\nfor (i=0; i<4; i++) {\n\n switch ( vop4 ) {\n\n case vadd4: t[i] = Va[i] + Vb[i];\n\n case vsub4: t[i] = Va[i] - Vb[i];\n\n case vavrg4: if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n } else {\n\n t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n }\n\n case vabsdiff4: t[i] = | Va[i] - Vb[i] |;\n\n case vmin4: t[i] = MIN( Va[i], Vb[i] );\n\n case vmax4: t[i] = MAX( Va[i], Vb[i] );\n\n }\n\n if (.sat) {\n\n if ( .dtype == .s32 ) t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n\n else t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n\n }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n d = c;\n\n for (i=0; i<4; i++) { d += mask[i] ? t[i] : 0; }\n\n} else {\n\n d = 0;\n\n for (i=0; i<4; i++) { d |= mask[i] ? t[i] : Vc[i]; }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd4, vsub4, varvg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.\n\nExamples\n\nvadd4.s32.s32.u32.sat r1, r2, r3, r1;\n\nvsub4.s32.s32.s32.sat r1.b0, r2.b3210, r3.b7654, r1;\n\nvmin4.s32.u32.u32.add r1.b00, r2.b0000, r3.b2222, r1;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4" }; case "warpid": return { "html": "For more information, visit warpid .

Special Registers: %warpid

\n\n\n

Warp identifier.

\n

Syntax (predefined)

\n
.sreg .u32 %warpid;\n
\n
\n

Description

\n

A predefined, read-only special register that returns the thread\u2019s warp identifier. The warp\nidentifier provides a unique warp number within a CTA but not across CTAs within a grid. The warp\nidentifier will be the same for all threads within a single warp.

\n

Note that %warpid is volatile and returns the location of a thread at the moment when read, but\nits value may change during execution, e.g., due to rescheduling of threads following\npreemption. For this reason, %ctaid and %tid should be used to compute a virtual warp index\nif such a value is needed in kernel code; %warpid is intended mainly to enable profiling and\ndiagnostic code to sample and log information such as work place mapping and load distribution.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.3.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
mov.u32  %r, %warpid;\n
\n
\n
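A minimal sketch of the recommended virtual warp index, assuming a one-dimensional CTA and 32-thread warps (register names are placeholders):

mov.u32  %r1, %tid.x;\nshr.u32  %r1, %r1, 5;    // virtual warp index = tid.x / 32; stable across rescheduling, unlike %warpid\n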
", "tooltip": "Warp identifier.\n\nSyntax (predefined)\n\n.sreg .u32 %warpid;\n\nDescription\n\nA predefined, read-only special register that returns the thread\u2019s warp identifier. The warp\n\nidentifier provides a unique warp number within a CTA but not across CTAs within a grid. The warp\n\nidentifier will be the same for all threads within a single warp.\n\nNote that %warpid is volatile and returns the location of a thread at the moment when read, but\n\nits value may change during execution, e.g., due to rescheduling of threads following\n\npreemption. For this reason, %ctaid and %tid should be used to compute a virtual warp index\n\nif such a value is needed in kernel code; %warpid is intended mainly to enable profiling and\n\ndiagnostic code to sample and log information such as work place mapping and load distribution.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u32 %r, %warpid;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-warpid" }; case "weak": return { "html": "For more information, visit weak .

Linking Directives: .weak

\n\n\n

Visible (externally) symbol declaration.

\n

Syntax

\n
.weak identifier\n
\n
\n

Description

\n

Declares identifier to be globally visible but weak. Weak symbols are similar to globally visible\nsymbols, except during linking, weak symbols are only chosen after globally visible symbols during\nsymbol resolution. Unlike globally visible symbols, multiple object files may declare the same weak\nsymbol, and references to a symbol get resolved against a weak symbol only if no global symbols have\nthe same name.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 3.1.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
.weak .func (.reg .b32 val) foo;  // foo will be externally visible\n
\n
\n
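A hedged sketch of the resolution rule across two hypothetical modules:

// module A:  .weak    .func (.reg .b32 rv) init;  // weak default definition\n// module B:  .visible .func (.reg .b32 rv) init;  // if present, chosen over the weak one at link time\n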
", "tooltip": "Visible (externally) symbol declaration.\n\nSyntax\n\n.weak identifier\n\nDescription\n\nDeclares identifier to be globally visible but weak. Weak symbols are similar to globally visible\n\nsymbols, except during linking, weak symbols are only chosen after globally visible symbols during\n\nsymbol resolution. Unlike globally visible symbols, multiple object files may declare the same weak\n\nsymbol, and references to a symbol get resolved against a weak symbol only if no global symbols have\n\nthe same name.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.1.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.weak .func (.reg .b32 val) foo; // foo will be externally visible\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#linking-directives-weak" }; case "xor": return { "html": "For more information, visit xor .

Logic and Shift Instructions: xor

\n\n\n

Bitwise exclusive-OR (inequality).

\n

Syntax

\n
xor.type d, a, b;\n\n.type = { .pred, .b16, .b32, .b64 };\n
\n
\n

Description

\n

Compute the bit-wise exclusive-or operation for the bits in a and b.

\n

Semantics

\n
d = a ^ b;\n
\n
\n

Notes

\n

The size of the operands must match, but not necessarily the type.

\n

Allowed types include predicate registers.

\n

PTX ISA Notes

\n

Introduced in PTX ISA version 1.0.

\n

Target ISA Notes

\n

Supported on all target architectures.

\n

Examples

\n
xor.b32  d,q,r;\nxor.b16  d,x,0x0001;\n
\n
\n
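Because .pred is an allowed type, xor also expresses predicate inequality; a small sketch with placeholder operands:

xor.pred  p, q, r;           // p = (q != r) for predicates\nxor.b32   d, a, 0xffffffff;  // flips every bit of a (bitwise NOT via xor)\n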
", "tooltip": "Bitwise exclusive-OR (inequality).\n\nSyntax\n\nxor.type d, a, b;\n\n.type = { .pred, .b16, .b32, .b64 };\n\nDescription\n\nCompute the bit-wise exclusive-or operation for the bits in a and b.\n\nSemantics\n\nd = a ^ b;\n\nNotes\n\nThe size of the operands must match, but not necessarily the type.\n\nAllowed types include predicate registers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nxor.b32 d,q,r;\n\nxor.b16 d,x,0x0001;\n\n ...", "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-xor" }; } }