lib/asm-docs/generated/asm-docs-ptx.ts


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331

import {AssemblyInstructionInfo} from '../base.js';

export function getAsmOpcode(opcode: string | undefined): AssemblyInstructionInfo | undefined {
    if (!opcode) return;
    switch (opcode) {
        case "abs":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-abs\" target=\"_blank\" rel=\"noopener noreferrer\">abs(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs\" target=\"_blank\" rel=\"noopener noreferrer\">abs(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-abs\" target=\"_blank\" rel=\"noopener noreferrer\">abs(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: abs</h1><section id=\"floating-point-instructions-abs\">\n\n\n<p>Absolute value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>abs{.ftz}.f32  d, a;\nabs.f64        d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Take the absolute value of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and store the result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = |a|;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">abs.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving 
zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">abs.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">abs.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">abs.f32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> input yields unspecified <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>. For <code class=\"docutils literal notranslate\"><span class=\"pre\">abs.f64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> input is passed\nthrough unchanged. Future implementations may comply with the IEEE 754 standard by preserving\npayload and modifying only the sign bit.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">abs.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">abs.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>abs.ftz.f32  x,f0;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: abs</h1><section id=\"half-precision-floating-point-instructions-abs\">\n\n\n<p>Absolute value</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>abs{.ftz}.f16    d, a;\nabs{.ftz}.f16x2  d, a;\nabs.bf16         d, 
a;\nabs.bf16x2       d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Take absolute value of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and store the result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, forms input vector by extracting half word values\nfrom the source operand. Absolute values of half-word operands are then computed in parallel to\nproduce <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> result in destination.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type. 
For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction\ntype, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (type == f16 || type == bf16) {\n    d = |a|;\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    for (i = 0; i &lt; 2; i++) {\n         d[i] = |fA[i]|;\n    }\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<dl class=\"simple\">\n<dt>Subnormal numbers:</dt><dd><p>By default, subnormal numbers are supported.\n<code class=\"docutils literal notranslate\"><span class=\"pre\">abs.ftz.{f16,</span> <span class=\"pre\">f16x2}</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> inputs yield an unspecified <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>. 
Future implementations may comply with the IEEE 754\nstandard by preserving payload and modifying only the sign bit.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.5.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">abs.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">abs.bf16x2</span></code> introduced in PTX ISA 7.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_53</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">abs.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">abs.bf16x2</span></code> requires architecture <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>abs.ftz.f16  x,f0;\nabs.bf16     x,b0;\nabs.bf16x2   x1,b1;\n</pre></div>\n</div>\n</section>\n<h1>Integer Arithmetic Instructions: abs</h1><section id=\"integer-arithmetic-instructions-abs\">\n\n\n<p>Absolute value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>abs.type  d, a;\n\n.type = { .s16, .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Take the absolute value of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and store it in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = |a|;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Only for signed integers.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA 
Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>abs.s32  r0,a;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: abs\n\n\n\nAbsolute value.\n\nSyntax\n\nabs{.ftz}.f32  d, a;\n\nabs.f64        d, a;\n\nDescription\n\nTake the absolute value of a and store the result in d.\n\nSemantics\n\nd = |a|;\n\nNotes\n\nSubnormal numbers:\n\nsm_20+By default, subnormal numbers are supported.\n\nabs.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nsm_1xabs.f64 supports subnormal numbers.\n\nabs.f32 flushes subnormal inputs and results to sign-pres...\n\n=====Half Precision Floating Point Instructions: abs\n\n\n\nAbsolute value\n\nSyntax\n\nabs{.ftz}.f16    d, a;\n\nabs{.ftz}.f16x2  d, a;\n\nabs.bf16         d, a;\n\nabs.bf16x2       d, a;\n\nDescription\n\nTake absolute value of a and store the result in d.\n\nFor .f16x2 and .bf16x2 instruction type, forms input vector by extracting half word values\n\nfrom the source operand. Absolute values of half-word operands are then computed in parallel to\n\nproduce .f16x2 or .bf16x2 result in...\n\n=====Integer Arithmetic Instructions: abs\n\n\n\nAbsolute value.\n\nSyntax\n\nabs.type  d, a;\n\n.type = { .s16, .s32, .s64 };\n\nDescription\n\nTake the absolute value of a and store it in d.\n\nSemantics\n\nd = |a|;\n\nNotes\n\nOnly for signed integers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nabs.s32  r0,a;\n\n... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-abs"
            };

        case "activemask":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask\" target=\"_blank\" rel=\"noopener noreferrer\">activemask <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: activemask</h1><section id=\"parallel-synchronization-and-communication-instructions-activemask\">\n\n\n<p>Queries the active threads within a warp.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>activemask.b32 d;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">activemask</span></code> queries predicated-on active threads from the executing warp and sets the destination\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> with 32-bit integer mask where bit position in the mask corresponds to the thread\u2019s\n<code class=\"docutils literal notranslate\"><span class=\"pre\">laneid</span></code>.</p>\n<p>Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is a 32-bit destination register.</p>\n<p>An active thread will contribute 1 for its entry in the result and exited or inactive or\npredicated-off thread will contribute 0 for its entry in the result.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>activemask.b32  %r1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Queries the active threads within a warp.\n\nSyntax\n\nactivemask.b32 d;\n\nDescription\n\nactivemask queries predicated-on active threads from the executing warp and sets the destination\n\nd with 32-bit integer mask where bit position in the mask corresponds to the thread\u2019s\n\nlaneid.\n\nDestination d is a 32-bit destination register.\n\nAn active thread will contribute 1 for its entry in the result and exited or inactive or\n\npredicated-off thread will contribute 0 for its entry in the result.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.2.\n\nTarget ISA Notes\n\nRequires sm_30 or higher.\n\nExamples\n\nactivemask.b32  %r1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask"
            };

        case "add":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add\" target=\"_blank\" rel=\"noopener noreferrer\">add(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add\" target=\"_blank\" rel=\"noopener noreferrer\">add(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-add\" target=\"_blank\" rel=\"noopener noreferrer\">add(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-add-cc\" target=\"_blank\" rel=\"noopener noreferrer\">add.cc <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: add</h1><section id=\"floating-point-instructions-add\">\n\n\n<p>Add two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>add{.rnd}{.ftz}{.sat}.f32  d, a, b;\nadd{.rnd}.f64              d, a, b;\n\n.rnd = { .rn, .rz, .rm, .rp };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs addition and writes the resulting value into a destination register.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a + b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Rounding modifiers:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span 
class=\"pre\">.rn</span></code></dt><dd><p>mantissa LSB rounds to nearest even</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt><dd><p>mantissa LSB rounds towards zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code></dt><dd><p>mantissa LSB rounds towards negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt><dd><p>mantissa LSB rounds towards positive infinity</p>\n</dd>\n</dl>\n<p>The default value of rounding modifier is <code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>. Note that an <code class=\"docutils literal notranslate\"><span class=\"pre\">add</span></code> instruction with an explicit\nrounding modifier is treated conservatively by the code optimizer. An <code class=\"docutils literal notranslate\"><span class=\"pre\">add</span></code> instruction with no\nrounding modifier defaults to round-to-nearest-even and may be optimized aggressively by the code\noptimizer. 
In particular, <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">add</span></code> sequences with no rounding modifiers may be optimized to\nuse fused-multiply-add instructions on the target device.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>Saturation modifier:</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.sat.f32</span></code> clamps the result to [0.0, 1.0]. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> results are flushed to <code class=\"docutils literal notranslate\"><span class=\"pre\">+0.0f</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>Rounding modifiers have the following target requirements:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt><dd><p>available for all targets</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt><dd><p>for <code class=\"docutils literal notranslate\"><span class=\"pre\">add.f64</span></code>, requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>for <code class=\"docutils literal notranslate\"><span class=\"pre\">add.f32</span></code>, requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n</dd>\n</dl>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>@p  add.rz.ftz.f32  f1,f2,f3;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: add</h1><section id=\"half-precision-floating-point-instructions-add\">\n\n\n<p>Add two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div 
class=\"highlight\"><pre><span></span>add{.rnd}{.ftz}{.sat}.f16   d, a, b;\nadd{.rnd}{.ftz}{.sat}.f16x2 d, a, b;\n\nadd{.rnd}.bf16   d, a, b;\nadd{.rnd}.bf16x2 d, a, b;\n\n.rnd = { .rn };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs addition and writes the resulting value into a destination register.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, forms input vectors by half word values from source\noperands. Half-word operands are then added in parallel to produce <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> result\nin destination.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type. 
For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>\ninstruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type,\noperands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (type == f16 || type == bf16) {\n    d = a + b;\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    for (i = 0; i &lt; 2; i++) {\n         d[i] = fA[i] + fB[i];\n    }\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Rounding modifiers:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt><dd><p>mantissa LSB rounds to nearest even</p>\n</dd>\n</dl>\n<p>The default value of rounding modifier is <code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>. Note that an <code class=\"docutils literal notranslate\"><span class=\"pre\">add</span></code> instruction with an explicit\nrounding modifier is treated conservatively by the code optimizer. 
An <code class=\"docutils literal notranslate\"><span class=\"pre\">add</span></code> instruction with no\nrounding modifier defaults to round-to-nearest-even and may be optimized aggressively by the code\noptimizer. In particular, <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">add</span></code> sequences with no rounding modifiers may be optimized to\nuse fused-multiply-add instructions on the target device.</p>\n<dl class=\"simple\">\n<dt>Subnormal numbers:</dt><dd><p>By default, subnormal numbers are supported.\n<code class=\"docutils literal notranslate\"><span class=\"pre\">add.ftz.{f16,</span> <span class=\"pre\">f16x2}</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt>Saturation modifier:</dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.sat.{f16,</span> <span class=\"pre\">f16x2}</span></code> clamps the result to [0.0, 1.0]. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> results are flushed to <code class=\"docutils literal notranslate\"><span class=\"pre\">+0.0f</span></code>.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 4.2.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add{.rnd}.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">add{.rnd}.bf16x2</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_53</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add{.rnd}.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">add{.rnd}.bf16x2</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// scalar f16 additions\nadd.f16        d0, a0, b0;\nadd.rn.f16     d1, a1, b1;\nadd.bf16       bd0, ba0, bb0;\nadd.rn.bf16    bd1, ba1, bb1;\n\n// SIMD f16 addition\ncvt.rn.f16.f32 h0, f0;\ncvt.rn.f16.f32 h1, f1;\ncvt.rn.f16.f32 h2, f2;\ncvt.rn.f16.f32 h3, f3;\nmov.b32  p1, {h0, h1};   // pack two f16 to 32bit f16x2\nmov.b32  p2, {h2, h3};   // pack two f16 to 32bit f16x2\nadd.f16x2  p3, p1, p2;   // SIMD f16x2 addition\n\n// SIMD bf16 addition\ncvt.rn.bf16x2.f32 p4, f4, f5; // Convert two f32 into packed bf16x2\ncvt.rn.bf16x2.f32 p5, f6, f7; // Convert two f32 into packed bf16x2\nadd.bf16x2  p6, p4, p5;       // SIMD bf16x2 addition\n\n// SIMD fp16 addition\nld.global.b32   f0, [addr];     // load 32 bit which hold packed f16x2\nld.global.b32   f1, [addr + 4]; // load 32 bit which hold packed f16x2\nadd.f16x2       f2, f0, f1;     // SIMD f16x2 addition\n\nld.global.b32   
f3, [addr + 8];  // load 32 bit which hold packed bf16x2\nld.global.b32   f4, [addr + 12]; // load 32 bit which hold packed bf16x2\nadd.bf16x2      f5, f3, f4;      // SIMD bf16x2 addition\n</pre></div>\n</div>\n</section>\n<h1>Integer Arithmetic Instructions: add</h1><section id=\"integer-arithmetic-instructions-add\">\n\n\n<p>Add two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>add.type       d, a, b;\nadd{.sat}.s32  d, a, b;     // .sat applies only to .s32\n\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64,\n          .u16x2, .s16x2 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs addition and writes the resulting value into a destination register.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code> instruction types, forms input vectors by half word values from source\noperands. Half-word operands are then added in parallel to produce <code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code> result in\ndestination.</p>\n<p>Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.type</span></code>. 
For instruction types <code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code>,\noperands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (type == u16x2 || type == s16x2) {\n    iA[0] = a[0:15];\n    iA[1] = a[16:31];\n    iB[0] = b[0:15];\n    iB[1] = b[16:31];\n    for (i = 0; i &lt; 2; i++) {\n         d[i] = iA[i] + iB[i];\n    }\n} else {\n    d = a + b;\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Saturation modifier:</p>\n<dl class=\"simple\">\n<dt>.sat</dt><dd><p>limits result to <code class=\"docutils literal notranslate\"><span class=\"pre\">MININT..MAXINT</span></code> (no overflow) for the size of the operation. 
Applies only to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> type.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.u16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">add.s16x2</span></code> introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">add.u16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">add.s16x2</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>@p  add.u32     x,y,z;\n    add.sat.s32 c,c,1;\n    add.u16x2   u,v,w;\n</pre></div>\n</div>\n</section>\n<h1>Extended-Precision Arithmetic Instructions: add.cc</h1><section id=\"extended-precision-arithmetic-instructions-add-cc\">\n\n\n<p>Add two values with carry-out.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>add.cc.type  d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs integer addition and writes the carry-out value into the condition code register.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a + b;\n</pre></div>\n</div>\n<p>carry-out written to <code class=\"docutils literal notranslate\"><span class=\"pre\">CC.CF</span></code></p>\n<p><strong>Notes</strong></p>\n<p>No integer rounding modifiers.</p>\n<p>No saturation.</p>\n<p>Behavior is the same for unsigned and signed integers.</p>\n<p><strong>PTX ISA 
Notes</strong></p>\n<p>32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">add.cc</span></code> introduced in PTX ISA version 1.2.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">add.cc</span></code> introduced in PTX ISA version 4.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">add.cc</span></code> is supported on all target architectures.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">add.cc</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>@p  add.cc.u32   x1,y1,z1;   // extended-precision addition of\n@p  addc.cc.u32  x2,y2,z2;   // two 128-bit values\n@p  addc.cc.u32  x3,y3,z3;\n@p  addc.u32     x4,y4,z4;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: add\n\n\n\nAdd two values.\n\nSyntax\n\nadd{.rnd}{.ftz}{.sat}.f32  d, a, b;\n\nadd{.rnd}.f64              d, a, b;\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nPerforms addition and writes the resulting value into a destination register.\n\nSemantics\n\nd = a + b;\n\nNotes\n\nRounding modifiers:\n\n.rnmantissa LSB rounds to nearest even\n\n.rzmantissa LSB rounds towards zero\n\n.rmmantissa LSB rounds towards negative infinity\n\n.rpmantissa L...\n\n=====Half Precision Floating Point Instructions: add\n\n\n\nAdd two values.\n\nSyntax\n\nadd{.rnd}{.ftz}{.sat}.f16   d, a, b;\n\nadd{.rnd}{.ftz}{.sat}.f16x2 d, a, b;\n\nadd{.rnd}.bf16   d, a, b;\n\nadd{.rnd}.bf16x2 d, a, b;\n\n.rnd = { .rn };\n\nDescription\n\nPerforms addition and writes the resulting value into a destination register.\n\nFor .f16x2 and .bf16x2 instruction type, forms input vectors by half word values from source\n\noperands. Half-word operands are then added in paral...\n\n=====Integer Arithmetic Instructions: add\n\n\n\nAdd two values.\n\nSyntax\n\nadd.type       d, a, b;\n\nadd{.sat}.s32  d, a, b;     // .sat applies only to .s32\n\n.type = { .u16, .u32, .u64,\n\n          .s16, .s32, .s64,\n\n          .u16x2, .s16x2 };\n\nDescription\n\nPerforms addition and writes the resulting value into a destination register.\n\nFor .u16x2, .s16x2 instruction types, forms input vectors by half word values from source\n\noperands. 
Half-word operands are...\n\n=====Extended-Precision Arithmetic Instructions: add.cc\n\n\n\nAdd two values with carry-out.\n\nSyntax\n\nadd.cc.type  d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n\nDescription\n\nPerforms integer addition and writes the carry-out value into the condition code register.\n\nSemantics\n\nd = a + b;\n\ncarry-out written to CC.CF\n\nNotes\n\nNo integer rounding modifiers.\n\nNo saturation.\n\nBehavior is the same for unsigned and signed integers.\n\nPTX ISA Notes\n\n32-bit add.cc introduced in PTX ... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add"
            };

        case "addc":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-addc\" target=\"_blank\" rel=\"noopener noreferrer\">addc <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Extended-Precision Arithmetic Instructions: addc</h1><section id=\"extended-precision-arithmetic-instructions-addc\">\n\n\n<p>Add two values with carry-in and optional carry-out.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>addc{.cc}.type  d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs integer addition with carry-in and optionally writes the carry-out value into the condition\ncode register.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a + b + CC.CF;\n</pre></div>\n</div>\n<p>if <code class=\"docutils literal notranslate\"><span class=\"pre\">.cc</span></code> specified, carry-out written to <code class=\"docutils literal notranslate\"><span class=\"pre\">CC.CF</span></code></p>\n<p><strong>Notes</strong></p>\n<p>No integer rounding modifiers.</p>\n<p>No saturation.</p>\n<p>Behavior is the same for unsigned and signed integers.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">addc</span></code> introduced in PTX ISA version 1.2.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">addc</span></code> introduced in PTX ISA version 4.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">addc</span></code> is supported on all target architectures.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span 
class=\"pre\">addc</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>@p  add.cc.u32   x1,y1,z1;   // extended-precision addition of\n@p  addc.cc.u32  x2,y2,z2;   // two 128-bit values\n@p  addc.cc.u32  x3,y3,z3;\n@p  addc.u32     x4,y4,z4;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Add two values with carry-in and optional carry-out.\n\nSyntax\n\naddc{.cc}.type  d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n\nDescription\n\nPerforms integer addition with carry-in and optionally writes the carry-out value into the condition\n\ncode register.\n\nSemantics\n\nd = a + b + CC.CF;\n\nif .cc specified, carry-out written to CC.CF\n\nNotes\n\nNo integer rounding modifiers.\n\nNo saturation.\n\nBehavior is the same for unsigned and signed integers.\n\nPTX ISA Notes\n\n32-bit addc introduced in PTX ISA version 1.2.\n\n64-bit addc introduced in PTX ISA version 4.3.\n\nTarget ISA Notes\n\n32-bit addc is supported on all target architectures.\n\n64-bit addc requires sm_20 or higher.\n\nExamples\n\n@p  add.cc.u32   x1,y1,z1;   // extended-precision addition of\n\n@p  addc.cc.u32  x2,y2,z2;   // two 128-bit values\n\n@p  addc.cc.u32  x3,y3,z3;\n\n@p  addc.u32     x4,y4,z4;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-addc"
            };

        case "address_size":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#ptx-module-directives-address-size\" target=\"_blank\" rel=\"noopener noreferrer\">address_size <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>PTX Module Directives: .address_size</h1><section id=\"ptx-module-directives-address-size\">\n\n\n<p>Address size used throughout PTX module.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.address_size  address-size\naddress-size = { 32, 64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Specifies the address size assumed throughout the module by the PTX code and the binary DWARF\ninformation in PTX.</p>\n<p>Redefinition of this directive within a module is not allowed. In the presence of separate\ncompilation all modules must specify (or default to) the same address size.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.address_size</span></code> directive is optional, but it must immediately follow the <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span></code>directive if present within a module.</p>\n<p><strong>Semantics</strong></p>\n<p>If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.address_size</span></code> directive is omitted, the address size defaults to 32.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// example directives\n   .address_size 32       // addresses are 32 bit\n   .address_size 64       // addresses are 64 bit\n\n// example of directive placement within a module\n   .version 2.3\n   
.target sm_20\n   .address_size 64\n...\n.entry foo () {\n...\n}\n</pre></div>\n</div>\n</section>",
                "tooltip": "Address size used throughout PTX module.\n\nSyntax\n\n.address_size  address-size\n\naddress-size = { 32, 64 };\n\nDescription\n\nSpecifies the address size assumed throughout the module by the PTX code and the binary DWARF\n\ninformation in PTX.\n\nRedefinition of this directive within a module is not allowed. In the presence of separate\n\ncompilation all modules must specify (or default to) the same address size.\n\nThe .address_size directive is optional, but it must immediately follow the .targetdirective if present within a module.\n\nSemantics\n\nIf the .address_size directive is omitted, the address size defaults to 32.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n// example directives\n\n   .address_size 32       // addresses are 32 bit\n\n   .address_size 64       // addresses are 64 bit\n\n// example of directive placement within a module\n\n   .version 2.3\n\n   .target sm_20\n\n   .address_size 64\n\n...\n\n.entry foo () {\n\n...\n\n}\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#ptx-module-directives-address-size"
            };

        case "aggr_smem_size":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-aggr-smem-size\" target=\"_blank\" rel=\"noopener noreferrer\">aggr_smem_size <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %aggr_smem_size</h1><section id=\"special-registers-aggr-smem-size\">\n\n\n<p>Total size of shared memory used by a CTA of a kernel.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %aggr_smem_size;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with total aggregated size of shared memory\nconsisting of the size of user shared memory allocated (statically and dynamically) at launch time\nand the size of shared memory region which is reserved for the NVIDIA system software use.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32  %r, %aggr_smem_size;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Total size of shared memory used by a CTA of a kernel.\n\nSyntax (predefined)\n\n.sreg .u32 %aggr_smem_size;\n\nDescription\n\nA predefined, read-only special register initialized with total aggregated size of shared memory\n\nconsisting of the size of user shared memory allocated (statically and dynamically) at launch time\n\nand the size of shared memory region which is reserved for the NVIDIA system software use.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 8.1.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\nmov.u32  %r, %aggr_smem_size;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-aggr-smem-size"
            };

        case "alias":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#kernel-and-function-directives-alias\" target=\"_blank\" rel=\"noopener noreferrer\">alias <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Kernel and Function Directives: .alias</h1><section id=\"kernel-and-function-directives-alias\">\n\n\n<p>Define an alias to existing function symbol.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.alias fAlias, fAliasee;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.alias</span></code> is a module scope directive that defines identifier <code class=\"docutils literal notranslate\"><span class=\"pre\">fAlias</span></code> to be an alias to function\nspecified by <code class=\"docutils literal notranslate\"><span class=\"pre\">fAliasee</span></code>.</p>\n<p>Both <code class=\"docutils literal notranslate\"><span class=\"pre\">fAlias</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">fAliasee</span></code> are non-entry function symbols.</p>\n<p>Identifier <code class=\"docutils literal notranslate\"><span class=\"pre\">fAlias</span></code> is a function declaration without body.</p>\n<p>Identifier <code class=\"docutils literal notranslate\"><span class=\"pre\">fAliasee</span></code> is a function symbol which must be defined in the same module as <code class=\"docutils literal notranslate\"><span class=\"pre\">.alias</span></code>\ndeclaration. 
Function <code class=\"docutils literal notranslate\"><span class=\"pre\">fAliasee</span></code> cannot have <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> linkage.</p>\n<p>Prototype of <code class=\"docutils literal notranslate\"><span class=\"pre\">fAlias</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">fAliasee</span></code> must match.</p>\n<p>Program can use either <code class=\"docutils literal notranslate\"><span class=\"pre\">fAlias</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">fAlisee</span></code> identifiers to reference function defined with\n<code class=\"docutils literal notranslate\"><span class=\"pre\">fAliasee</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.alias</span></code> directive introduced in PTX ISA 6.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.alias</span></code> directive requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.visible .func foo(.param .u32 p) {\n   ...\n}\n.visible .func bar(.param .u32 p);\n.alias bar, foo;\n.entry test()\n{\n      .param .u32 p;\n      ...\n      call foo, (p);       // call foo directly\n       ...\n       .param .u32 p;\n       call bar, (p);        // call foo through alias\n}\n.entry filter ( .param .b32 x, .param .b32 y, .param .b32 z )\n{\n    .reg .b32 %r1, %r2, %r3;\n    ld.param.b32  %r1, [x];\n    ld.param.b32  %r2, [y];\n    ld.param.b32  %r3, [z];\n    ...\n}\n</pre></div>\n</div>\n</section>",
                "tooltip": "Define an alias to existing function symbol.\n\nSyntax\n\n.alias fAlias, fAliasee;\n\nDescription\n\n.alias is a module scope directive that defines identifier fAlias to be an alias to function\n\nspecified by fAliasee.\n\nBoth fAlias and fAliasee are non-entry function symbols.\n\nIdentifier fAlias is a function declaration without body.\n\nIdentifier fAliasee is a function symbol which must be defined in the same module as .alias\n\ndeclaration. Function fAliasee cannot have .weak linkage.\n\nPrototype of fAlias and fAliasee must match.\n\nProgram can use either fAlias or fAlisee identifiers to reference function defined with\n\nfAliasee.\n\nPTX ISA Notes\n\n.alias directive introduced in PTX ISA 6.3.\n\nTarget ISA Notes\n\n.alias directive requires sm_30 or higher.\n\nExamples\n\n.visible .func foo(.param .u32 p) {\n\n   ...\n\n}\n\n.visible .func bar(.param .u32 p);\n\n.alias bar, foo;\n\n.entry test()\n\n{\n\n      .param .u32 p;\n\n      ...\n\n      call foo, (p);       // call foo directly\n\n       ...\n\n       .param .u32 p;\n\n       call bar, (p);        // call foo through alias\n\n}\n\n.entry filter ( .param .b32 x, .param .b32 y, .param .b32 z )\n\n{\n\n    .reg .b32 %r1, %r2, %r3;\n\n    ld.param.b32  %r1, [x];\n\n    ld.param.b32  %r2, [y];\n\n    ld.param.b32  %r3, [z];\n\n    ...\n\n}\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#kernel-and-function-directives-alias"
            };

        case "alloca":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca\" target=\"_blank\" rel=\"noopener noreferrer\">alloca <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Stack Manipulation Instructions: alloca</h1><section id=\"stack-manipulation-instructions-alloca\">\n\n\n<p>Dynamically allocate memory on stack.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>alloca.type  ptr, size{, immAlign};\n\n.type = { .u32, .u64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">alloca</span></code> instruction dynamically allocates memory on the stack frame of the current function\nand updates the stack pointer accordingly. The returned pointer <code class=\"docutils literal notranslate\"><span class=\"pre\">ptr</span></code> points to local memory and\ncan be used in the address operand of <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.local</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">st.local</span></code> instructions.</p>\n<p>If sufficient memory is unavailable for allocation on the stack, then execution of <code class=\"docutils literal notranslate\"><span class=\"pre\">alloca</span></code> may\nresult in stack overflow. 
In such cases, attempting to access the allocated memory with <code class=\"docutils literal notranslate\"><span class=\"pre\">ptr</span></code> will\nresult in undefined program behavior.</p>\n<p>The memory allocated by <code class=\"docutils literal notranslate\"><span class=\"pre\">alloca</span></code> is deallocated in the following ways:</p>\n<ul class=\"simple\">\n<li><p>It is automatically deallocated when the function exits.</p></li>\n<li><p>It can be explicitly deallocated using <code class=\"docutils literal notranslate\"><span class=\"pre\">stacksave</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">stackrestore</span></code> instructions:\n<code class=\"docutils literal notranslate\"><span class=\"pre\">stacksave</span></code> can be used to save the value of stack pointer before executing <code class=\"docutils literal notranslate\"><span class=\"pre\">alloca</span></code>, and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">stackrestore</span></code> can be used after <code class=\"docutils literal notranslate\"><span class=\"pre\">alloca</span></code> to restore stack pointer to the original value which\nwas previously saved with <code class=\"docutils literal notranslate\"><span class=\"pre\">stacksave</span></code>. Note that accessing deallocated memory after executing\n<code class=\"docutils literal notranslate\"><span class=\"pre\">stackrestore</span></code> results in undefined behavior.</p></li>\n</ul>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> is an unsigned value which specifies the amount of memory in number of bytes to be\nallocated on stack. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">size</span> <span class=\"pre\">=</span> <span class=\"pre\">0</span></code> may not lead to a valid memory allocation.</p>\n<p>Both <code class=\"docutils literal notranslate\"><span class=\"pre\">ptr</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> have the same type as the instruction type.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">immAlign</span></code> is a 32-bit value which specifies the alignment requirement in number of bytes for the\nmemory allocated by <code class=\"docutils literal notranslate\"><span class=\"pre\">alloca</span></code>. It is an integer constant, must be a power of 2 and must not exceed\n2^23. <code class=\"docutils literal notranslate\"><span class=\"pre\">immAlign</span></code> is an optional argument with default value being 8 which is the minimum\nguaranteed alignment.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>alloca.type ptr, size, immAlign:\n\na = max(immAlign, frame_align); // frame_align is the minimum guaranteed alignment\n\n// Allocate size bytes of stack memory with alignment a and update the stack pointer.\n// Since the stack grows down, the updated stack pointer contains a lower address.\nstackptr = alloc_stack_mem(size, a);\n\n// Return the new value of stack pointer as ptr. Since ptr is the lowest address of the memory\n// allocated by alloca, the memory can be accessed using ptr up to (ptr + size of allocated memory).\nstacksave ptr;\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.3.</p>\n<dl class=\"simple\">\n<dt>Preview Feature:</dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">alloca</span></code> is a preview feature in PTX ISA version 7.3. 
All details are subject to change with no\nguarantees of backward compatibility on future PTX ISA versions or SM architectures.</p>\n</dd>\n</dl>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">alloca</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_52</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .u32 ra, stackptr, ptr, size;\n\nstacksave.u32 stackptr;     // Save the current stack pointer\nalloca ptr, size, 8;        // Allocate stack memory\nst.local.u32 [ptr], ra;     // Use the allocated stack memory\nstackrestore.u32 stackptr;  // Deallocate memory by restoring the stack pointer\n</pre></div>\n</div>\n</section>",
                "tooltip": "Dynamically allocate memory on stack.\n\nSyntax\n\nalloca.type  ptr, size{, immAlign};\n\n.type = { .u32, .u64 };\n\nDescription\n\nThe alloca instruction dynamically allocates memory on the stack frame of the current function\n\nand updates the stack pointer accordingly. The returned pointer ptr points to local memory and\n\ncan be used in the address operand of ld.local and st.local instructions.\n\nIf sufficient memory is unavailable for allocation on the stack, then execution of alloca may\n\nresult in stack overflow. In such cases, attempting to access the allocated memory with ptr will\n\nresult in undefined program behavior.\n\nThe memory allocated by alloca is deallocated in the following ways:\n\nIt is automatically deallocated when the function exits.\n\nIt can be explicitly deallocated using stacksave and stackrestore instructions:\n\nstacksave can be used to save the value of stack pointer before executing alloca, and\n\nstackrestore can be used after alloca to restore stack pointer to the original value which\n\nwas previously saved with stacksave. Note that accessing deallocated memory after executing\n\nstackrestore results in undefined behavior.\n\nsize is an unsigned value which specifies the amount of memory in number of bytes to be\n\nallocated on stack. size = 0 may not lead to a valid memory allocation.\n\nBoth ptr and size have the same type as the instruction type.\n\nimmAlign is a 32-bit value which specifies the alignment requirement in number of bytes for the\n\nmemory allocated by alloca. It is an integer constant, must be a power of 2 and must not exceed\n\n2^23. 
immAlign is an optional argument with default value being 8 which is the minimum\n\nguaranteed alignment.\n\nSemantics\n\nalloca.type ptr, size, immAlign:\n\na = max(immAlign, frame_align); // frame_align is the minimum guaranteed alignment\n\n// Allocate size bytes of stack memory with alignment a and update the stack pointer.\n\n// Since the stack grows down, the updated stack pointer contains a lower address.\n\nstackptr = alloc_stack_mem(size, a);\n\n// Return the new value of stack pointer as ptr. Since ptr is the lowest address of the memory\n\n// allocated by alloca, the memory can be accessed using ptr up to (ptr + size of allocated memory).\n\nstacksave ptr;\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.3.\n\nPreview Feature:alloca is a preview feature in PTX ISA version 7.3. All details are subject to change with no\n\nguarantees of backward compatibility on future PTX ISA versions or SM architectures.\n\nTarget ISA Notes\n\nalloca requires sm_52 or higher.\n\nExamples\n\n.reg .u32 ra, stackptr, ptr, size;\n\nstacksave.u32 stackptr;     // Save the current stack pointer\n\nalloca ptr, size, 8;        // Allocate stack memory\n\nst.local.u32 [ptr], ra;     // Use the allocated stack memory\n\nstackrestore.u32 stackptr;  // Deallocate memory by restoring the stack pointer\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca"
            };

        case "and":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and\" target=\"_blank\" rel=\"noopener noreferrer\">and <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Logic and Shift Instructions: and</h1><section id=\"logic-and-shift-instructions-and\">\n\n\n<p>Bitwise AND.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>and.type d, a, b;\n\n.type = { .pred, .b16, .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute the bit-wise and operation for the bits in <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a &amp; b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>The size of the operands must match, but not necessarily the type.</p>\n<p>Allowed types include predicate registers.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>and.b32  x,q,r;\nand.b32  sign,fpvalue,0x80000000;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Bitwise AND.\n\nSyntax\n\nand.type d, a, b;\n\n.type = { .pred, .b16, .b32, .b64 };\n\nDescription\n\nCompute the bit-wise and operation for the bits in a and b.\n\nSemantics\n\nd = a & b;\n\nNotes\n\nThe size of the operands must match, but not necessarily the type.\n\nAllowed types include predicate registers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nand.b32  x,q,r;\n\nand.b32  sign,fpvalue,0x80000000;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-and"
            };

        case "applypriority":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority\" target=\"_blank\" rel=\"noopener noreferrer\">applypriority <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: applypriority</h1><section id=\"data-movement-and-conversion-instructions-applypriority\">\n\n\n<p>Apply the cache eviction priority to the specified address in the specified cache level.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>appplypriority{.global}.level::eviction_priority  [a], size;\n\n.level::eviction_priority = { .L2::evict_normal };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">applypriority</span></code> instruction applies the cache eviction priority specified by the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> qualifier to the address range <code class=\"docutils literal notranslate\"><span class=\"pre\">[a..a+size)</span></code> in the specified cache\nlevel.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the specified address does not fall within the address window of <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space\nthen the behavior is undefined.</p>\n<p>The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> is an integer constant that specifies the amount of data, in bytes, in the\nspecified cache level on which the priority is to be applied. 
The only supported value for the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> operand is 128.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> must be aligned to 128 bytes.</p>\n<p>If the data pointed to by address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is not already present in the specified cache level, then\nthe data will be prefetched before applying the specified priority.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.4.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>applypriority.global.L2::evict_normal [ptr], 128;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Apply the cache eviction priority to the specified address in the specified cache level.\n\nSyntax\n\nappplypriority{.global}.level::eviction_priority  [a], size;\n\n.level::eviction_priority = { .L2::evict_normal };\n\nDescription\n\nThe applypriority instruction applies the cache eviction priority specified by the\n\n.level::eviction_priority qualifier to the address range [a..a+size) in the specified cache\n\nlevel.\n\nIf no state space is specified then Generic Addressing is\n\nused. If the specified address does not fall within the address window of .global state space\n\nthen the behavior is undefined.\n\nThe operand size is an integer constant that specifies the amount of data, in bytes, in the\n\nspecified cache level on which the priority is to be applied. The only supported value for the\n\nsize operand is 128.\n\nSupported addressing modes for operand a are described in Addresses as Operands. a must be aligned to 128 bytes.\n\nIf the data pointed to by address a is not already present in the specified cache level, then\n\nthe data will be prefetched before applying the specified priority.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.4.\n\nTarget ISA Notes\n\nRequires sm_80 or higher.\n\nExamples\n\napplypriority.global.L2::evict_normal [ptr], 128;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-applypriority"
            };

        case "atom":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom\" target=\"_blank\" rel=\"noopener noreferrer\">atom <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: atom</h1><section id=\"parallel-synchronization-and-communication-instructions-atom\">\n\n\n<p>Atomic reduction operations for thread-to-thread communication.</p>\n<p><strong>Syntax</strong></p>\n<p>Atomic operation with scalar type:</p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>atom{.sem}{.scope}{.space}.op{.level::cache_hint}.type d, [a], b{, cache-policy};\natom{.sem}{.scope}{.space}.op.type d, [a], b, c;\n\natom{.sem}{.scope}{.space}.cas.b16 d, [a], b, c;\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16     d, [a], b{, cache-policy};\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16x2   d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.bf16    d, [a], b{, cache-policy};\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.bf16x2  d, [a], b{, cache-policy};\n\n.space =              { .global, .shared{::cta, ::cluster} };\n.sem =                { .relaxed, .acquire, .release, .acq_rel };\n.scope =              { .cta, .cluster, .gpu, .sys };\n\n.op =                 { .and, .or, .xor,\n                        .cas, .exch,\n                        .add, .inc, .dec,\n                        .min, .max };\n.level::cache_hint =  { .L2::cache_hint };\n.type =               { .b32, .b64, .u32, .u64, .s32, .s64, .f32, .f64 };\n</pre></div>\n</div>\n<p>Atomic operation with vector type:</p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>atom{.sem}{.scope}{.global}.add{.level::cache_hint}.vec_32_bit.f32          
        d, [a], b{, cache-policy};\natom{.sem}{.scope}{.global}.op.noftz{.level::cache_hint}.vec_16_bit.half_word_type  d, [a], b{, cache-policy};\natom{.sem}{.scope}{.global}.op.noftz{.level::cache_hint}.vec_32_bit.packed_type     d, [a], b{, cache-policy};\n\n.sem =               { .relaxed, .acquire, .release, .acq_rel };\n.scope =             { .cta, .gpu, .sys };\n.op =                { .add, .min, .max };\n.half_word_type =    { .f16, .bf16 };\n.packed_type =       { .f16x2, .bf16x2 };\n.vec_16_bit =        { .v2, .v4, .v8 }\n.vec_32_bit =        { .v2, .v4 };\n.level::cache_hint = { .L2::cache_hint }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Atomically loads the original value at location <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> into destination register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, performs a\nreduction operation with operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> and the value in location <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, and stores the result of the\nspecified operation at location <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, overwriting the original value. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> specifies a\nlocation in the specified state space. If no state space is given, perform the memory accesses using\n<a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a>. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> with scalar type may be used only\nwith <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> spaces and with generic addressing, where the address points to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> space. <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> with vector type may be used only with <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> space\nand with generic addressing where the address points to <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> space.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> with vector type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> are brace-enclosed vector expressions, size\nof which is equal to the size of vector qualifier.</p>\n<p>If no sub-qualifier is specified with <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state space, then <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> is assumed by default.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier specifies a memory synchronizing effect as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory\nConsistency Model</a>. 
If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier is absent,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code> is assumed by default.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier specifies the set of threads that can directly observe the memory\nsynchronizing effect of this operation, as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is absent, <code class=\"docutils literal notranslate\"><span class=\"pre\">.gpu</span></code> scope is\nassumed by default.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> with vector type, the supported combinations of vector qualifier and types, and atomic\noperations supported on these combinations are depicted in the following table:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 19%\"/>\n<col style=\"width: 32%\"/>\n<col style=\"width: 32%\"/>\n<col style=\"width: 16%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\" rowspan=\"2\"><p>Vector qualifier</p></th>\n<th class=\"head\" colspan=\"3\"><p>Types</p></th>\n</tr>\n<tr class=\"row-even\"><th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>/ <code class=\"docutils literal notranslate\"><span class=\"pre\">bf16</span></code></p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>/ <code class=\"docutils literal notranslate\"><span class=\"pre\">bf16x2</span></code></p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code></p></th>\n</tr>\n</thead>\n<tbody>\n<tr 
class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.v2</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.v4</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.v8</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p>Not supported</p></td>\n<td><p>Not 
Supported</p></td>\n</tr>\n</tbody>\n</table>\n<p>Two atomic operations {<code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code>} are performed atomically with respect to each other only\nif each operation specifies a scope that includes the other. When this condition is not met, each\noperation observes the other operation being performed as if it were split into a read followed by a\ndependent write.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> instruction on packed type or vector type, accesses adjacent scalar elements in memory. In\nsuch cases, the atomicity is guaranteed separately for each of the individual scalar elements; the\nentire <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> is not guaranteed to be atomic as a single access.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_6x</span></code> and earlier architectures, <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> operations on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state space do not\nguarantee atomicity with respect to normal store instructions to the same address. 
It is the\nprogrammer\u2019s responsibility to guarantee correctness of programs that use shared memory atomic\ninstructions, e.g., by inserting barriers between normal stores and atomic operations to a common\naddress, or by using atom.exch to store to locations accessed by other atomic operations.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses\nas Operands</a></p>\n<p>The bit-size operations are <code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.cas</span></code> (compare-and-swap), and <code class=\"docutils literal notranslate\"><span class=\"pre\">.exch</span></code>\n(exchange).</p>\n<p>The integer operations are <code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code>. The <code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code> operations return a result in the range <code class=\"docutils literal notranslate\"><span class=\"pre\">[0..b]</span></code>.</p>\n<p>The floating-point operation <code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code> operation rounds to nearest even. 
Current implementation of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.f32</span></code> on global memory flushes subnormal inputs and results to sign-preserving zero;\nwhereas <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.f32</span></code> on shared memory supports subnormal inputs and results and doesn\u2019t flush\nthem to zero.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.bf16x2</span></code> operation requires\nthe <code class=\"docutils literal notranslate\"><span class=\"pre\">.noftz</span></code> qualifier; it preserves subnormal inputs and results, and does not flush them to\nzero.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. 
The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is only supported for <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space and for generic\naddressing where the address points to the <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>atomic {\n    d = *a;\n    *a = (operation == cas) ? operation(*a, b, c)\n                            : operation(*a, b);\n}\nwhere\n    inc(r, s)  = (r &gt;= s) ? 0 : r+1;\n    dec(r, s)  = (r==0 || r &gt; s)  ? s : r-1;\n    exch(r, s) =  s;\n    cas(r,s,t) = (r == s) ? 
t : r;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Simple reductions may be specified by using the <em>bit bucket</em> destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">_</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>32-bit atom.global introduced in PTX ISA version 1.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.shared</span></code> and 64-bit<code class=\"docutils literal notranslate\"><span class=\"pre\">atom.global.{add,cas,exch}</span></code> introduced in PTX ISA 1.2.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.f32</span></code> and 64-bit<code class=\"docutils literal notranslate\"><span class=\"pre\">atom.shared.{add,cas,exch}</span></code> introduced in PTX ISA 2.0.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.{and,or,xor,min,max}</span></code> introduced in PTX ISA 3.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.f64</span></code> introduced in PTX ISA 5.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier introduced in PTX ISA 5.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier introduced in PTX ISA version 6.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.noftz.f16x2</span></code> introduced in PTX ISA 6.2.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.noftz.f16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.cas.b16</span></code> introduced in PTX ISA 6.3.</p>\n<p>Per-element atomicity of <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.f16x2</span></code> clarified in PTX ISA version 6.3, with retrospective effect\nfrom PTX ISA version 6.2.</p>\n<p>Support for <code class=\"docutils literal 
notranslate\"><span class=\"pre\">.level::cache_hint</span></code> qualifier introduced in PTX ISA version 7.4.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.noftz.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.noftz.bf16x2</span></code> introduced in PTX ISA 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope qualifier introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> sub-qualifiers introduced in PTX ISA version 7.8.</p>\n<p>Support for vector types introduced in PTX ISA version 8.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.global</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_11</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.shared</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_12</span></code> or higher.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.global.{add,cas,exch}</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_12</span></code> or higher.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.shared.{add,cas,exch}</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.{and,or,xor,min,max}</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_32</span></code> or higher.</p>\n<p><code class=\"docutils literal 
notranslate\"><span class=\"pre\">atom.add.f32</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_60</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_60</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p>Use of generic addressing requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.noftz.f16x2</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_60</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.noftz.f16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.cas.b16</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.noftz.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.add.noftz.bf16x2</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or 
higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for vector types requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>atom.global.add.s32  d,[a],1;\natom.shared::cta.max.u32  d,[x+4],0;\n@p  atom.global.cas.b32  d,[p],my_val,my_new_val;\natom.global.sys.add.u32 d, [a], 1;\natom.global.acquire.sys.inc.u32 ans, [gbl], %r0;\natom.add.noftz.f16x2 d, [a], b;\natom.add.noftz.f16   hd, [ha], hb;\natom.global.cas.b16  hd, [ha], hb, hc;\natom.add.noftz.bf16   hd, [a], hb;\natom.add.noftz.bf16x2 bd, [b], bb;\natom.add.shared::cluster.noftz.f16   hd, [ha], hb;\n\natom.global.cluster.relaxed.add.u32 d, [a], 1;\n\ncreatepolicy.fractional.L2::evict_last.b64 cache-policy, 0.25;\natom.global.add.L2::cache_hint.s32  d, [a], 1, cache-policy;\n\natom.global.v8.f16.max.noftz  {%hd0, %hd1, %hd2, %hd3, %hd4, %hd5, %hd6, %hd7}, [gbl],\n                                              {%h0, %h1, %h2, %h3, %h4, %h5, %h6, %h7};\natom.global.v8.bf16.add.noftz  {%hd0, %hd1, %hd2, %hd3, %hd4, %hd5, %hd6, %hd7}, [gbl],\n                                              {%h0, %h1, %h2, %h3, %h4, %h5, %h6, %h7};\natom.global.v2.f16.add.noftz  {%hd0, %hd1}, [gbl], {%h0, %h1};\natom.global.v2.bf16.add.noftz  
{%hd0, %hd1}, [gbl], {%h0, %h1};\natom.global.v4.b16x2.min.noftz  {%hd0, %hd1, %hd2, %hd3}, [gbl], {%h0, %h1, %h2, %h3};\natom.global.v4.f32.add  {%f0, %f1, %f2, %f3}, [gbl], {%f0, %f1, %f2, %f3};\natom.global.v2.f16x2.min.noftz  {%bd0, %bd1}, [g], {%b0, %b1};\natom.global.v2.bf16x2.max.noftz  {%bd0, %bd1}, [g], {%b0, %b1};\natom.global.v2.f32.add  {%f0, %f1}, [g], {%f0, %f1};\n</pre></div>\n</div>\n</section>",
                "tooltip": "Atomic reduction operations for thread-to-thread communication.\n\nSyntax\n\nAtomic operation with scalar type:\n\natom{.sem}{.scope}{.space}.op{.level::cache_hint}.type d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.space}.op.type d, [a], b, c;\n\natom{.sem}{.scope}{.space}.cas.b16 d, [a], b, c;\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16     d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16x2   d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.bf16    d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.bf16x2  d, [a], b{, cache-policy};\n\n.space =              { .global, .shared{::cta, ::cluster} };\n\n.sem =                { .relaxed, .acquire, .release, .acq_rel };\n\n.scope =              { .cta, .cluster, .gpu, .sys };\n\n.op =                 { .and, .or, .xor,\n\n                        .cas, .exch,\n\n                        .add, .inc, .dec,\n\n                        .min, .max };\n\n.level::cache_hint =  { .L2::cache_hint };\n\n.type =               { .b32, .b64, .u32, .u64, .s32, .s64, .f32, .f64 };\n\nAtomic operation with vector type:\n\natom{.sem}{.scope}{.global}.add{.level::cache_hint}.vec_32_bit.f32                  d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.global}.op.noftz{.level::cache_hint}.vec_16_bit.half_word_type  d, [a], b{, cache-policy};\n\natom{.sem}{.scope}{.global}.op.noftz{.level::cache_hint}.vec_32_bit.packed_type     d, [a], b{, cache-policy};\n\n.sem =               { .relaxed, .acquire, .release, .acq_rel };\n\n.scope =             { .cta, .gpu, .sys };\n\n.op =                { .add, .min, .max };\n\n.half_word_type =    { .f16, .bf16 };\n\n.packed_type =       { .f16x2, .bf16x2 };\n\n.vec_16_bit =        { .v2, .v4, .v8 }\n\n.vec_32_bit =        { .v2, .v4 };\n\n.level::cache_hint = { .L2::cache_hint }\n\nDescription\n\nAtomically loads the original value at 
location a into destination register d, performs a\n\nreduction operation with operand b and the value in location a, and stores the result of the\n\nspecified operation at location a, overwriting the original value. Operand a specifies a\n\nlocation in the specified state space. If no state space is given, perform the memory accesses using\n\nGeneric Addressing. atom with scalar type may be used only\n\nwith .global and .shared spaces and with generic addressing, where the address points to\n\n.global or .shared space. atom with vector type may be used only with .global space\n\nand with generic addressing where the address points to .global space.\n\nFor atom with vector type, operands d and b are brace-enclosed vector expressions, size\n\nof which is equal to the size of vector qualifier.\n\nIf no sub-qualifier is specified with .shared state space, then ::cta is assumed by default.\n\nThe optional .sem qualifier specifies a memory synchronizing effect as described in the Memory\n\nConsistency Model. If the .sem qualifier is absent,\n\n.relaxed is assumed by default.\n\nThe optional .scope qualifier specifies the set of threads that can directly observe the memory\n\nsynchronizing effect of this operation, as described in the Memory Consistency Model. If the .scope qualifier is absent, .gpu scope is\n\nassumed by default.\n\nFor atom with vector type, the supported combinations of vector qualifier and types, and atomic\n\noperations supported on these combinations are depicted in the following table:\n\n\n\n\n\nVector qualifier\n\nTypes\n\n.f16/ bf16\n\n.f16x2/ bf16x2\n\n.f32\n\n.v2\n\n.add, .min, .max\n\n.add, .min, .max\n\n.add\n\n.v4\n\n.add, .min, .max\n\n.add, .min, .max\n\n.add\n\n.v8\n\n.add, .min, .max\n\nNot supported\n\nNot Supported\n\nTwo atomic operations {atom or red} are performed atomically with respect to each other only\n\nif each operation specifies a scope that includes the other. 
When this condition is not met, each\n\noperation observes the other operation being performed as if it were split into a read followed by a\n\ndependent write.\n\natom instruction on packed type or vector type, accesses adjacent scalar elements in memory. In\n\nsuch cases, the atomicity is guaranteed separately fo ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom"
            };

        case "bar":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier\" target=\"_blank\" rel=\"noopener noreferrer\">bar <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync\" target=\"_blank\" rel=\"noopener noreferrer\">bar.warp.sync <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: bar, barrier</h1><section id=\"parallel-synchronization-and-communication-instructions-bar-barrier\">\n<span id=\"parallel-synchronization-and-communication-instructions-bar\"></span>\n\n<p>Barrier synchronization.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>barrier{.cta}.sync{.aligned}      a{, b};\nbarrier{.cta}.arrive{.aligned}    a, b;\n\nbarrier{.cta}.red.popc{.aligned}.u32  d, a{, b}, {!}c;\nbarrier{.cta}.red.op{.aligned}.pred   p, a{, b}, {!}c;\n\nbar{.cta}.sync      a{, b};\nbar{.cta}.arrive    a, b;\n\nbar{.cta}.red.popc.u32  d, a{, b}, {!}c;\nbar{.cta}.red.op.pred   p, a{, b}, {!}c;\n\n.op = { .and, .or };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs barrier synchronization and communication within a CTA. 
Each CTA instance has sixteen\nbarriers numbered <code class=\"docutils literal notranslate\"><span class=\"pre\">0..15</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instructions can be used by the threads within the CTA for synchronization and\ncommunication.</p>\n<p>Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>; operands <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> are predicates. Source\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> specifies a logical barrier resource as an immediate constant or register with value\n<code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code> through <code class=\"docutils literal notranslate\"><span class=\"pre\">15</span></code>. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> specifies the number of threads participating in the barrier. If\nno thread count is specified, all threads in the CTA participate in the barrier. When specifying a\nthread count, the value must be a multiple of the warp size. 
Note that a non-zero thread count is\nrequired for <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code>.</p>\n<p>Depending on operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, either specified number of threads (in multiple of warp size) or all\nthreads in the CTA participate in <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction. The <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instructions\nsignal the arrival of the executing threads at the named barrier.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction causes executing thread to wait for all non-exited threads from its\nwarp and marks warps\u2019 arrival at barrier. In addition to signaling its arrival at the barrier, the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> instructions causes executing thread to wait for\nnon-exited threads of all other warps participating in the barrier to\narrive. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> does not cause executing thread to wait for threads of other\nparticipating warps.</p>\n<p>When a barrier completes, the waiting threads are restarted without delay, and the barrier is\nreinitialized so that it can be immediately reused.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> instruction\nguarantees that when the barrier completes, prior memory accesses requested by this thread are\nperformed relative to all threads participating in the barrier. The <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> instruction further guarantees that no new memory access is requested by this\nthread before the barrier completes.</p>\n<p>A memory read (e.g., by <code class=\"docutils literal notranslate\"><span class=\"pre\">ld</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code>) has been performed when the value read has been\ntransmitted from memory and cannot be modified by another thread participating in the barrier. 
A\nmemory write (e.g., by <code class=\"docutils literal notranslate\"><span class=\"pre\">st</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code>) has been performed when the value written has\nbecome visible to other threads participating in the barrier, that is, when the previous value can\nno longer be read.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> performs a reduction operation across threads. The <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> predicate (or its\ncomplement) from all threads in the CTA are combined using the specified reduction operator. Once\nthe barrier count is reached, the final value is written to the destination register in all threads\nwaiting at the barrier.</p>\n<p>The reduction operations for <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> are population-count (<code class=\"docutils literal notranslate\"><span class=\"pre\">.popc</span></code>),\nall-threads-True (<code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>), and any-thread-True (<code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>). 
The result of <code class=\"docutils literal notranslate\"><span class=\"pre\">.popc</span></code> is the number of\nthreads with a <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> predicate, while <code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code> indicate if all the threads had a\n<code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> predicate or if any of the threads had a <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> predicate.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> has optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code> modifier. When specified, it indicates that\nall threads in CTA will execute the same <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction. In conditionally executed\ncode, an aligned <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction should only be used if it is known that all threads\nin CTA evaluate the condition identically, otherwise behavior is undefined.</p>\n<p>Different warps may execute different forms of the <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction using the same\nbarrier name and thread count. One example mixes <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code>\nto implement producer/consumer models. 
The producer threads execute <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> to\nannounce their arrival at the barrier and continue execution without delay to produce the next\nvalue, while the consumer threads execute the <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> to wait for a resource to be\nproduced. The roles are then reversed, using a different barrier, where the producer threads execute\na <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> to wait for a resource to consumed, while the consumer threads announce\nthat the resource has been consumed with <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code>. Care must be taken to keep a warp\nfrom executing more <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instructions than intended (<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> followed\nby any other <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction to the same barrier) prior to the reset of the\nbarrier. <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> should not be intermixed with <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> using the same active barrier. 
Execution in this case is unpredictable.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code> qualifier simply indicates CTA-level applicability of the barrier and it\ndoesn\u2019t change the semantics of the instruction.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.sync</span></code> is equivalent to <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync.aligned</span></code>. <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.arrive</span></code> is\nequivalent to <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive.aligned</span></code>. <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.red</span></code> is equivalent to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red.aligned</span></code>.</p>\n<div class=\"admonition note\">\n<p class=\"admonition-title\">Note</p>\n<p>For .target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_6x</span></code> or below,</p>\n<ol class=\"arabic simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction without <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code> modifier is equivalent to <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code>\nvariant and has the same restrictions as of <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code> variant.</p></li>\n<li><p>All threads in warp (except for those have exited) must execute <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction\nin convergence.</p></li>\n</ol>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bar.sync</span></code> without a 
thread count introduced in PTX ISA version 1.0.</p>\n<p>Register operands, thread count, and <code class=\"docutils literal notranslate\"><span class=\"pre\">bar.{arrive,red}</span></code> introduced in PTX ISA version 2.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier</span></code> instruction introduced in PTX ISA version 6.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code> qualifier introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Register operands, thread count, and <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.{arrive,red}</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Only <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.sync</span></code> with an immediate barrier number is supported for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code> targets.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// Use bar.sync to arrive at a pre-computed barrier number and\n// wait for all threads in CTA to also arrive:\n    st.shared [r0],r1;  // write my result to shared memory\n    bar.cta.sync  1;    // arrive, wait for others to arrive\n    ld.shared r2,[r3];  // use shared results from other threads\n\n// Use bar.sync to arrive at a pre-computed barrier number and\n// wait for fixed number of cooperating threads to arrive:\n    #define CNT1 (8*12) // Number of cooperating threads\n\n    st.shared [r0],r1;     // write my result to shared memory\n    bar.cta.sync  1, CNT1; // arrive, wait for others to 
arrive\n    ld.shared r2,[r3];     // use shared results from other threads\n\n// Use bar.red.and to compare results across the entire CTA:\n    setp.eq.u32 p,r1,r2;         // p is True if r1==r2\n    bar.cta.red.and.pred r3,1,p; // r3=AND(p) forall threads in CTA\n\n// Use bar.red.popc to compute the size of a group of threads\n// that have a specific condition True:\n    setp.eq.u32 p,r1,r2;         // p is True if r1==r2\n    bar.cta.red.popc.u32 r3,1,p; // r3=SUM(p) forall threads in CTA\n\n/* Producer/consumer model. The producer deposits a value in\n * shared memory, signals that it is complete but does not wait\n * using bar.arrive, and begins fetching more data from memory.\n * Once the data returns from memory, the producer must wait\n * until the consumer signals that it has read the value from\n * the shared memory location. In the meantime, a consumer\n * thread waits until the data is stored by the producer, reads\n * it, and then signals that it is done (without waiting).\n */\n    // Producer code places produced value in shared memory.\n    st.shared   [r0],r1;\n    bar.arrive  0,64;\n    ld.global   r1,[r2];\n    bar.sync    1,64;\n    ...\n\n    // Consumer code, reads value from shared memory\n    bar.sync   0,64;\n    ld.shared  r1,[r0];\n    bar.arrive 1,64;\n    ...\n\n    // Examples of barrier.cta.sync\n    st.shared         [r0],r1;\n    barrier.cta.sync  0;\n    ld.shared         r1, [r0];\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: bar.warp.sync</h1><section id=\"parallel-synchronization-and-communication-instructions-bar-warp-sync\">\n\n\n<p>Barrier synchronization for threads in a warp.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>bar.warp.sync      membermask;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span 
class=\"pre\">bar.warp.sync</span></code> will cause executing thread to wait until all threads corresponding to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> have executed a <code class=\"docutils literal notranslate\"><span class=\"pre\">bar.warp.sync</span></code> with the same <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> value before resuming\nexecution.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> specifies a 32-bit integer which is a mask indicating threads participating\nin barrier where the bit position corresponds to thread\u2019s <code class=\"docutils literal notranslate\"><span class=\"pre\">laneid</span></code>.</p>\n<p>The behavior of <code class=\"docutils literal notranslate\"><span class=\"pre\">bar.warp.sync</span></code> is undefined if the executing thread is not in the <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bar.warp.sync</span></code> also guarantee memory ordering among threads participating in barrier. 
Thus,\nthreads within warp that wish to communicate via memory can store to memory, execute\n<code class=\"docutils literal notranslate\"><span class=\"pre\">bar.warp.sync</span></code>, and then safely read values stored by other threads in warp.</p>\n<div class=\"admonition note\">\n<p class=\"admonition-title\">Note</p>\n<p>For .target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_6x</span></code> or below, all threads in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> must execute the same\n<code class=\"docutils literal notranslate\"><span class=\"pre\">bar.warp.sync</span></code> instruction in convergence, and only threads belonging to some <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>\ncan be active when the <code class=\"docutils literal notranslate\"><span class=\"pre\">bar.warp.sync</span></code> instruction is executed. Otherwise, the behavior is\nundefined.</p>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>st.shared.u32 [r0],r1;         // write my result to shared memory\nbar.warp.sync  0xffffffff;     // arrive, wait for others to arrive\nld.shared.u32 r2,[r3];         // read results written by other threads\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Parallel Synchronization and Communication Instructions: bar, barrier\n\n\n\nBarrier synchronization.\n\nSyntax\n\nbarrier{.cta}.sync{.aligned}      a{, b};\n\nbarrier{.cta}.arrive{.aligned}    a, b;\n\nbarrier{.cta}.red.popc{.aligned}.u32  d, a{, b}, {!}c;\n\nbarrier{.cta}.red.op{.aligned}.pred   p, a{, b}, {!}c;\n\nbar{.cta}.sync      a{, b};\n\nbar{.cta}.arrive    a, b;\n\nbar{.cta}.red.popc.u32  d, a{, b}, {!}c;\n\nbar{.cta}.red.op.pred   p, a{, b}, {!}c;\n\n.op = { .and, .or };\n\nDescription\n\nPerform...\n\n=====Parallel Synchronization and Communication Instructions: bar.warp.sync\n\n\n\nBarrier synchronization for threads in a warp.\n\nSyntax\n\nbar.warp.sync      membermask;\n\nDescription\n\nbar.warp.sync will cause executing thread to wait until all threads corresponding to\n\nmembermask have executed a bar.warp.sync with the same membermask value before resuming\n\nexecution.\n\nOperand membermask specifies a 32-bit integer which is a mask indicating threads participating\n\nin barrier where the bit... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier"
            };

        case "barrier":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier\" target=\"_blank\" rel=\"noopener noreferrer\">barrier <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster\" target=\"_blank\" rel=\"noopener noreferrer\">barrier.cluster <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: bar, barrier</h1><section id=\"parallel-synchronization-and-communication-instructions-bar-barrier\">\n<span id=\"parallel-synchronization-and-communication-instructions-bar\"></span>\n\n<p>Barrier synchronization.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>barrier{.cta}.sync{.aligned}      a{, b};\nbarrier{.cta}.arrive{.aligned}    a, b;\n\nbarrier{.cta}.red.popc{.aligned}.u32  d, a{, b}, {!}c;\nbarrier{.cta}.red.op{.aligned}.pred   p, a{, b}, {!}c;\n\nbar{.cta}.sync      a{, b};\nbar{.cta}.arrive    a, b;\n\nbar{.cta}.red.popc.u32  d, a{, b}, {!}c;\nbar{.cta}.red.op.pred   p, a{, b}, {!}c;\n\n.op = { .and, .or };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs barrier synchronization and communication within a CTA. 
Each CTA instance has sixteen\nbarriers numbered <code class=\"docutils literal notranslate\"><span class=\"pre\">0..15</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instructions can be used by the threads within the CTA for synchronization and\ncommunication.</p>\n<p>Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>; operands <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> are predicates. Source\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> specifies a logical barrier resource as an immediate constant or register with value\n<code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code> through <code class=\"docutils literal notranslate\"><span class=\"pre\">15</span></code>. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> specifies the number of threads participating in the barrier. If\nno thread count is specified, all threads in the CTA participate in the barrier. When specifying a\nthread count, the value must be a multiple of the warp size. 
Note that a non-zero thread count is\nrequired for <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code>.</p>\n<p>Depending on operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, either specified number of threads (in multiple of warp size) or all\nthreads in the CTA participate in <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction. The <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instructions\nsignal the arrival of the executing threads at the named barrier.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction causes executing thread to wait for all non-exited threads from its\nwarp and marks warps\u2019 arrival at barrier. In addition to signaling its arrival at the barrier, the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> instructions causes executing thread to wait for\nnon-exited threads of all other warps participating in the barrier to\narrive. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> does not cause executing thread to wait for threads of other\nparticipating warps.</p>\n<p>When a barrier completes, the waiting threads are restarted without delay, and the barrier is\nreinitialized so that it can be immediately reused.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> instruction\nguarantees that when the barrier completes, prior memory accesses requested by this thread are\nperformed relative to all threads participating in the barrier. The <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> instruction further guarantees that no new memory access is requested by this\nthread before the barrier completes.</p>\n<p>A memory read (e.g., by <code class=\"docutils literal notranslate\"><span class=\"pre\">ld</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code>) has been performed when the value read has been\ntransmitted from memory and cannot be modified by another thread participating in the barrier. 
A\nmemory write (e.g., by <code class=\"docutils literal notranslate\"><span class=\"pre\">st</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code>) has been performed when the value written has\nbecome visible to other threads participating in the barrier, that is, when the previous value can\nno longer be read.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> performs a reduction operation across threads. The <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> predicate (or its\ncomplement) from all threads in the CTA are combined using the specified reduction operator. Once\nthe barrier count is reached, the final value is written to the destination register in all threads\nwaiting at the barrier.</p>\n<p>The reduction operations for <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> are population-count (<code class=\"docutils literal notranslate\"><span class=\"pre\">.popc</span></code>),\nall-threads-True (<code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>), and any-thread-True (<code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>). 
The result of <code class=\"docutils literal notranslate\"><span class=\"pre\">.popc</span></code> is the number of\nthreads with a <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> predicate, while <code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code> indicate if all the threads had a\n<code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> predicate or if any of the threads had a <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> predicate.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> has optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code> modifier. When specified, it indicates that\nall threads in CTA will execute the same <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction. In conditionally executed\ncode, an aligned <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction should only be used if it is known that all threads\nin CTA evaluate the condition identically, otherwise behavior is undefined.</p>\n<p>Different warps may execute different forms of the <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction using the same\nbarrier name and thread count. One example mixes <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code>\nto implement producer/consumer models. 
The producer threads execute <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> to\nannounce their arrival at the barrier and continue execution without delay to produce the next\nvalue, while the consumer threads execute the <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> to wait for a resource to be\nproduced. The roles are then reversed, using a different barrier, where the producer threads execute\na <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> to wait for a resource to consumed, while the consumer threads announce\nthat the resource has been consumed with <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code>. Care must be taken to keep a warp\nfrom executing more <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instructions than intended (<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> followed\nby any other <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction to the same barrier) prior to the reset of the\nbarrier. <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red</span></code> should not be intermixed with <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync</span></code> or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive</span></code> using the same active barrier. 
Execution in this case is unpredictable.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code> qualifier simply indicates CTA-level applicability of the barrier and it\ndoesn\u2019t change the semantics of the instruction.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.sync</span></code> is equivalent to <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.sync.aligned</span></code>. <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.arrive</span></code> is\nequivalent to <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.arrive.aligned</span></code>. <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.red</span></code> is equivalent to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}.red.aligned</span></code>.</p>\n<div class=\"admonition note\">\n<p class=\"admonition-title\">Note</p>\n<p>For .target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_6x</span></code> or below,</p>\n<ol class=\"arabic simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction without <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code> modifier is equivalent to <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code>\nvariant and has the same restrictions as of <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code> variant.</p></li>\n<li><p>All threads in warp (except for those have exited) must execute <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction\nin convergence.</p></li>\n</ol>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bar.sync</span></code> without a 
thread count introduced in PTX ISA version 1.0.</p>\n<p>Register operands, thread count, and <code class=\"docutils literal notranslate\"><span class=\"pre\">bar.{arrive,red}</span></code> introduced in PTX ISA version 2.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier</span></code> instruction introduced in PTX ISA version 6.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code> qualifier introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Register operands, thread count, and <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.{arrive,red}</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Only <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}.sync</span></code> with an immediate barrier number is supported for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code> targets.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instruction requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// Use bar.sync to arrive at a pre-computed barrier number and\n// wait for all threads in CTA to also arrive:\n    st.shared [r0],r1;  // write my result to shared memory\n    bar.cta.sync  1;    // arrive, wait for others to arrive\n    ld.shared r2,[r3];  // use shared results from other threads\n\n// Use bar.sync to arrive at a pre-computed barrier number and\n// wait for fixed number of cooperating threads to arrive:\n    #define CNT1 (8*12) // Number of cooperating threads\n\n    st.shared [r0],r1;     // write my result to shared memory\n    bar.cta.sync  1, CNT1; // arrive, wait for others to 
arrive\n    ld.shared r2,[r3];     // use shared results from other threads\n\n// Use bar.red.and to compare results across the entire CTA:\n    setp.eq.u32 p,r1,r2;         // p is True if r1==r2\n    bar.cta.red.and.pred r3,1,p; // r3=AND(p) forall threads in CTA\n\n// Use bar.red.popc to compute the size of a group of threads\n// that have a specific condition True:\n    setp.eq.u32 p,r1,r2;         // p is True if r1==r2\n    bar.cta.red.popc.u32 r3,1,p; // r3=SUM(p) forall threads in CTA\n\n/* Producer/consumer model. The producer deposits a value in\n * shared memory, signals that it is complete but does not wait\n * using bar.arrive, and begins fetching more data from memory.\n * Once the data returns from memory, the producer must wait\n * until the consumer signals that it has read the value from\n * the shared memory location. In the meantime, a consumer\n * thread waits until the data is stored by the producer, reads\n * it, and then signals that it is done (without waiting).\n */\n    // Producer code places produced value in shared memory.\n    st.shared   [r0],r1;\n    bar.arrive  0,64;\n    ld.global   r1,[r2];\n    bar.sync    1,64;\n    ...\n\n    // Consumer code, reads value from shared memory\n    bar.sync   0,64;\n    ld.shared  r1,[r0];\n    bar.arrive 1,64;\n    ...\n\n    // Examples of barrier.cta.sync\n    st.shared         [r0],r1;\n    barrier.cta.sync  0;\n    ld.shared         r1, [r0];\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: barrier.cluster</h1><section id=\"parallel-synchronization-and-communication-instructions-barrier-cluster\">\n\n\n<p>Barrier synchronization within a cluster.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>barrier.cluster.arrive{.sem}{.aligned};\nbarrier.cluster.wait{.acquire}{.aligned};\n\n.sem = {.release, 
.relaxed}\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs barrier synchronization and communication within a cluster.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster</span></code> instructions can be used by the threads within the cluster for synchronization\nand communication.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code> instruction marks warps\u2019 arrival at barrier without causing executing\nthread to wait for threads of other participating warps.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.wait</span></code> instruction causes the executing thread to wait for all non-exited threads\nof the cluster to perform <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code>.</p>\n<p>In addition, <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster</span></code> instructions cause the executing thread to wait for all non-exited\nthreads from its warp.</p>\n<p>When all non-exited threads that executed <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code> have executed\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.wait</span></code>, the barrier completes and is reinitialized so it can be reused\nimmediately. 
Each thread must arrive at the barrier only once before the barrier completes.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.wait</span></code> instruction guarantees that when it completes the execution, memory\naccesses (except asynchronous operations) requested, in program order, prior to the preceding\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code> by all threads in the cluster are complete and visible to the executing\nthread.</p>\n<p>There is no memory ordering and visibility guarantee for memory accesses requested by the executing\nthread, in program order, after <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code> and prior to <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.wait</span></code>.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code> qualifier on <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code> specifies that there are no memory\nordering and visibility guarantees provided for the memory accesses performed prior to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code>.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code> qualifiers on instructions <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.wait</span></code> specify the memory synchronization as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency\nModel</a>. 
If the optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier is absent for\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.arrive</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> is assumed by default. If the optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code>\nqualifier is absent for <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster.wait</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code> is assumed by default.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code> qualifier indicates that all threads in the warp must execute the same\n<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster</span></code> instruction. In conditionally executed code, an aligned <code class=\"docutils literal notranslate\"><span class=\"pre\">barrier.cluster</span></code>\ninstruction should only be used if it is known that all threads in the warp evaluate the condition\nidentically, otherwise behavior is undefined.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> qualifiers introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// use of arrive followed by 
wait\nld.shared::cluster.u32 r0, [addr];\nbarrier.cluster.arrive.aligned;\n...\nbarrier.cluster.wait.aligned;\nst.shared::cluster.u32 [addr], r1;\n\n// use memory fence prior to arrive for relaxed barrier\n@cta0 ld.shared::cluster.u32 r0, [addr];\nfence.cluster.acq_rel;\nbarrier.cluster.arrive.relaxed.aligned;\n...\nbarrier.cluster.wait.aligned;\n@cta1 st.shared::cluster.u32 [addr], r1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Parallel Synchronization and Communication Instructions: bar, barrier\n\n\n\nBarrier synchronization.\n\nSyntax\n\nbarrier{.cta}.sync{.aligned}      a{, b};\n\nbarrier{.cta}.arrive{.aligned}    a, b;\n\nbarrier{.cta}.red.popc{.aligned}.u32  d, a{, b}, {!}c;\n\nbarrier{.cta}.red.op{.aligned}.pred   p, a{, b}, {!}c;\n\nbar{.cta}.sync      a{, b};\n\nbar{.cta}.arrive    a, b;\n\nbar{.cta}.red.popc.u32  d, a{, b}, {!}c;\n\nbar{.cta}.red.op.pred   p, a{, b}, {!}c;\n\n.op = { .and, .or };\n\nDescription\n\nPerform...\n\n=====Parallel Synchronization and Communication Instructions: barrier.cluster\n\n\n\nBarrier synchronization within a cluster.\n\nSyntax\n\nbarrier.cluster.arrive{.sem}{.aligned};\n\nbarrier.cluster.wait{.acquire}{.aligned};\n\n.sem = {.release, .relaxed}\n\nDescription\n\nPerforms barrier synchronization and communication within a cluster.\n\nbarrier.cluster instructions can be used by the threads within the cluster for synchronization\n\nand communication.\n\nbarrier.cluster.arrive instruction marks warps... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier"
            };

        case "bfe":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe\" target=\"_blank\" rel=\"noopener noreferrer\">bfe(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: bfe</h1><section id=\"integer-arithmetic-instructions-bfe\">\n\n\n<p>Bit Field Extract.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>bfe.type  d, a, b, c;\n\n.type = { .u32, .u64,\n          .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Extract bit field from <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and place the zero or sign-extended result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. Source <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> gives\nthe bit field starting bit position, and source <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> gives the bit field length in bits.</p>\n<p>Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> have the same type as the instruction type. 
Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> are\ntype <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, but are restricted to the 8-bit value range <code class=\"docutils literal notranslate\"><span class=\"pre\">0..255</span></code>.</p>\n<p>The sign bit of the extracted field is defined as:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>:</dt><dd><p>zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s64</span></code>:</dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">msb</span></code> of input a if the extracted field extends beyond the <code class=\"docutils literal notranslate\"><span class=\"pre\">msb</span></code> of a <code class=\"docutils literal notranslate\"><span class=\"pre\">msb</span></code> of extracted\nfield, otherwise</p>\n</dd>\n</dl>\n<p>If the bit field length is zero, the result is zero.</p>\n<p>The destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is padded with the sign bit of the extracted field. If the start position is\nbeyond the <code class=\"docutils literal notranslate\"><span class=\"pre\">msb</span></code> of the input, the destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is filled with the replicated sign bit of the\nextracted field.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>msb = (.type==.u32 || .type==.s32) ? 
31 : 63;\npos = b &amp; 0xff;  // pos restricted to 0..255 range\nlen = c &amp; 0xff;  // len restricted to 0..255 range\n\nif (.type==.u32 || .type==.u64 || len==0)\n    sbit = 0;\nelse\n    sbit = a[min(pos+len-1,msb)];\n\nd = 0;\nfor (i=0; i&lt;=msb; i++) {\n    d[i] = (i&lt;len &amp;&amp; pos+i&lt;=msb) ? a[pos+i] : sbit;\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bfe</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>bfe.b32  d,a,start,len;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Bit Field Extract.\n\nSyntax\n\nbfe.type  d, a, b, c;\n\n.type = { .u32, .u64,\n\n          .s32, .s64 };\n\nDescription\n\nExtract bit field from a and place the zero or sign-extended result in d. Source b gives\n\nthe bit field starting bit position, and source c gives the bit field length in bits.\n\nOperands a and d have the same type as the instruction type. Operands b and c are\n\ntype .u32, but are restricted to the 8-bit value range 0..255.\n\nThe sign bit of the extracted field is defined as:\n\n.u32, .u64:zero\n\n.s32, .s64:msb of input a if the extracted field extends beyond the msb of a msb of extracted\n\nfield, otherwise\n\nIf the bit field length is zero, the result is zero.\n\nThe destination d is padded with the sign bit of the extracted field. If the start position is\n\nbeyond the msb of the input, the destination d is filled with the replicated sign bit of the\n\nextracted field.\n\nSemantics\n\nmsb = (.type==.u32 || .type==.s32) ? 31 : 63;\n\npos = b & 0xff;  // pos restricted to 0..255 range\n\nlen = c & 0xff;  // len restricted to 0..255 range\n\nif (.type==.u32 || .type==.u64 || len==0)\n\n    sbit = 0;\n\nelse\n\n    sbit = a[min(pos+len-1,msb)];\n\nd = 0;\n\nfor (i=0; i<=msb; i++) {\n\n    d[i] = (i<len && pos+i<=msb) ? a[pos+i] : sbit;\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nbfe requires sm_20 or higher.\n\nExamples\n\nbfe.b32  d,a,start,len;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe"
            };

        case "bfi":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi\" target=\"_blank\" rel=\"noopener noreferrer\">bfi(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: bfi</h1><section id=\"integer-arithmetic-instructions-bfi\">\n\n\n<p>Bit Field Insert.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>bfi.type  f, a, b, c, d;\n\n.type = { .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Align and insert a bit field from <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> into <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, and place the result in <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code>. Source <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>\ngives the starting bit position for the insertion, and source <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> gives the bit field length in\nbits.</p>\n<p>Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> have the same type as the instruction type. 
Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> are type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, but are restricted to the 8-bit value range <code class=\"docutils literal notranslate\"><span class=\"pre\">0..255</span></code>.</p>\n<p>If the bit field length is zero, the result is <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>.</p>\n<p>If the start position is beyond the msb of the input, the result is <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>msb = (.type==.b32) ? 31 : 63;\npos = c &amp; 0xff;  // pos restricted to 0..255 range\nlen = d &amp; 0xff;  // len restricted to 0..255 range\n\nf = b;\nfor (i=0; i&lt;len &amp;&amp; pos+i&lt;=msb; i++) {\n    f[pos+i] = a[i];\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bfi</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>bfi.b32  d,a,b,start,len;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Bit Field Insert.\n\nSyntax\n\nbfi.type  f, a, b, c, d;\n\n.type = { .b32, .b64 };\n\nDescription\n\nAlign and insert a bit field from a into b, and place the result in f. Source c\n\ngives the starting bit position for the insertion, and source d gives the bit field length in\n\nbits.\n\nOperands a, b, and f have the same type as the instruction type. Operands c and\n\nd are type .u32, but are restricted to the 8-bit value range 0..255.\n\nIf the bit field length is zero, the result is b.\n\nIf the start position is beyond the msb of the input, the result is b.\n\nSemantics\n\nmsb = (.type==.b32) ? 31 : 63;\n\npos = c & 0xff;  // pos restricted to 0..255 range\n\nlen = d & 0xff;  // len restricted to 0..255 range\n\nf = b;\n\nfor (i=0; i<len && pos+i<=msb; i++) {\n\n    f[pos+i] = a[i];\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nbfi requires sm_20 or higher.\n\nExamples\n\nbfi.b32  d,a,b,start,len;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi"
            };

        case "bfind":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind\" target=\"_blank\" rel=\"noopener noreferrer\">bfind(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: bfind</h1><section id=\"integer-arithmetic-instructions-bfind\">\n\n\n<p>Find most significant non-sign bit.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>bfind.type           d, a;\nbfind.shiftamt.type  d, a;\n\n.type = { .u32, .u64,\n          .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Find the bit position of the most significant non-sign bit in <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and place the result in\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> has the instruction type, and destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>. For unsigned\nintegers, <code class=\"docutils literal notranslate\"><span class=\"pre\">bfind</span></code> returns the bit position of the most significant <code class=\"docutils literal notranslate\"><span class=\"pre\">1</span></code>. 
For signed integers,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">bfind</span></code> returns the bit position of the most significant <code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code> for negative inputs and the most\nsignificant <code class=\"docutils literal notranslate\"><span class=\"pre\">1</span></code> for non-negative inputs.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.shiftamt</span></code> is specified, <code class=\"docutils literal notranslate\"><span class=\"pre\">bfind</span></code> returns the shift amount needed to left-shift the found bit\ninto the most-significant bit position.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bfind</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">0xffffffff</span></code> if no non-sign bit is found.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>msb = (.type==.u32 || .type==.s32) ? 31 : 63;\n// negate negative signed inputs\nif ( (.type==.s32 || .type==.s64) &amp;&amp; (a &amp; (1&lt;&lt;msb)) ) {\n    a = ~a;\n}\n.u32  d = 0xffffffff;\nfor (.s32 i=msb; i&gt;=0; i--) {\n    if (a &amp; (1&lt;&lt;i))  { d = i; break; }\n}\nif (.shiftamt &amp;&amp; d != 0xffffffff)  { d = msb - d; }\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bfind</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>bfind.u32  d, a;\nbfind.shiftamt.s64  cnt, X;  // cnt is .u32\n</pre></div>\n</div>\n</section>",
                "tooltip": "Find most significant non-sign bit.\n\nSyntax\n\nbfind.type           d, a;\n\nbfind.shiftamt.type  d, a;\n\n.type = { .u32, .u64,\n\n          .s32, .s64 };\n\nDescription\n\nFind the bit position of the most significant non-sign bit in a and place the result in\n\nd. Operand a has the instruction type, and destination d has type .u32. For unsigned\n\nintegers, bfind returns the bit position of the most significant 1. For signed integers,\n\nbfind returns the bit position of the most significant 0 for negative inputs and the most\n\nsignificant 1 for non-negative inputs.\n\nIf .shiftamt is specified, bfind returns the shift amount needed to left-shift the found bit\n\ninto the most-significant bit position.\n\nbfind returns 0xffffffff if no non-sign bit is found.\n\nSemantics\n\nmsb = (.type==.u32 || .type==.s32) ? 31 : 63;\n\n// negate negative signed inputs\n\nif ( (.type==.s32 || .type==.s64) && (a & (1<<msb)) ) {\n\n    a = ~a;\n\n}\n\n.u32  d = 0xffffffff;\n\nfor (.s32 i=msb; i>=0; i--) {\n\n    if (a & (1<<i))  { d = i; break; }\n\n}\n\nif (.shiftamt && d != 0xffffffff)  { d = msb - d; }\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nbfind requires sm_20 or higher.\n\nExamples\n\nbfind.u32  d, a;\n\nbfind.shiftamt.s64  cnt, X;  // cnt is .u32\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind"
            };

        case "bmsk":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bmsk\" target=\"_blank\" rel=\"noopener noreferrer\">bmsk(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: bmsk</h1><section id=\"integer-arithmetic-instructions-bmsk\">\n\n\n<p>Bit Field Mask.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>bmsk.mode.b32  d, a, b;\n\n.mode = { .clamp, .wrap };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Generates a 32-bit mask starting from the bit position specified in operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, and of the width\nspecified in operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>. The generated bitmask is stored in the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>The resulting bitmask is 0 in the following cases:</p>\n<ul class=\"simple\">\n<li><p>When the value of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is 32 or higher and <code class=\"docutils literal notranslate\"><span class=\"pre\">.mode</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">.clamp</span></code>.</p></li>\n<li><p>When either the specified value of <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> or the wrapped value of <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> (when <code class=\"docutils literal notranslate\"><span class=\"pre\">.mode</span></code> is\nspecified as <code class=\"docutils literal notranslate\"><span class=\"pre\">.wrap</span></code>) is 
0.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>a1    = a &amp; 0x1f;\nmask0 = (~0) &lt;&lt; a1;\nb1    = b &amp; 0x1f;\nsum   = a1 + b1;\nmask1 = (~0) &lt;&lt; sum;\n\nsum-overflow          = sum &gt;= 32 ? true : false;\nbit-position-overflow = false;\nbit-width-overflow    = false;\n\nif (.mode == .clamp) {\n    if (a &gt;= 32) {\n        bit-position-overflow = true;\n        mask0 = 0;\n    }\n    if (b &gt;= 32) {\n        bit-width-overflow = true;\n    }\n}\n\nif (sum-overflow || bit-position-overflow || bit-width-overflow) {\n    mask1 = 0;\n} else if (b1 == 0) {\n    mask1 = ~0;\n}\nd = mask0 &amp; ~mask1;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>The bitmask width specified by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is limited to range <code class=\"docutils literal notranslate\"><span class=\"pre\">0..32</span></code> in <code class=\"docutils literal notranslate\"><span class=\"pre\">.clamp</span></code> mode and to\nrange <code class=\"docutils literal notranslate\"><span class=\"pre\">0..31</span></code> in <code class=\"docutils literal notranslate\"><span class=\"pre\">.wrap</span></code> mode.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.6.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bmsk</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>bmsk.clamp.b32  rd, ra, rb;\nbmsk.wrap.b32   rd, 1, 2; // Creates a bitmask of 0x00000006.\n</pre></div>\n</div>\n</section>",
                "tooltip": "Bit Field Mask.\n\nSyntax\n\nbmsk.mode.b32  d, a, b;\n\n.mode = { .clamp, .wrap };\n\nDescription\n\nGenerates a 32-bit mask starting from the bit position specified in operand a, and of the width\n\nspecified in operand b. The generated bitmask is stored in the destination operand d.\n\nThe resulting bitmask is 0 in the following cases:\n\nWhen the value of a is 32 or higher and .mode is .clamp.\n\nWhen either the specified value of b or the wrapped value of b (when .mode is\n\nspecified as .wrap) is 0.\n\nSemantics\n\na1    = a & 0x1f;\n\nmask0 = (~0) << a1;\n\nb1    = b & 0x1f;\n\nsum   = a1 + b1;\n\nmask1 = (~0) << sum;\n\nsum-overflow          = sum >= 32 ? true : false;\n\nbit-position-overflow = false;\n\nbit-width-overflow    = false;\n\nif (.mode == .clamp) {\n\n    if (a >= 32) {\n\n        bit-position-overflow = true;\n\n        mask0 = 0;\n\n    }\n\n    if (b >= 32) {\n\n        bit-width-overflow = true;\n\n    }\n\n}\n\nif (sum-overflow || bit-position-overflow || bit-width-overflow) {\n\n    mask1 = 0;\n\n} else if (b1 == 0) {\n\n    mask1 = ~0;\n\n}\n\nd = mask0 & ~mask1;\n\nNotes\n\nThe bitmask width specified by operand b is limited to range 0..32 in .clamp mode and to\n\nrange 0..31 in .wrap mode.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.6.\n\nTarget ISA Notes\n\nbmsk requires sm_70 or higher.\n\nExamples\n\nbmsk.clamp.b32  rd, ra, rb;\n\nbmsk.wrap.b32   rd, 1, 2; // Creates a bitmask of 0x00000006.\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bmsk"
            };

        case "bra":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra\" target=\"_blank\" rel=\"noopener noreferrer\">bra <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Control Flow Instructions: bra</h1><section id=\"control-flow-instructions-bra\">\n\n\n<p>Branch to a target and continue execution there.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>@p   bra{.uni}  tgt;           // tgt is a label\n     bra{.uni}  tgt;           // unconditional branch\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Continue execution at the target. Conditional branches are specified by using a guard predicate. The\nbranch target must be a label.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">bra.uni</span></code> is guaranteed to be non-divergent, i.e. all active threads in a warp that are currently\nexecuting this instruction have identical values for the guard predicate and branch target.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (p) {\n    pc = tgt;\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p>Unimplemented indirect branch introduced in PTX ISA version 2.1 has been removed from the spec.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>bra.uni  L_exit;    // uniform unconditional jump\n@q  bra      L23;   // conditional branch\n</pre></div>\n</div>\n</section>",
                "tooltip": "Branch to a target and continue execution there.\n\nSyntax\n\n@p   bra{.uni}  tgt;           // tgt is a label\n\n     bra{.uni}  tgt;           // unconditional branch\n\nDescription\n\nContinue execution at the target. Conditional branches are specified by using a guard predicate. The\n\nbranch target must be a label.\n\nbra.uni is guaranteed to be non-divergent, i.e. all active threads in a warp that are currently\n\nexecuting this instruction have identical values for the guard predicate and branch target.\n\nSemantics\n\nif (p) {\n\n    pc = tgt;\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nUnimplemented indirect branch introduced in PTX ISA version 2.1 has been removed from the spec.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nbra.uni  L_exit;    // uniform unconditional jump\n\n@q  bra      L23;   // conditional branch\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-bra"
            };

        case "branchtargets":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-directives-branchtargets\" target=\"_blank\" rel=\"noopener noreferrer\">branchtargets <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Control Flow Directives: .branchtargets</h1><section id=\"control-flow-directives-branchtargets\">\n\n\n<p>Declare a list of potential branch targets.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>Label:   .branchtargets  list-of-labels ;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declares a list of potential branch targets for a subsequent <code class=\"docutils literal notranslate\"><span class=\"pre\">brx.idx</span></code>, and associates the list\nwith the label at the start of the line.</p>\n<p>All control flow labels in the list must occur within the same function as the declaration.</p>\n<p>The list of labels may use the compact, shorthand syntax for enumerating a range of labels having a\ncommon prefix, similar to the syntax described in <a class=\"reference external\" href=\"#parameterized-variable-names\">Parameterized Variable Names</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>  .function foo () {\n      .reg .u32 %r0;\n      ...\n      L1:\n      ...\n      L2:\n      ...\n      L3:\n      ...\n      ts: .branchtargets L1, L2, L3;\n      @p brx.idx %r0, ts;\n      ...\n\n.function bar() {\n      .reg .u32 %r0;\n      ...\n      N0:\n      ...\n      N1:\n      ...\n      N2:\n      ...\n      N3:\n      
...\n      N4:\n      ...\n      ts: .branchtargets N&lt;5&gt;;\n      @p brx.idx %r0, ts;\n      ...\n</pre></div>\n</div>\n</section>",
                "tooltip": "Declare a list of potential branch targets.\n\nSyntax\n\nLabel:   .branchtargets  list-of-labels ;\n\nDescription\n\nDeclares a list of potential branch targets for a subsequent brx.idx, and associates the list\n\nwith the label at the start of the line.\n\nAll control flow labels in the list must occur within the same function as the declaration.\n\nThe list of labels may use the compact, shorthand syntax for enumerating a range of labels having a\n\ncommon prefix, similar to the syntax described in Parameterized Variable Names.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.1.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\nExamples\n\n  .function foo () {\n\n      .reg .u32 %r0;\n\n      ...\n\n      L1:\n\n      ...\n\n      L2:\n\n      ...\n\n      L3:\n\n      ...\n\n      ts: .branchtargets L1, L2, L3;\n\n      @p brx.idx %r0, ts;\n\n      ...\n\n.function bar() {\n\n      .reg .u32 %r0;\n\n      ...\n\n      N0:\n\n      ...\n\n      N1:\n\n      ...\n\n      N2:\n\n      ...\n\n      N3:\n\n      ...\n\n      N4:\n\n      ...\n\n      ts: .branchtargets N<5>;\n\n      @p brx.idx %r0, ts;\n\n      ...\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-directives-branchtargets"
            };

        case "brev":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev\" target=\"_blank\" rel=\"noopener noreferrer\">brev(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: brev</h1><section id=\"integer-arithmetic-instructions-brev\">\n\n\n<p>Bit reverse.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>brev.type  d, a;\n\n.type = { .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Perform bitwise reversal of input.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>msb = (.type==.b32) ? 31 : 63;\n\nfor (i=0; i&lt;=msb; i++) {\n    d[i] = a[msb-i];\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">brev</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>brev.b32  d, a;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Bit reverse.\n\nSyntax\n\nbrev.type  d, a;\n\n.type = { .b32, .b64 };\n\nDescription\n\nPerform bitwise reversal of input.\n\nSemantics\n\nmsb = (.type==.b32) ? 31 : 63;\n\nfor (i=0; i<=msb; i++) {\n\n    d[i] = a[msb-i];\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nbrev requires sm_20 or higher.\n\nExamples\n\nbrev.b32  d, a;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev"
            };

        case "brkpt":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-brkpt\" target=\"_blank\" rel=\"noopener noreferrer\">brkpt <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Miscellaneous Instructions: brkpt</h1><section id=\"miscellaneous-instructions-brkpt\">\n\n\n<p>Breakpoint.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>brkpt;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Suspends execution.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">brkpt</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_11</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>    brkpt;\n@p  brkpt;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Breakpoint.\n\nSyntax\n\nbrkpt;\n\nDescription\n\nSuspends execution.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nbrkpt requires sm_11 or higher.\n\nExamples\n\n    brkpt;\n\n@p  brkpt;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-brkpt"
            };

        case "brx":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-brx-idx\" target=\"_blank\" rel=\"noopener noreferrer\">brx.idx <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Control Flow Instructions: brx.idx</h1><section id=\"control-flow-instructions-brx-idx\">\n\n\n<p>Branch to a label indexed from a list of potential branch targets.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>@p    brx.idx{.uni} index, tlist;\n      brx.idx{.uni} index, tlist;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Index into a list of possible destination labels, and continue execution from the chosen\nlabel. Conditional branches are specified by using a guard predicate.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">brx.idx.uni</span></code> guarantees that the branch is non-divergent, i.e. all active threads in a warp that\nare currently executing this instruction have identical values for the guard predicate and the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">index</span></code> argument.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">index</span></code> operand is a <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> register. The <code class=\"docutils literal notranslate\"><span class=\"pre\">tlist</span></code> operand must be the label of a\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.branchtargets</span></code> directive. It is accessed as a zero-based sequence using <code class=\"docutils literal notranslate\"><span class=\"pre\">index</span></code>. 
Behaviour is\nundefined if the value of <code class=\"docutils literal notranslate\"><span class=\"pre\">index</span></code> is greater than or equal to the length of <code class=\"docutils literal notranslate\"><span class=\"pre\">tlist</span></code>.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.branchtargets</span></code> directive must be defined in the local function scope before it is used. It\nmust refer to labels within the current function.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (p) {\n    if (index &lt; length(tlist)) {\n      pc = tlist[index];\n    } else {\n      pc = undefined;\n    }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.function foo () {\n    .reg .u32 %r0;\n    ...\n    L1:\n    ...\n    L2:\n    ...\n    L3:\n    ...\n    ts: .branchtargets L1, L2, L3;\n    @p brx.idx %r0, ts;\n    ...\n}\n</pre></div>\n</div>\n</section>",
                "tooltip": "Branch to a label indexed from a list of potential branch targets.\n\nSyntax\n\n@p    brx.idx{.uni} index, tlist;\n\n      brx.idx{.uni} index, tlist;\n\nDescription\n\nIndex into a list of possible destination labels, and continue execution from the chosen\n\nlabel. Conditional branches are specified by using a guard predicate.\n\nbrx.idx.uni guarantees that the branch is non-divergent, i.e. all active threads in a warp that\n\nare currently executing this instruction have identical values for the guard predicate and the\n\nindex argument.\n\nThe index operand is a .u32 register. The tlist operand must be the label of a\n\n.branchtargets directive. It is accessed as a zero-based sequence using index. Behaviour is\n\nundefined if the value of index is greater than or equal to the length of tlist.\n\nThe .branchtargets directive must be defined in the local function scope before it is used. It\n\nmust refer to labels within the current function.\n\nSemantics\n\nif (p) {\n\n    if (index < length(tlist)) {\n\n      pc = tlist[index];\n\n    } else {\n\n      pc = undefined;\n\n    }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.0.\n\nTarget ISA Notes\n\nRequires sm_30 or higher.\n\nExamples\n\n.function foo () {\n\n    .reg .u32 %r0;\n\n    ...\n\n    L1:\n\n    ...\n\n    L2:\n\n    ...\n\n    L3:\n\n    ...\n\n    ts: .branchtargets L1, L2, L3;\n\n    @p brx.idx %r0, ts;\n\n    ...\n\n}\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-brx-idx"
            };

        case "call":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call\" target=\"_blank\" rel=\"noopener noreferrer\">call <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Control Flow Instructions: call</h1><section id=\"control-flow-instructions-call\">\n\n\n<p>Call a function, recording the return location.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// direct call to named function, func is a symbol\ncall{.uni} (ret-param), func, (param-list);\ncall{.uni} func, (param-list);\ncall{.uni} func;\n\n// indirect call via pointer, with full list of call targets\ncall{.uni} (ret-param), fptr, (param-list), flist;\ncall{.uni} fptr, (param-list), flist;\ncall{.uni} fptr, flist;\n\n// indirect call via pointer, with no knowledge of call targets\ncall{.uni} (ret-param), fptr, (param-list), fproto;\ncall{.uni} fptr, (param-list), fproto;\ncall{.uni} fptr, fproto;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> instruction stores the address of the next instruction, so execution can resume at that\npoint after executing a <code class=\"docutils literal notranslate\"><span class=\"pre\">ret</span></code> instruction. A <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> is assumed to be divergent unless the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.uni</span></code> suffix is present. The <code class=\"docutils literal notranslate\"><span class=\"pre\">.uni</span></code> suffix indicates that the <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> is guaranteed to be\nnon-divergent, i.e. 
all active threads in a warp that are currently executing this instruction have\nidentical values for the guard predicate and <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> target.</p>\n<p>For direct calls, the called location <code class=\"docutils literal notranslate\"><span class=\"pre\">func</span></code> must be a symbolic function name; for indirect calls,\nthe called location <code class=\"docutils literal notranslate\"><span class=\"pre\">fptr</span></code> must be an address of a function held in a register. Input arguments\nand return values are optional.\u00a0Arguments may be registers, immediate constants, or variables in\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> space. Arguments are pass-by-value.</p>\n<p>Indirect calls require an additional operand, <code class=\"docutils literal notranslate\"><span class=\"pre\">flist</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">fproto</span></code>, to communicate the list of\npotential <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> targets or the common function prototype of all <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> targets,\nrespectively. In the first case, <code class=\"docutils literal notranslate\"><span class=\"pre\">flist</span></code> gives a complete list of potential <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> targets and\nthe optimizing backend is free to optimize the calling convention. 
In the second case, where the\ncomplete list of potential <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> targets may not be known, the common function prototype is given\nand the <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> must obey the ABI\u2019s calling convention.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">flist</span></code> operand is either the name of an array (call table) initialized to a list of function\nnames; or a label associated with a <code class=\"docutils literal notranslate\"><span class=\"pre\">.calltargets</span></code> directive, which declares a list of potential\n<code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> targets. In both cases the fptr register holds the address of a function listed in the call\ntable or <code class=\"docutils literal notranslate\"><span class=\"pre\">.calltargets</span></code> list, and the <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> operands are type-checked against the type\nsignature of the functions indicated by <code class=\"docutils literal notranslate\"><span class=\"pre\">flist</span></code>.</p>\n<p>The fproto operand is the name of a label associated with a <code class=\"docutils literal notranslate\"><span class=\"pre\">.callprototype</span></code> directive. This\noperand is used when a complete list of potential targets is not known. The <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> operands are\ntype-checked against the prototype, and code generation will follow the ABI calling convention. If a\nfunction that doesn\u2019t match the prototype is called, the behavior is undefined.</p>\n<p>Call tables may be declared at module scope or local scope, in either the constant or global state\nspace. 
The <code class=\"docutils literal notranslate\"><span class=\"pre\">.calltargets</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.callprototype</span></code> directives must be declared within a function\nbody. All functions must be declared prior to being referenced in a <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> table initializer or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.calltargets</span></code> directive.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Direct <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> introduced in PTX ISA version 1.0. Indirect <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> introduced in PTX ISA version 2.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Direct <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> supported on all target architectures. Indirect <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// examples of direct call\n    call     init;    // call function 'init'\n    call.uni g, (a);  // call function 'g' with parameter 'a'\n@p  call     (d), h, (a, b);  // return value into register d\n\n// call-via-pointer using jump table\n.func (.reg .u32 rv) foo (.reg .u32 a, .reg .u32 b) ...\n.func (.reg .u32 rv) bar (.reg .u32 a, .reg .u32 b) ...\n.func (.reg .u32 rv) baz (.reg .u32 a, .reg .u32 b) ...\n\n.global .u32 jmptbl[5] = { foo, bar, baz };\n      ...\n@p    ld.global.u32  %r0, [jmptbl+4];\n@p    ld.global.u32  %r0, [jmptbl+8];\n      call  (retval), %r0, (x, y), jmptbl;\n\n// call-via-pointer using .calltargets directive\n.func (.reg .u32 rv) foo (.reg 
.u32 a, .reg .u32 b) ...\n.func (.reg .u32 rv) bar (.reg .u32 a, .reg .u32 b) ...\n.func (.reg .u32 rv) baz (.reg .u32 a, .reg .u32 b) ...\n      ...\n@p    mov.u32  %r0, foo;\n@q    mov.u32  %r0, baz;\nFtgt: .calltargets foo, bar, baz;\n      call  (retval), %r0, (x, y), Ftgt;\n\n// call-via-pointer using .callprototype directive\n.func dispatch (.reg .u32 fptr, .reg .u32 idx)\n{\n...\nFproto: .callprototype _ (.param .u32 _, .param .u32 _);\n      call  %fptr, (x, y), Fproto;\n...\n</pre></div>\n</div>\n</section>",
                "tooltip": "Call a function, recording the return location.\n\nSyntax\n\n// direct call to named function, func is a symbol\n\ncall{.uni} (ret-param), func, (param-list);\n\ncall{.uni} func, (param-list);\n\ncall{.uni} func;\n\n// indirect call via pointer, with full list of call targets\n\ncall{.uni} (ret-param), fptr, (param-list), flist;\n\ncall{.uni} fptr, (param-list), flist;\n\ncall{.uni} fptr, flist;\n\n// indirect call via pointer, with no knowledge of call targets\n\ncall{.uni} (ret-param), fptr, (param-list), fproto;\n\ncall{.uni} fptr, (param-list), fproto;\n\ncall{.uni} fptr, fproto;\n\nDescription\n\nThe call instruction stores the address of the next instruction, so execution can resume at that\n\npoint after executing a ret instruction. A call is assumed to be divergent unless the\n\n.uni suffix is present. The .uni suffix indicates that the call is guaranteed to be\n\nnon-divergent, i.e. all active threads in a warp that are currently executing this instruction have\n\nidentical values for the guard predicate and call target.\n\nFor direct calls, the called location func must be a symbolic function name; for indirect calls,\n\nthe called location fptr must be an address of a function held in a register. Input arguments\n\nand return values are optional.\u00a0Arguments may be registers, immediate constants, or variables in\n\n.param space. Arguments are pass-by-value.\n\nIndirect calls require an additional operand, flist or fproto, to communicate the list of\n\npotential call targets or the common function prototype of all call targets,\n\nrespectively. In the first case, flist gives a complete list of potential call targets and\n\nthe optimizing backend is free to optimize the calling convention. 
In the second case, where the\n\ncomplete list of potential call targets may not be known, the common function prototype is given\n\nand the call must obey the ABI\u2019s calling convention.\n\nThe flist operand is either the name of an array (call table) initialized to a list of function\n\nnames; or a label associated with a .calltargets directive, which declares a list of potential\n\ncall targets. In both cases the fptr register holds the address of a function listed in the call\n\ntable or .calltargets list, and the call operands are type-checked against the type\n\nsignature of the functions indicated by flist.\n\nThe fproto operand is the name of a label associated with a .callprototype directive. This\n\noperand is used when a complete list of potential targets is not known. The call operands are\n\ntype-checked against the prototype, and code generation will follow the ABI calling convention. If a\n\nfunction that doesn\u2019t match the prototype is called, the behavior is undefined.\n\nCall tables may be declared at module scope or local scope, in either the constant or global state\n\nspace. The .calltargets and .callprototype directives must be declared within a function\n\nbody. All functions must be declared prior to being referenced in a call table initializer or\n\n.calltargets directive.\n\nPTX ISA Notes\n\nDirect call introduced in PTX ISA version 1.0. Indirect call introduced in PTX ISA version 2.1.\n\nTarget ISA Notes\n\nDirect call supported on all target architectures. 
Indirect call requires sm_20 or higher.\n\nExamples\n\n// examples of direct call\n\n    call     init;    // call function 'init'\n\n    call.uni g, (a);  // call function 'g' with parameter 'a'\n\n@p  call     (d), h, (a, b);  // return value into register d\n\n// call-via-pointer using jump table\n\n.func (.reg .u32 rv) foo (.reg .u32 a, .reg .u32 b) ...\n\n.func (.reg .u32 rv) bar (.reg .u32 a, .reg .u32 b) ...\n\n.func (.reg .u32 rv) baz (.reg .u32 a, .reg .u32 b) ...\n\n.global .u32 jmptbl[5] = { foo, bar, baz };\n\n      ...\n\n@p    ld.global.u32  %r0, [jmptbl+4];\n\n@p    ld.global.u32  %r0, [jmptbl+8];\n\n      call  (retval), %r0, (x, y), jmptbl;\n\n// call-via-pointer using .calltargets directive\n\n.func (.reg .u32 rv) foo (.reg .u32 a, .reg .u32 b) ...\n\n.func (.reg .u32 rv) bar (.reg .u32 a, .reg .u32 b) ...\n\n.func (.reg .u32 rv) baz (.reg .u32 a, .reg .u32 b) ...\n\n      ...\n\n@p    mov.u32  %r0, foo;\n\n@q    mov.u32  %r0, baz;\n\nFtgt: ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-call"
            };

        case "callprototype":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-directives-callprototype\" target=\"_blank\" rel=\"noopener noreferrer\">callprototype <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Control Flow Directives: .callprototype</h1><section id=\"control-flow-directives-callprototype\">\n\n\n<p>Declare a prototype for use in an indirect call.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span> // no input or return parameters\nlabel: .callprototype _ .noreturn;\n// input params, no return params\nlabel: .callprototype _ (param-list) .noreturn;\n// no input params, // return params\nlabel: .callprototype (ret-param) _ ;\n// input, return parameters\nlabel: .callprototype (ret-param) _ (param-list);\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Defines a prototype with no specific function name, and associates the prototype with a label. The\nprototype may then be used in indirect call instructions where there is incomplete knowledge of the\npossible call targets.</p>\n<p>Parameters may have either base types in the register or parameter state spaces, or array types in\nparameter state space. The sink symbol <code class=\"docutils literal notranslate\"><span class=\"pre\">'_'</span></code> may be used to avoid dummy parameter names.</p>\n<p>An optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive indicates that the function does not return to the caller\nfunction. <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive cannot be specified on functions which have return parameters. 
See\nthe description of .noreturn directive in <a class=\"reference external\" href=\"#performance-tuning-directives-noreturn\">Performance-Tuning Directives: .noreturn</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.1.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive introduced in PTX ISA version 6.4.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>Fproto1: .callprototype  _ ;\nFproto2: .callprototype  _ (.param .f32 _);\nFproto3: .callprototype  (.param .u32 _) _ ;\nFproto4: .callprototype  (.param .u32 _) _ (.param .f32 _);\n...\n@p   call  (%val), %r0, (%f1), Fproto4;\n...\n\n// example of array parameter\nFproto5: .callprototype _ (.param .b8 _[12]);\n\nFproto6: .callprototype  _ (.param .f32 _) .noreturn;\n...\n@p   call  %r0, (%f1), Fproto6;\n...\n</pre></div>\n</div>\n</section>",
                "tooltip": "Declare a prototype for use in an indirect call.\n\nSyntax\n\n // no input or return parameters\n\nlabel: .callprototype _ .noreturn;\n\n// input params, no return params\n\nlabel: .callprototype _ (param-list) .noreturn;\n\n// no input params, // return params\n\nlabel: .callprototype (ret-param) _ ;\n\n// input, return parameters\n\nlabel: .callprototype (ret-param) _ (param-list);\n\nDescription\n\nDefines a prototype with no specific function name, and associates the prototype with a label. The\n\nprototype may then be used in indirect call instructions where there is incomplete knowledge of the\n\npossible call targets.\n\nParameters may have either base types in the register or parameter state spaces, or array types in\n\nparameter state space. The sink symbol '_' may be used to avoid dummy parameter names.\n\nAn optional .noreturn directive indicates that the function does not return to the caller\n\nfunction. .noreturn directive cannot be specified on functions which have return parameters. See\n\nthe description of .noreturn directive in Performance-Tuning Directives: .noreturn.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.1.\n\nSupport for .noreturn directive introduced in PTX ISA version 6.4.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\n.noreturn directive requires sm_30 or higher.\n\nExamples\n\nFproto1: .callprototype  _ ;\n\nFproto2: .callprototype  _ (.param .f32 _);\n\nFproto3: .callprototype  (.param .u32 _) _ ;\n\nFproto4: .callprototype  (.param .u32 _) _ (.param .f32 _);\n\n...\n\n@p   call  (%val), %r0, (%f1), Fproto4;\n\n...\n\n// example of array parameter\n\nFproto5: .callprototype _ (.param .b8 _[12]);\n\nFproto6: .callprototype  _ (.param .f32 _) .noreturn;\n\n...\n\n@p   call  %r0, (%f1), Fproto6;\n\n...\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-directives-callprototype"
            };

        case "calltargets":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-directives-calltargets\" target=\"_blank\" rel=\"noopener noreferrer\">calltargets <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Control Flow Directives: .calltargets</h1><section id=\"control-flow-directives-calltargets\">\n\n\n<p>Declare a list of potential call targets.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>Label:   .calltargets  list-of-functions ;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declares a list of potential call targets for a subsequent indirect call, and associates the list\nwith the label at the start of the line.</p>\n<p>All functions named in the list must be declared prior to the <code class=\"docutils literal notranslate\"><span class=\"pre\">.calltargets</span></code> directive, and all\nfunctions must have the same type signature.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>calltgt:  .calltargets  fastsin, fastcos;\n...\n@p   call  (%f1), %r0, (%x), calltgt;\n...\n</pre></div>\n</div>\n</section>",
                "tooltip": "Declare a list of potential call targets.\n\nSyntax\n\nLabel:   .calltargets  list-of-functions ;\n\nDescription\n\nDeclares a list of potential call targets for a subsequent indirect call, and associates the list\n\nwith the label at the start of the line.\n\nAll functions named in the list must be declared prior to the .calltargets directive, and all\n\nfunctions must have the same type signature.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.1.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\nExamples\n\ncalltgt:  .calltargets  fastsin, fastcos;\n\n...\n\n@p   call  (%f1), %r0, (%x), calltgt;\n\n...\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-directives-calltargets"
            };

        case "clock":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clock-clock-hi\" target=\"_blank\" rel=\"noopener noreferrer\">clock <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %clock, %clock_hi</h1><section id=\"special-registers-clock-clock-hi\">\n<span id=\"special-registers-clock\"></span>\n\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code></dt><dd><p>A predefined, read-only 32-bit unsigned cycle counter.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code></dt><dd><p>The upper 32-bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock64</span></code> special register.</p>\n</dd>\n</dl>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %clock;\n.sreg .u32 %clock_hi;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Special register <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code> are unsigned 32-bit read-only cycle counters that wrap\nsilently.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code> introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code> introduced in PTX ISA version 5.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code> requires <code 
class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32 r1,%clock;\nmov.u32 r2, %clock_hi;\n</pre></div>\n</div>\n</section>",
                "tooltip": "%clockA predefined, read-only 32-bit unsigned cycle counter.\n\n%clock_hiThe upper 32-bits of %clock64 special register.\n\nSyntax (predefined)\n\n.sreg .u32 %clock;\n\n.sreg .u32 %clock_hi;\n\nDescription\n\nSpecial register %clock and %clock_hi are unsigned 32-bit read-only cycle counters that wrap\n\nsilently.\n\nPTX ISA Notes\n\n%clock introduced in PTX ISA version 1.0.\n\n%clock_hi introduced in PTX ISA version 5.0.\n\nTarget ISA Notes\n\n%clock supported on all target architectures.\n\n%clock_hi requires sm_20 or higher.\n\nExamples\n\nmov.u32 r1,%clock;\n\nmov.u32 r2, %clock_hi;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clock-clock-hi"
            };

        case "clock64":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clock64\" target=\"_blank\" rel=\"noopener noreferrer\">clock64 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %clock64</h1><section id=\"special-registers-clock64\">\n\n\n<p>A predefined, read-only 64-bit unsigned cycle counter.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u64 %clock64;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Special register <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock64</span></code> is an unsigned 64-bit read-only cycle counter that wraps silently.</p>\n<p><strong>Notes</strong></p>\n<p>The lower 32-bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock64</span></code> are identical to <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code>.</p>\n<p>The upper 32-bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock64</span></code> are identical to <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u64  r1,%clock64;\n</pre></div>\n</div>\n</section>",
                "tooltip": "A predefined, read-only 64-bit unsigned cycle counter.\n\nSyntax (predefined)\n\n.sreg .u64 %clock64;\n\nDescription\n\nSpecial register %clock64 is an unsigned 64-bit read-only cycle counter that wraps silently.\n\nNotes\n\nThe lower 32-bits of %clock64 are identical to %clock.\n\nThe upper 32-bits of %clock64 are identical to %clock_hi.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%clock64 requires sm_20 or higher.\n\nExamples\n\nmov.u64  r1,%clock64;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clock64"
            };

        case "clock_hi":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clock-clock-hi\" target=\"_blank\" rel=\"noopener noreferrer\">clock_hi <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %clock, %clock_hi</h1><section id=\"special-registers-clock-clock-hi\">\n<span id=\"special-registers-clock\"></span>\n\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code></dt><dd><p>A predefined, read-only 32-bit unsigned cycle counter.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code></dt><dd><p>The upper 32-bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock64</span></code> special register.</p>\n</dd>\n</dl>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %clock;\n.sreg .u32 %clock_hi;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Special register <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code> are unsigned 32-bit read-only cycle counters that wrap\nsilently.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code> introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code> introduced in PTX ISA version 5.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%clock_hi</span></code> requires 
<code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32 r1,%clock;\nmov.u32 r2, %clock_hi;\n</pre></div>\n</div>\n</section>",
                "tooltip": "%clockA predefined, read-only 32-bit unsigned cycle counter.\n\n%clock_hiThe upper 32-bits of %clock64 special register.\n\nSyntax (predefined)\n\n.sreg .u32 %clock;\n\n.sreg .u32 %clock_hi;\n\nDescription\n\nSpecial register %clock and %clock_hi are unsigned 32-bit read-only cycle counters that wrap\n\nsilently.\n\nPTX ISA Notes\n\n%clock introduced in PTX ISA version 1.0.\n\n%clock_hi introduced in PTX ISA version 5.0.\n\nTarget ISA Notes\n\n%clock supported on all target architectures.\n\n%clock_hi requires sm_20 or higher.\n\nExamples\n\nmov.u32 r1,%clock;\n\nmov.u32 r2, %clock_hi;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clock-clock-hi"
            };

        case "cluster_ctaid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctaid\" target=\"_blank\" rel=\"noopener noreferrer\">cluster_ctaid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %cluster_ctaid</h1><section id=\"special-registers-cluster-ctaid\">\n\n\n<p>CTA identifier within a cluster.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .v4 .u32 %cluster_ctaid;\n.sreg .u32 %cluster_ctaid.x, %cluster_ctaid.y, %cluster_ctaid.z;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the CTA identifier in a cluster in each\ndimension. Each CTA in a cluster has a unique CTA identifier.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">%cluster_ctaid</span></code> special register contains a 1D, 2D, or 3D vector, depending upon the shape of\nthe cluster. 
The fourth element is unused and always returns zero.</p>\n<p>It is guaranteed that:</p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>0  &lt;=  %cluster_ctaid.x &lt;  %cluster_nctaid.x\n0  &lt;=  %cluster_ctaid.y &lt;  %cluster_nctaid.y\n0  &lt;=  %cluster_ctaid.z &lt;  %cluster_nctaid.z\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 %r&lt;2&gt;;\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %cluster_ctaid.x;\nmov.u32     %r1, %cluster_ctaid.z;\nmov.v4.u32  %rx, %cluster_ctaid;\n</pre></div>\n</div>\n</section>",
                "tooltip": "CTA identifier within a cluster.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %cluster_ctaid;\n\n.sreg .u32 %cluster_ctaid.x, %cluster_ctaid.y, %cluster_ctaid.z;\n\nDescription\n\nA predefined, read-only special register initialized with the CTA identifier in a cluster in each\n\ndimension. Each CTA in a cluster has a unique CTA identifier.\n\nThe %cluster_ctaid special register contains a 1D, 2D, or 3D vector, depending upon the shape of\n\nthe cluster. The fourth element is unused and always returns zero.\n\nIt is guaranteed that:\n\n0  <=  %cluster_ctaid.x <  %cluster_nctaid.x\n\n0  <=  %cluster_ctaid.y <  %cluster_nctaid.y\n\n0  <=  %cluster_ctaid.z <  %cluster_nctaid.z\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r<2>;\n\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %cluster_ctaid.x;\n\nmov.u32     %r1, %cluster_ctaid.z;\n\nmov.v4.u32  %rx, %cluster_ctaid;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctaid"
            };

        case "cluster_ctarank":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctarank\" target=\"_blank\" rel=\"noopener noreferrer\">cluster_ctarank <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %cluster_ctarank</h1><section id=\"special-registers-cluster-ctarank\">\n\n\n<p>CTA identifier in a cluster across all dimensions.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %cluster_ctarank;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the CTA rank within a cluster across all\ndimensions.</p>\n<p>It is guaranteed that:</p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>0  &lt;=  %cluster_ctarank &lt;  %cluster_nctarank\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 %r;\n\nmov.u32  %r, %cluster_ctarank;\n</pre></div>\n</div>\n</section>",
                "tooltip": "CTA identifier in a cluster across all dimensions.\n\nSyntax (predefined)\n\n.sreg .u32 %cluster_ctarank;\n\nDescription\n\nA predefined, read-only special register initialized with the CTA rank within a cluster across all\n\ndimensions.\n\nIt is guaranteed that:\n\n0  <=  %cluster_ctarank <  %cluster_nctarank\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r;\n\nmov.u32  %r, %cluster_ctarank;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-ctarank"
            };

        case "cluster_nctaid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctaid\" target=\"_blank\" rel=\"noopener noreferrer\">cluster_nctaid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %cluster_nctaid</h1><section id=\"special-registers-cluster-nctaid\">\n\n\n<p>Number of CTA identifiers per cluster.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .v4 .u32 %cluster_nctaid;\n.sreg .u32 %cluster_nctaid.x, %cluster_nctaid.y, %cluster_nctaid.z;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the number of CTAs in a cluster in each\ndimension.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">%cluster_nctaid</span></code> special register contains a 3D grid shape vector that holds the cluster\ndimensions in terms of CTAs. The fourth element is unused and always returns zero.</p>\n<p>Refer to the <em>Cuda Programming Guide</em> for details on the maximum values of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">%cluster_nctaid.{x,y,z}</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 %r&lt;2&gt;;\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %cluster_nctaid.x;\nmov.u32     %r1, %cluster_nctaid.z;\nmov.v4.u32  %rx, %cluster_nctaid;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Number of CTA identifiers per cluster.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %cluster_nctaid;\n\n.sreg .u32 %cluster_nctaid.x, %cluster_nctaid.y, %cluster_nctaid.z;\n\nDescription\n\nA predefined, read-only special register initialized with the number of CTAs in a cluster in each\n\ndimension.\n\nThe %cluster_nctaid special register contains a 3D grid shape vector that holds the cluster\n\ndimensions in terms of CTAs. The fourth element is unused and always returns zero.\n\nRefer to the Cuda Programming Guide for details on the maximum values of\n\n%cluster_nctaid.{x,y,z}.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r<2>;\n\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %cluster_nctaid.x;\n\nmov.u32     %r1, %cluster_nctaid.z;\n\nmov.v4.u32  %rx, %cluster_nctaid;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctaid"
            };

        case "cluster_nctarank":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctarank\" target=\"_blank\" rel=\"noopener noreferrer\">cluster_nctarank <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %cluster_nctarank</h1><section id=\"special-registers-cluster-nctarank\">\n\n\n<p>Number of CTA identifiers in a cluster across all dimensions.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %cluster_nctarank;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the nunber of CTAs within a cluster across\nall dimensions.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 %r;\n\nmov.u32  %r, %cluster_nctarank;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Number of CTA identifiers in a cluster across all dimensions.\n\nSyntax (predefined)\n\n.sreg .u32 %cluster_nctarank;\n\nDescription\n\nA predefined, read-only special register initialized with the nunber of CTAs within a cluster across\n\nall dimensions.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r;\n\nmov.u32  %r, %cluster_nctarank;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-cluster-nctarank"
            };

        case "clusterid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clusterid\" target=\"_blank\" rel=\"noopener noreferrer\">clusterid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %clusterid</h1><section id=\"special-registers-clusterid\">\n\n\n<p>Cluster identifier within a grid.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .v4 .u32 %clusterid;\n.sreg .u32 %clusterid.x, %clusterid.y, %clusterid.z;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the cluster identifier in a grid in each\ndimension. Each cluster in a grid has a unique identifier.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">%clusterid</span></code> special register contains a 1D, 2D, or 3D vector, depending upon the shape and\nrank of the cluster. The fourth element is unused and always returns zero.</p>\n<p>It is guaranteed that:</p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>0  &lt;=  %clusterid.x &lt;  %nclusterid.x\n0  &lt;=  %clusterid.y &lt;  %nclusterid.y\n0  &lt;=  %clusterid.z &lt;  %nclusterid.z\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 %r&lt;2&gt;;\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %clusterid.x;\nmov.u32     %r1, %clusterid.z;\nmov.v4.u32  %rx, %clusterid;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Cluster identifier within a grid.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %clusterid;\n\n.sreg .u32 %clusterid.x, %clusterid.y, %clusterid.z;\n\nDescription\n\nA predefined, read-only special register initialized with the cluster identifier in a grid in each\n\ndimension. Each cluster in a grid has a unique identifier.\n\nThe %clusterid special register contains a 1D, 2D, or 3D vector, depending upon the shape and\n\nrank of the cluster. The fourth element is unused and always returns zero.\n\nIt is guaranteed that:\n\n0  <=  %clusterid.x <  %nclusterid.x\n\n0  <=  %clusterid.y <  %nclusterid.y\n\n0  <=  %clusterid.z <  %nclusterid.z\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r<2>;\n\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %clusterid.x;\n\nmov.u32     %r1, %clusterid.z;\n\nmov.v4.u32  %rx, %clusterid;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-clusterid"
            };

        case "clz":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz\" target=\"_blank\" rel=\"noopener noreferrer\">clz(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: clz</h1><section id=\"integer-arithmetic-instructions-clz\">\n\n\n<p>Count leading zeros.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>clz.type  d, a;\n\n.type = { .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Count the number of leading zeros in <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> starting with the most-significant bit and place the\nresult in 32-bit destination register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.\u00a0Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> has the instruction type, and destination\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>. For <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type, the number of leading zeros is between 0 and 32,\ninclusively. 
For<code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code> type, the number of leading zeros is between 0 and 64, inclusively.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.u32  d = 0;\nif (.type == .b32)   { max = 32; mask = 0x80000000; }\nelse                 { max = 64; mask = 0x8000000000000000; }\n\nwhile (d &lt; max &amp;&amp; (a&amp;mask == 0) ) {\n    d++;\n    a = a &lt;&lt; 1;\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">clz</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>clz.b32  d, a;\nclz.b64  cnt, X;  // cnt is .u32\n</pre></div>\n</div>\n</section>",
                "tooltip": "Count leading zeros.\n\nSyntax\n\nclz.type  d, a;\n\n.type = { .b32, .b64 };\n\nDescription\n\nCount the number of leading zeros in a starting with the most-significant bit and place the\n\nresult in 32-bit destination register d.\u00a0Operand a has the instruction type, and destination\n\nd has type .u32. For .b32 type, the number of leading zeros is between 0 and 32,\n\ninclusively. For.b64 type, the number of leading zeros is between 0 and 64, inclusively.\n\nSemantics\n\n.u32  d = 0;\n\nif (.type == .b32)   { max = 32; mask = 0x80000000; }\n\nelse                 { max = 64; mask = 0x8000000000000000; }\n\nwhile (d < max && (a&mask == 0) ) {\n\n    d++;\n\n    a = a << 1;\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nclz requires sm_20 or higher.\n\nExamples\n\nclz.b32  d, a;\n\nclz.b64  cnt, X;  // cnt is .u32\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz"
            };

        case "cnot":
            // Entry returned for the "cnot" key. `html` is the rendered reference
            // section, prefixed with a "For more information" link; `tooltip` carries
            // the same content as plain text; `url` is the upstream documentation anchor.
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-cnot\" target=\"_blank\" rel=\"noopener noreferrer\">cnot <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Logic and Shift Instructions: cnot</h1><section id=\"logic-and-shift-instructions-cnot\">\n\n\n<p>C/C++ style logical negation.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cnot.type d, a;\n\n.type = { .b16, .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute the logical negation using C/C++ semantics.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = (a==0) ? 1 : 0;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>The size of the operands must match, but not necessarily the type.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cnot.b32 d,a;\n</pre></div>\n</div>\n</section>",
                "tooltip": "C/C++ style logical negation.\n\nSyntax\n\ncnot.type d, a;\n\n.type = { .b16, .b32, .b64 };\n\nDescription\n\nCompute the logical negation using C/C++ semantics.\n\nSemantics\n\nd = (a==0) ? 1 : 0;\n\nNotes\n\nThe size of the operands must match, but not necessarily the type.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\ncnot.b32 d,a;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-cnot"
            };

        case "common":
            // Entry returned for the "common" key; the embedded text documents the
            // `.common` linking directive. `html` is the rendered reference section with
            // a leading link, `tooltip` is its plain-text form, `url` is the upstream anchor.
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#linking-directives-common\" target=\"_blank\" rel=\"noopener noreferrer\">common <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Linking Directives: .common</h1><section id=\"linking-directives-common\">\n\n\n<p>Visible (externally) symbol declaration.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.common identifier\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declares identifier to be globally visible but \u201ccommon\u201d.</p>\n<p>Common symbols are similar to globally visible symbols. However multiple object files may declare\nthe same common symbol and they may have different types and sizes and references to a symbol get\nresolved against a common symbol with the largest size.</p>\n<p>Only one object file can initialize a common symbol and that must have the largest size among all\nother definitions of that common symbol from different object files.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.common</span></code> linking directive can be used only on variables with <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> storage. It cannot be\nused on function symbols or on symbols with opaque type.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 5.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.common</span></code> directive requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.common .global .u32 gbl;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Visible (externally) symbol declaration.\n\nSyntax\n\n.common identifier\n\nDescription\n\nDeclares identifier to be globally visible but \u201ccommon\u201d.\n\nCommon symbols are similar to globally visible symbols. However multiple object files may declare\n\nthe same common symbol and they may have different types and sizes and references to a symbol get\n\nresolved against a common symbol with the largest size.\n\nOnly one object file can initialize a common symbol and that must have the largest size among all\n\nother definitions of that common symbol from different object files.\n\n.common linking directive can be used only on variables with .global storage. It cannot be\n\nused on function symbols or on symbols with opaque type.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 5.0.\n\nTarget ISA Notes\n\n.common directive requires sm_20 or higher.\n\nExamples\n\n.common .global .u32 gbl;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#linking-directives-common"
            };

        case "copysign":
            // Entry returned for the "copysign" key. `html` is the rendered reference
            // section, prefixed with a "For more information" link; `tooltip` carries
            // the same content as plain text; `url` is the upstream documentation anchor.
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign\" target=\"_blank\" rel=\"noopener noreferrer\">copysign(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: copysign</h1><section id=\"floating-point-instructions-copysign\">\n\n\n<p>Copy sign of one input to another.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>copysign.type  d, a, b;\n\n.type = { .f32, .f64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Copy sign bit of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> into value of <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, and return the result as <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>copysign.f32  x, y, z;\ncopysign.f64  A, B, C;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Copy sign of one input to another.\n\nSyntax\n\ncopysign.type  d, a, b;\n\n.type = { .f32, .f64 };\n\nDescription\n\nCopy sign bit of a into value of b, and return the result as d.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\nExamples\n\ncopysign.f32  x, y, z;\n\ncopysign.f64  A, B, C;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-copysign"
            };

        case "cos":
            // Entry returned for the "cos" key. `html` is the rendered reference
            // section, prefixed with a "For more information" link; `tooltip` carries
            // the same content as plain text; `url` is the upstream documentation anchor.
            //
            // The tooltip text is corrected in three places so it agrees with `html`:
            //   - `2<sup>-20.9</sup>` had been flattened to "2-20.9" (reads as a
            //     subtraction); it is now written "2^-20.9".
            //   - the <dt>/<dd> boundaries after "sm_20+" and "sm_1x" had been dropped,
            //     fusing the label into the following sentence; they are now separated
            //     with the same "\n\n" used between every other paragraph.
            // NOTE(review): this file lives under a `generated/` directory, so the same
            // fix presumably belongs in the generator as well — confirm before regenerating.
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos\" target=\"_blank\" rel=\"noopener noreferrer\">cos(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: cos</h1><section id=\"floating-point-instructions-cos\">\n\n\n<p>Find the cosine of a value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cos.approx{.ftz}.f32  d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Find the cosine of the angle <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> (in radians).</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = cos(a);\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cos.approx.f32</span></code> implements a fast approximation to cosine.</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 63%\"/>\n<col style=\"width: 38%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Input</p></th>\n<th class=\"head\"><p>Result</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>-Inf</p></td>\n<td><p>NaN</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>-subnormal</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>-0.0</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+0.0</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>+subnormal</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+Inf</p></td>\n<td><p>NaN</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p>The maximum absolute error is 
2<sup>-20.9</sup> in quadrant 00.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cos.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p>Subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cos.f32</span></code> introduced in PTX ISA version 1.0. Explicit modifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code>\nintroduced in PTX ISA version 1.4.</p>\n<p>For PTX ISA version 1.4 and later, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> modifier is required.</p>\n<p>For PTX ISA versions 1.0 through 1.3, <code class=\"docutils literal notranslate\"><span class=\"pre\">cos.f32</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">cos.approx.ftz.f32</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cos.approx.ftz.f32  ca, a;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Find the cosine of a value.\n\nSyntax\n\ncos.approx{.ftz}.f32  d, a;\n\nDescription\n\nFind the cosine of the angle a (in radians).\n\nSemantics\n\nd = cos(a);\n\nNotes\n\ncos.approx.f32 implements a fast approximation to cosine.\n\n\n\nInput\n\nResult\n\n-Inf\n\nNaN\n\n-subnormal\n\n+1.0\n\n-0.0\n\n+1.0\n\n+0.0\n\n+1.0\n\n+subnormal\n\n+1.0\n\n+Inf\n\nNaN\n\nNaN\n\nNaN\n\nThe maximum absolute error is 2^-20.9 in quadrant 00.\n\nSubnormal numbers:\n\nsm_20+\n\nBy default, subnormal numbers are supported.\n\ncos.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nsm_1x\n\nSubnormal inputs and results to sign-preserving zero.\n\nPTX ISA Notes\n\ncos.f32 introduced in PTX ISA version 1.0. Explicit modifiers .approx and .ftz\n\nintroduced in PTX ISA version 1.4.\n\nFor PTX ISA version 1.4 and later, the .approx modifier is required.\n\nFor PTX ISA versions 1.0 through 1.3, cos.f32 defaults to cos.approx.ftz.f32.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\ncos.approx.ftz.f32  ca, a;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos"
            };

        case "cp":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.bulk <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.bulk.commit_group <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.bulk.prefetch <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.bulk.prefetch.tensor <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.bulk.tensor <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a 
href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.bulk.wait_group <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.commit_group <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.mbarrier.arrive <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.wait_all <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all\" target=\"_blank\" rel=\"noopener noreferrer\">cp.async.wait_group <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk\" target=\"_blank\" rel=\"noopener noreferrer\">cp.reduce.async.bulk <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new 
window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor\" target=\"_blank\" rel=\"noopener noreferrer\">cp.reduce.async.bulk.tensor <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: cp.async</h1><section id=\"data-movement-and-conversion-instructions-cp-async\">\n\n\n<p>Initiates an asynchronous copy operation from one state space to another.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.async.ca.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n                         [dst], [src], cp-size{, src-size}{, cache-policy} ;\ncp.async.cg.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n                         [dst], [src], 16{, src-size}{, cache-policy} ;\ncp.async.ca.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n                         [dst], [src], cp-size{, ignore-src}{, cache-policy} ;\ncp.async.cg.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n                         [dst], [src], 16{, ignore-src}{, cache-policy} ;\n\n.level::cache_hint =     { .L2::cache_hint }\n.level::prefetch_size =  { .L2::64B, .L2::128B, .L2::256B }\ncp-size =                { 4, 8, 16 }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> is a non-blocking instruction which initiates an asynchronous copy operation of data\nfrom the location specified by source address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">src</span></code> to the location specified by\ndestination address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">dst</span></code>. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">src</span></code> specifies a location in the global state space\nand <code class=\"docutils literal notranslate\"><span class=\"pre\">dst</span></code> specifies a location in the shared state space.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cp-size</span></code> is an integer constant which specifies the size of data in bytes to be copied to\nthe destination <code class=\"docutils literal notranslate\"><span class=\"pre\">dst</span></code>. <code class=\"docutils literal notranslate\"><span class=\"pre\">cp-size</span></code> can only be 4, 8 and 16.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> allows optionally specifying a 32-bit integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">src-size</span></code>. Operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">src-size</span></code> represents the size of the data in bytes to be copied from <code class=\"docutils literal notranslate\"><span class=\"pre\">src</span></code> to <code class=\"docutils literal notranslate\"><span class=\"pre\">dst</span></code> and must\nbe less than <code class=\"docutils literal notranslate\"><span class=\"pre\">cp-size</span></code>. In such case, remaining bytes in destination <code class=\"docutils literal notranslate\"><span class=\"pre\">dst</span></code> are filled with\nzeros. 
Specifying <code class=\"docutils literal notranslate\"><span class=\"pre\">src-size</span></code> larger than <code class=\"docutils literal notranslate\"><span class=\"pre\">cp-size</span></code> results in undefined behavior.</p>\n<p>The optional and non-immediate predicate argument <code class=\"docutils literal notranslate\"><span class=\"pre\">ignore-src</span></code> specifies whether the data from the\nsource location <code class=\"docutils literal notranslate\"><span class=\"pre\">src</span></code> should be ignored completely. If the source data is ignored then zeros will\nbe copied to destination <code class=\"docutils literal notranslate\"><span class=\"pre\">dst</span></code>. If the argument <code class=\"docutils literal notranslate\"><span class=\"pre\">ignore-src</span></code> is not specified then it defaults\nto <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>.</p>\n<p>Supported alignment requirements and addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">src</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">dst</span></code> are described\nin <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>.</p>\n<p>The mandatory <code class=\"docutils literal notranslate\"><span class=\"pre\">.async</span></code> qualifier indicates that the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp</span></code> instruction will initiate the memory\ncopy operation asynchronously and control will return to the executing thread before the copy\noperation is complete. 
The executing thread can then use <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> or <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier\">mbarrier instructions</a> to wait for\ncompletion of the asynchronous copy operation. No other synchronization mechanisms described in\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a> can be used to guarantee the\ncompletion of the asynchronous copy operations.</p>\n<p>There is no ordering guarantee between two <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> operations if they are not explicitly\nsynchronized using <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> or <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier\">mbarrier instructions</a>.</p>\n<p>As described in <a class=\"reference external\" href=\"#cache-operators\">Cache Operators</a>, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.cg</span></code> qualifier indicates\ncaching of data only at global level cache L2 and not at L1 whereas <code class=\"docutils literal notranslate\"><span class=\"pre\">.ca</span></code> qualifier indicates\ncaching of data at all levels including L1 cache. 
Cache operator are treated as performance hints\nonly.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> is treated as a weak memory operation in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> qualifier is a hint to fetch additional data of the specified size\ninto the respective cache level.The sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch_size</span></code> can be set to either of <code class=\"docutils literal notranslate\"><span class=\"pre\">64B</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">128B</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">256B</span></code> thereby allowing the prefetch size to be 64 Bytes, 128 Bytes or 256 Bytes\nrespectively.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> may only be used with <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space and with\ngeneric addressing where the address points to <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space. If the generic address does\nnot fall within the address window of the global memory, then the prefetching behavior is undefined.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> qualifier is treated as a performance hint only.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. 
The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is only supported for <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space and for generic\naddressing where the address points to the <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> qualifiers introduced in PTX ISA\nversion 7.4.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">ignore-src</span></code> operand introduced in PTX ISA version 7.5.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div 
class=\"highlight\"><pre><span></span>cp.async.ca.shared.global  [shrd],    [gbl + 4], 4;\ncp.async.ca.shared::cta.global  [%r0 + 8], [%r1],     8;\ncp.async.cg.shared.global  [%r2],     [%r3],     16;\n\ncp.async.cg.shared.global.L2::64B   [%r2],      [%r3],     16;\ncp.async.cg.shared.global.L2::128B  [%r0 + 16], [%r1],      8;\ncp.async.cg.shared.global.L2::256B  [%r2 + 32], [%r3],     16;\n\ncreatepolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 cache-policy, 0.25;\ncp.async.ca.shared.global.L2::cache_hint [%r2], [%r1], 4, cache-policy;\n\ncp.async.ca.shared.global                   [shrd], [gbl], 4, p;\ncp.async.cg.shared.global.L2::chache_hint   [%r0], [%r2], 16, q, cache-policy;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.bulk</h1><section id=\"data-movement-and-conversion-instructions-cp-async-bulk\">\n\n\n<p>Initiates an asynchronous copy operation from one state space to another.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.async.bulk.dst.src.completion_mechanism{.multicast}{.level::cache_hint}\n                      [dstMem], [srcMem], size, [mbar] {, ctaMask} {, cache-policy}\n\n.dst =                  { .shared::cluster }\n.src =                  { .global }\n.completion_mechanism = { .mbarrier::complete_tx::bytes }\n.level::cache_hint =    { .L2::cache_hint }\n.multicast =            { .multicast::cluster  }\n\n\ncp.async.bulk.dst.src.completion_mechanism [dstMem], [srcMem], size, [mbar]\n\n.dst =                  { .shared::cluster }\n.src =                  { .shared::cta }\n.completion_mechanism = { .mbarrier::complete_tx::bytes }\n\n\ncp.async.bulk.dst.src.completion_mechanism{.level::cache_hint} [dstMem], [srcMem], size{, cache-policy}\n\n.dst =                  { .global }\n.src =                  { .shared::cta }\n.completion_mechanism = { .bulk_group }\n.level::cache_hint =    { .L2::cache_hint 
}\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk</span></code> is a non-blocking instruction which initiates an asynchronous bulk-copy operation\nfrom the location specified by source address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">srcMem</span></code> to the location specified by\ndestination address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">dstMem</span></code>.</p>\n<p>The direction of bulk-copy is from the state space specified by the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> modifier to the state\nspace specified by the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code> modifiers.</p>\n<p>The 32-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> specifies the amount of memory to be copied, in terms of number of\nbytes. <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> must be a multiple of 16. If the value is not a multiple of 16, then the behavior is\nundefined. The memory range <code class=\"docutils literal notranslate\"><span class=\"pre\">[dstMem,</span> <span class=\"pre\">dstMem</span> <span class=\"pre\">+</span> <span class=\"pre\">size</span> <span class=\"pre\">-</span> <span class=\"pre\">1]</span></code> must not overflow the destination memory\nspace and the memory range <code class=\"docutils literal notranslate\"><span class=\"pre\">[srcMem,</span> <span class=\"pre\">srcMem</span> <span class=\"pre\">+</span> <span class=\"pre\">size</span> <span class=\"pre\">-</span> <span class=\"pre\">1]</span></code> must not overflow the source memory\nspace. Otherwise, the behavior is undefined. 
The addresses <code class=\"docutils literal notranslate\"><span class=\"pre\">dstMem</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">srcMem</span></code> must be aligned\nto 16 bytes.</p>\n<p>When the source of the copy is <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> and the destination is <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code>, the\ndestination has to be in the shared memory of a different CTA within the cluster.</p>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.completion_mechanism</span></code> specifies the completion mechanism that is supported on the\ninstruction variant. The completion mechanisms that are supported for different variants are\nsummarized in the following table:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 20%\"/>\n<col style=\"width: 20%\"/>\n<col style=\"width: 17%\"/>\n<col style=\"width: 43%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Completion mechanism</p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code></p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code></p></th>\n<th class=\"head\"><p>Description</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td rowspan=\"2\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.mbarrier::...</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code></p></td>\n<td rowspan=\"2\"><p>mbarrier based completion mechanism</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span 
class=\"pre\">.shared::cluster</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code></p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.bulk_group</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code></p></td>\n<td><p><em>Bulk async-group</em> based completion mechanism</p></td>\n</tr>\n</tbody>\n</table>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.mbarrier::complete_tx::bytes</span></code> specifies that the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk</span></code> variant uses\nmbarrier based completion mechanism. The <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation, with <code class=\"docutils literal notranslate\"><span class=\"pre\">completeCount</span></code> argument equal to amount of data copied in bytes, will be\nperformed on the mbarrier object specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code>.</p>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.bulk_group</span></code> specifies that the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk</span></code> variant uses <em>bulk async-group</em>\nbased completion mechanism.</p>\n<p>The optional modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.multicast::cluster</span></code> allows copying of data from global memory to shared\nmemory of multiple CTAs in the cluster. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">ctaMask</span></code> specifies the destination CTAs in the\ncluster such that each bit position in the 16-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">ctaMask</span></code> operand corresponds to the <code class=\"docutils literal notranslate\"><span class=\"pre\">%ctaid</span></code>\nof the destination CTA. The source data is multicast to the same CTA-relative offset as <code class=\"docutils literal notranslate\"><span class=\"pre\">dstMem</span></code>\nin the shared memory of each destination CTA. The mbarrier signal is also multicast to the same\nCTA-relative offset as <code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code> in the shared memory of the destination CTA.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program. 
The\nqualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is only supported when at least one of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code>\nstatespaces is <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space.</p>\n<p>The copy operation in <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk</span></code> is treated as a weak memory operation and the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation on the mbarrier has <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> semantics at the <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope as described in the\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// .global -&gt; .shared::cluster:\ncp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [mbar];\n\ncp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster\n                                             [dstMem], [srcMem], size, [mbar], ctaMask;\n\ncp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint\n                                             [dstMem], [srcMem], size, [mbar], cache-policy;\n\n\n// .shared::cta -&gt; 
.shared::cluster (strictly remote):\ncp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [mbar];\n\n// .shared::cta -&gt; .global:\ncp.async.bulk.global.shared::cta.bulk_group [dstMem], [srcMem], size;\n\ncp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [dstMem], [srcMem], size, cache-policy;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.bulk.commit_group</h1><section id=\"data-movement-and-conversion-instructions-cp-async-bulk-commit-group\">\n\n\n<p>Commits all prior initiated but uncommitted <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk</span></code> instructions into a\n<em>cp.async.bulk-group</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.async.bulk.commit_group;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.commit_group</span></code> instruction creates a new per-thread <em>bulk async-group</em> and batches\nall prior <code class=\"docutils literal notranslate\"><span class=\"pre\">cp{.reduce}.async.bulk.{.prefetch}{.tensor}</span></code> instructions satisfying the following\nconditions into the new <em>bulk async-group</em>:</p>\n<ul class=\"simple\">\n<li><p>The prior <code class=\"docutils literal notranslate\"><span class=\"pre\">cp{.reduce}.async.bulk.{.prefetch}{.tensor}</span></code> instructions use <em>bulk_group</em> based\ncompletion mechanism, and</p></li>\n<li><p>They are initiated by the executing thread but not committed to any <em>bulk async-group</em>.</p></li>\n</ul>\n<p>If there are no uncommitted <code class=\"docutils literal notranslate\"><span class=\"pre\">cp{.reduce}.async.bulk.{.prefetch}{.tensor}</span></code> instructions then\n<code class=\"docutils literal notranslate\"><span 
class=\"pre\">cp.async.bulk.commit_group</span></code> results in an empty <em>bulk async-group</em>.</p>\n<p>An executing thread can wait for the completion of all\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp{.reduce}.async.bulk.{.prefetch}{.tensor}</span></code> operations in a <em>bulk async-group</em> using\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.wait_group</span></code>.</p>\n<p>There is no memory ordering guarantee provided between any two\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp{.reduce}.async.bulk.{.prefetch}{.tensor}</span></code> operations within the same <em>bulk async-group</em>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.async.bulk.commit_group;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.bulk.prefetch</h1><section id=\"data-movement-and-conversion-instructions-cp-async-bulk-prefetch\">\n\n\n<p>Provides a hint to the system to initiate the asynchronous prefetch of data to the cache.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.async.bulk.prefetch.L2.src{.level::cache_hint}   [srcMem], size {, cache-policy}\n\n.src =                { .global }\n.level::cache_hint =  { .L2::cache_hint }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.prefetch</span></code> is a non-blocking instruction which may initiate an asynchronous prefetch\nof data from the location specified by source address operand <code class=\"docutils literal 
notranslate\"><span class=\"pre\">srcMem</span></code>, in <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> statespace, to\nthe L2 cache.</p>\n<p>The 32-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> specifies the amount of memory to be prefetched in terms of number of\nbytes. <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> must be a multiple of 16. If the value is not a multiple of 16, then the behavior is\nundefined. The memory range <code class=\"docutils literal notranslate\"><span class=\"pre\">[srcMem,</span> <span class=\"pre\">srcMem</span> <span class=\"pre\">+</span> <span class=\"pre\">size</span> <span class=\"pre\">-</span> <span class=\"pre\">1]</span></code> must not overflow the source memory\nspace. Otherwise, the behavior is undefined. The address <code class=\"docutils literal notranslate\"><span class=\"pre\">srcMem</span></code> must be aligned to 16 bytes.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. 
It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.async.bulk.prefetch.L2.global                 [srcMem], size;\n\ncp.async.bulk.prefetch.L2.global.L2::cache_hint  [srcMem], size, policy;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.bulk.prefetch.tensor</h1><section id=\"data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor\">\n\n\n<p>Provides a hint to the system to initiate the asynchronous prefetch of tensor data to the cache.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// global -&gt; shared::cluster:\ncp.async.bulk.prefetch.tensor.dim.L2.src{.load_mode}{.level::cache_hint} [tensorMap, tensorCoords]\n                                                             {, im2colOffsets } {, cache-policy}\n\n.src =                { .global }\n.dim =                { .1d, .2d, .3d, .4d, .5d }\n.load_mode =          { .tile, .im2col }\n.level::cache_hint =  { .L2::cache_hint }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.prefetch.tensor</span></code> is a non-blocking instruction which may initiate an asynchronous\nprefetch of tensor data from the location in <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> statespace to the L2 cache.</p>\n<p>The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorMap</span></code> is the generic address of the opaque 
tensor-map object which resides\neither in <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> space or <code class=\"docutils literal notranslate\"><span class=\"pre\">.const</span></code> space. The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorMap</span></code> specifies the properties\nof the tensor copy operation, as described in <a class=\"reference external\" href=\"#tensor-tensormap\">Tensor-map</a>. Refer to\nthe <em>CUDA programming guide</em> for creating the tensor-map objects on the host side.</p>\n<p>The dimension of the tensor data is specified by the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dim</span></code> modifier.</p>\n<p>The vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorCoords</span></code> specifies the starting coordinates in the tensor data in the\nglobal memory from or to which the copy operation has to be performed. The number of tensor\ncoordinates in the vector argument <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorCoords</span></code> should be equal to the dimension specified by\nthe modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.dim</span></code>. The individual tensor coordinates in <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorCoords</span></code> are of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.load_mode</span></code> specifies how the data in the source location is copied into the\ndestination location. If <code class=\"docutils literal notranslate\"><span class=\"pre\">.load_mode</span></code> is not specified, it defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.tile</span></code>. 
In <code class=\"docutils literal notranslate\"><span class=\"pre\">.tile</span></code>\nmode, the multi-dimensional layout of the source tensor is preserved at the destination. In\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.im2col</span></code> mode, some dimensions of the source tensors are unrolled in a single dimensional column\nat the destination. Details of the <code class=\"docutils literal notranslate\"><span class=\"pre\">im2col</span></code> mode are described in <a class=\"reference external\" href=\"#tensor-im2col-mode\">Im2col mode</a>. In <code class=\"docutils literal notranslate\"><span class=\"pre\">.im2col</span></code> mode, the tensor has to be at least\n3-dimensional. The vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">im2colOffsets</span></code> can be specified only when <code class=\"docutils literal notranslate\"><span class=\"pre\">.load_mode</span></code> is\n.im2col. The length of the vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">im2colOffsets</span></code> is two less than the number of dimension\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.dim</span></code> of the tensor operation.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. 
It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.prefetch.tensor</span></code> is treated as a weak memory operation in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency\nModel</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b16 ctaMask;\n.reg .u16 i2cOffW, i2cOffH, i2cOffD;\n.reg .b64 l2CachePolicy;\n\ncp.async.bulk.prefetch.tensor.1d.L2.global.tile  [tensorMap0, {tc0}];\n\n@p cp.async.bulk.prefetch.tensor.2d.L2.global    [tensorMap1, {tc0, tc1}];\n\n@p cp.async.bulk.prefetch.tensor.5d.L2.global.im2col\n                      [tensorMap2, {tc0, tc1, tc2, tc3, tc4}], {i2cOffW, i2cOffH, i2cOffD};\n\n@p cp.async.bulk.prefetch.tensor.3d.L2.global.im2col.L2::cache_hint\n                      [tensorMap3, {tc0, tc1, tc2}], {i2cOffW}, policy;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.bulk.tensor</h1><section id=\"data-movement-and-conversion-instructions-cp-async-bulk-tensor\">\n\n\n<p>Initiates an asynchronous copy operation on the tensor data from one state space to another.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// global -&gt; shared::cluster:\ncp.async.bulk.tensor.dim.dst.src{.load_mode}.completion_mechanism{.multicast}{.level::cache_hint}\n                                   [dstMem], [tensorMap, tensorCoords], [mbar]{, im2colOffsets}\n                                   {, ctaMask} {, cache-policy}\n\n.dst =                  { 
.shared::cluster }\n.src =                  { .global }\n.dim =                  { .1d, .2d, .3d, .4d, .5d }\n.completion_mechanism = { .mbarrier::complete_tx::bytes }\n.load_mode =            { .tile, .im2col }\n.level::cache_hint =    { .L2::cache_hint }\n.multicast =            { .multicast::cluster  }\n\n\n// shared::cta -&gt; global:\ncp.async.bulk.tensor.dim.dst.src{.load_mode}.completion_mechanism{.level::cache_hint}\n                                   [tensorMap, tensorCoords], [srcMem] {, cache-policy}\n\n.dst =                  { .global }\n.src =                  { .shared::cta }\n.dim =                  { .1d, .2d, .3d, .4d, .5d }\n.completion_mechanism = { .bulk_group }\n.load_mode =            { .tile, .im2col_no_offs }\n.level::cache_hint =    { .L2::cache_hint }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.tensor</span></code> is a non-blocking instruction which initiates an asynchronous copy\noperation of tensor data from the location in <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> state space to the location in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code>\nstate space.</p>\n<p>The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">dstMem</span></code> specifies the location in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code> state space into which the tensor data\nhas to be copied and <code class=\"docutils literal notranslate\"><span class=\"pre\">srcMem</span></code> specifies the location in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> state space from which the\ntensor data has to be copied.</p>\n<p>The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorMap</span></code> is the generic address of the opaque tensor-map object which 
resides\neither in <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> space or <code class=\"docutils literal notranslate\"><span class=\"pre\">.const</span></code> space. The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorMap</span></code> specifies the properties\nof the tensor copy operation, as described in <a class=\"reference external\" href=\"#tensor-tensormap\">Tensor-map</a>. Refer to\nthe <em>CUDA programming guide</em> for creating the tensor-map objects on the host side.</p>\n<p>The dimension of the tensor data is specified by the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dim</span></code> modifier.</p>\n<p>The vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorCoords</span></code> specifies the starting coordinates in the tensor data in the\nglobal memory from or to which the copy operation has to be performed. The number of tensor\ncoordinates in the vector argument <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorCoords</span></code> should be equal to the dimension specified by\nthe modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.dim</span></code>. The individual tensor coordinates in <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorCoords</span></code> are of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>.</p>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.completion_mechanism</span></code> specifies the completion mechanism that is supported on the\ninstruction variant. 
The completion mechanisms that are supported for different variants are\nsummarized in the following table:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 20%\"/>\n<col style=\"width: 20%\"/>\n<col style=\"width: 17%\"/>\n<col style=\"width: 43%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Completion mechanism</p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code></p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code></p></th>\n<th class=\"head\"><p>Description</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td rowspan=\"2\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.mbarrier::...</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code></p></td>\n<td rowspan=\"2\"><p>mbarrier based completion mechanism</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code></p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.bulk_group</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code></p></td>\n<td><p><em>Bulk async-group</em> based completion mechanism</p></td>\n</tr>\n</tbody>\n</table>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.mbarrier::complete_tx::bytes</span></code> specifies that the <code class=\"docutils 
literal notranslate\"><span class=\"pre\">cp.async.bulk.tensor</span></code> variant\nuses mbarrier based completion mechanism. The <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation, with <code class=\"docutils literal notranslate\"><span class=\"pre\">completeCount</span></code> argument equal to amount of data copied in bytes, will be\nperformed on the mbarrier object specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code>.</p>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.bulk_group</span></code> specifies that the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.tensor</span></code> variant uses <em>bulk\nasync-group</em> based completion mechanism.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.load_mode</span></code> specifies how the data in the source location is copied into the\ndestination location. If <code class=\"docutils literal notranslate\"><span class=\"pre\">.load_mode</span></code> is not specified, it defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.tile</span></code>. In <code class=\"docutils literal notranslate\"><span class=\"pre\">.tile</span></code>\nmode, the multi-dimensional layout of the source tensor is preserved at the destination. In\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.im2col</span></code> mode, some dimensions of the source tensors are unrolled in a single dimensional column\nat the destination. Details of the <code class=\"docutils literal notranslate\"><span class=\"pre\">im2col</span></code> mode are described in <a class=\"reference external\" href=\"#tensor-im2col-mode\">Im2col mode</a>. 
In <code class=\"docutils literal notranslate\"><span class=\"pre\">.im2col</span></code> mode, the tensor has to be at least\n3-dimensional. The vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">im2colOffsets</span></code> can be specified only when <code class=\"docutils literal notranslate\"><span class=\"pre\">.load_mode</span></code> is\n.im2col. The length of the vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">im2colOffsets</span></code> is two less than the number of dimension\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.dim</span></code> of the tensor operation. The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.im2col_no_offs</span></code> is the same as <code class=\"docutils literal notranslate\"><span class=\"pre\">.im2col</span></code> mode\nexcept there is no <code class=\"docutils literal notranslate\"><span class=\"pre\">im2colOffsets</span></code> vector involved.</p>\n<p>The optional modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.multicast::cluster</span></code> allows copying of data from global memory to shared\nmemory of multiple CTAs in the cluster. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">ctaMask</span></code> specifies the destination CTAs in the\ncluster such that each bit position in the 16-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">ctaMask</span></code> operand corresponds to the <code class=\"docutils literal notranslate\"><span class=\"pre\">%ctaid</span></code>\nof the destination CTA. The source data is multicast to the same offset as <code class=\"docutils literal notranslate\"><span class=\"pre\">dstMem</span></code> in the shared\nmemory of each destination CTA. 
The mbarrier signal is also multicast to the same offset as <code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code>\nin the shared memory of the destination CTA.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program. The\nqualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is only supported when at least one of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code>\nstatespaces is <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space.</p>\n<p>The copy operation in <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.tensor</span></code> is treated as a weak memory operation and the\n<a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation on the mbarrier has <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> semantics at the <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope as described in the\n<a class=\"reference external\" 
href=\"#memory-consistency-model\">Memory Consistency Model</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b16 ctaMask;\n.reg .u16 i2cOffW, i2cOffH, i2cOffD;\n.reg .b64 l2CachePolicy;\n\ncp.async.bulk.tensor.1d.shared::cluster.global.tile  [sMem0], [tensorMap0, {tc0}], [mbar0];\n\n@p cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster\n                     [sMem1], [tensorMap1, {tc0, tc1}], [mbar2], ctaMask;\n\n@p cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes\n                     [sMem2], [tensorMap2, {tc0, tc1, tc2, tc3, tc4}], [mbar2], {i2cOffW, i2cOffH, i2cOffD};\n\n@p cp.async.bulk.tensor.3d.im2col.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint\n                     [sMem3], [tensorMap3, {tc0, tc1, tc2}], [mbar3], {i2cOffW}, policy;\n\n@p cp.async.bulk.tensor.1d.global.shared::cta.bulk_group  [tensorMap3, {tc0}], [sMem3];\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.bulk.wait_group</h1><section id=\"data-movement-and-conversion-instructions-cp-async-bulk-wait-group\">\n\n\n<p>Wait for completion of <em>bulk async-groups</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.async.bulk.wait_group{.read} N;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.wait_group</span></code> instruction will cause the executing thread to wait until only N or\nfewer of the most recent <em>bulk async-groups</em> are pending and all the prior 
<em>bulk async-groups</em>\ncommitted by the executing threads are complete. For example, when N is 0, the executing thread\nwaits on all the prior <em>bulk async-groups</em> to complete. Operand N is an integer constant.</p>\n<p>By default, <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.wait_group</span></code> instruction will cause the executing thread to wait till\nall the bulk async operations in the specified <em>bulk async-group</em> have completed all of the\nfollowing:</p>\n<ul class=\"simple\">\n<li><p>Reading from the source locations.</p></li>\n<li><p>Writing to their respective destination locations.</p></li>\n<li><p>Writes being made visible to the executing thread.</p></li>\n</ul>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.read</span></code> modifier indicates that the waiting has to be done until all the bulk async\noperations in the specified <em>bulk async-group</em> have completed reading from their source locations.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.async.bulk.wait_group.read   0;\ncp.async.bulk.wait_group        2;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.commit_group</h1><section id=\"data-movement-and-conversion-instructions-cp-async-commit-group\">\n\n\n<p>Commits all prior initiated but uncommitted <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> instructions into a <em>cp.async-group</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.async.commit_group 
;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.commit_group</span></code> instruction creates a new <em>cp.async-group</em> per thread and batches all\nprior <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> instructions initiated by the executing thread but not committed to any\n<em>cp.async-group</em> into the new <em>cp.async-group</em>. If there are no uncommitted <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code>\ninstructions then <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.commit_group</span></code> results in an empty <em>cp.async-group.</em></p>\n<p>An executing thread can wait for the completion of all <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> operations in a <em>cp.async-group</em>\nusing <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code>.</p>\n<p>There is no memory ordering guarantee provided between any two <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> operations within the\nsame <em>cp.async-group</em>. 
So two or more <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> operations within a <em>cp.async-group</em> copying data\nto the same location results in undefined behavior.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// Example 1:\ncp.async.ca.shared.global [shrd], [gbl], 4;\ncp.async.commit_group ; // Marks the end of a cp.async group\n\n// Example 2:\ncp.async.ca.shared.global [shrd1],   [gbl1],   8;\ncp.async.cg.shared.global [shrd1+8], [gbl1+8], 8;\ncp.async.commit_group ; // Marks the end of cp.async group 1\n\ncp.async.ca.shared.global [shrd2],    [gbl2],    16;\ncp.async.cg.shared.global [shrd2+16], [gbl2+16], 16;\ncp.async.commit_group ; // Marks the end of cp.async group 2\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: cp.async.mbarrier.arrive</h1><section id=\"parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive\">\n\n\n<p>Makes the <em>mbarrier object</em> track all prior <a class=\"reference external\" href=\"#data-movement-and-conversion-instructions-cp-async\">cp.async</a> operations initiated by the\nexecuting thread.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.async.mbarrier.arrive{.noinc}{.shared{::cta}}.b64 [addr];\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Causes an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> to be\ntriggered by the system on the <em>mbarrier object</em> upon the completion of all prior <a class=\"reference 
external\" href=\"#data-movement-and-conversion-instructions-cp-async\">cp.async</a> operations initiated by the\nexecuting thread. The <em>mbarrier object</em> is at the location specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>. The\n<a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> is\nasynchronous to execution of <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.mbarrier.arrive</span></code>.</p>\n<p>When <code class=\"docutils literal notranslate\"><span class=\"pre\">.noinc</span></code> modifier is not specified, the pending count of the mbarrier object is incremented\nby 1 prior to the asynchronous <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a>. This\nresults in a zero-net change for the pending count from the asynchronous <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a> operation\nduring the current phase. The pending count of the <em>mbarrier object</em> after the increment should not\nexceed the limit as mentioned in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier object</a>. Otherwise,\nthe behavior is undefined.</p>\n<p>When the <code class=\"docutils literal notranslate\"><span class=\"pre\">.noinc</span></code> modifier is specified, the increment to the pending count of the <em>mbarrier\nobject</em> is not performed. 
Hence the decrement of the pending count done by the asynchronous\n<a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> must be\naccounted for in the initialization of the <em>mbarrier object</em>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// Example 1: no .noinc\nmbarrier.init.shared.b64 [shMem], threadCount;\n....\ncp.async.ca.shared.global [shard1], [gbl1], 
4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n....\n// Absence of .noinc accounts for arrive-on from completion of prior cp.async operations.\n// So mbarrier.init must only account for arrive-on from mbarrier.arrive.\ncp.async.mbarrier.arrive.shared.b64 [shMem];\n....\nmbarrier.arrive.shared.b64 state, [shMem];\n\nwaitLoop:\nmbarrier.test_wait.shared.b64 p, [shMem], state;\n@!p bra waitLoop;\n\n\n\n// Example 2: with .noinc\n\n// Tracks arrive-on from mbarrier.arrive and cp.async.mbarrier.arrive.\n\n// All threads participating in the mbarrier perform cp.async\nmov.b32 copyOperationCnt, threadCount;\n\n// 3 arrive-on operations will be triggered per-thread\nmul.lo.u32 copyArrivalCnt, copyOperationCnt, 3;\n\nadd.u32 totalCount, threadCount, copyArrivalCnt;\n\nmbarrier.init.shared.b64 [shMem], totalCount;\n....\ncp.async.ca.shared.global [shard1], [gbl1], 4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n...\n// Presence of .noinc requires mbarrier initalization to have accounted for arrive-on from cp.async\ncp.async.mbarrier.arrive.noinc.shared.b64 [shMem]; // 1st instance\n....\ncp.async.ca.shared.global [shard3], [gbl3], 4;\ncp.async.ca.shared.global [shard4], [gbl4], 16;\ncp.async.mbarrier.arrive.noinc.shared::cta.b64 [shMem]; // 2nd instance\n....\ncp.async.ca.shared.global [shard5], [gbl5], 4;\ncp.async.cg.shared.global [shard6], [gbl6], 16;\ncp.async.mbarrier.arrive.noinc.shared.b64 [shMem]; // 3rd and last instance\n....\nmbarrier.arrive.shared.b64 state, [shMem];\n\nwaitLoop:\nmbarrier.test_wait.shared.b64 p, [shMem], state;\n@!p bra waitLoop;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.wait_group / cp.async.wait_all</h1><section id=\"data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all\">\n<span id=\"data-movement-and-conversion-instructions-cp-async-wait-group\"></span>\n\n<p>Wait for completion of prior asynchronous copy 
operations.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.async.wait_group N;\ncp.async.wait_all ;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> instruction will cause executing thread to wait till only <code class=\"docutils literal notranslate\"><span class=\"pre\">N</span></code> or fewer of\nthe most recent <em>cp.async-group</em>s are pending and all the prior <em>cp.async-group</em>s committed by\nthe executing threads are complete. For example, when <code class=\"docutils literal notranslate\"><span class=\"pre\">N</span></code> is 0, the executing thread waits on all\nthe prior <em>cp.async-group</em>s to complete. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">N</span></code> is an integer constant.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> is equivalent to :</p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.async.commit_group;\ncp.async.wait_group 0;\n</pre></div>\n</div>\n<p>An empty <em>cp.async-group</em> is considered to be trivially complete.</p>\n<p>Writes performed by <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> operations are made visible to the executing thread only after:</p>\n<ol class=\"arabic simple\">\n<li><p>The completion of <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> or</p></li>\n<li><p>The completion of <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> on the <em>cp.async-group</em> in which the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code>\nbelongs to or</p></li>\n<li><p><a class=\"reference external\" 
href=\"#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait\">mbarrier.test_wait</a>\nreturns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> on an <em>mbarrier object</em> which is tracking the completion of the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code>\noperation.</p></li>\n</ol>\n<p>There is no ordering between two <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> operations that are not synchronized with\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> or <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier\">mbarrier objects</a>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> does not provide any ordering and visibility\nguarantees for any other memory operation apart from <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// Example of .wait_all:\ncp.async.ca.shared.global [shrd1], [gbl1], 4;\ncp.async.cg.shared.global [shrd2], [gbl2], 16;\ncp.async.wait_all;  // waits for all prior cp.async to complete\n\n// Example of .wait_group :\ncp.async.ca.shared.global [shrd3], [gbl3], 8;\ncp.async.commit_group;  // End of group 1\n\ncp.async.cg.shared.global [shrd4], 
[gbl4], 16;\ncp.async.commit_group;  // End of group 2\n\ncp.async.cg.shared.global [shrd5], [gbl5], 16;\ncp.async.commit_group;  // End of group 3\n\ncp.async.wait_group 1;  // waits for group 1 and group 2 to complete\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.async.wait_group / cp.async.wait_all</h1><section id=\"data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all\">\n<span id=\"data-movement-and-conversion-instructions-cp-async-wait-group\"></span>\n\n<p>Wait for completion of prior asynchronous copy operations.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.async.wait_group N;\ncp.async.wait_all ;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> instruction will cause executing thread to wait till only <code class=\"docutils literal notranslate\"><span class=\"pre\">N</span></code> or fewer of\nthe most recent <em>cp.async-group</em>s are pending and all the prior <em>cp.async-group</em>s committed by\nthe executing threads are complete. For example, when <code class=\"docutils literal notranslate\"><span class=\"pre\">N</span></code> is 0, the executing thread waits on all\nthe prior <em>cp.async-group</em>s to complete. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">N</span></code> is an integer constant.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> is equivalent to :</p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.async.commit_group;\ncp.async.wait_group 0;\n</pre></div>\n</div>\n<p>An empty <em>cp.async-group</em> is considered to be trivially complete.</p>\n<p>Writes performed by <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> operations are made visible to the executing thread only after:</p>\n<ol class=\"arabic simple\">\n<li><p>The completion of <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> or</p></li>\n<li><p>The completion of <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> on the <em>cp.async-group</em> in which the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code>\nbelongs to or</p></li>\n<li><p><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait\">mbarrier.test_wait</a>\nreturns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> on an <em>mbarrier object</em> which is tracking the completion of the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code>\noperation.</p></li>\n</ol>\n<p>There is no ordering between two <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> operations that are not synchronized with\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> or <a class=\"reference external\" 
href=\"#parallel-synchronization-and-communication-instructions-mbarrier\">mbarrier objects</a>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_group</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.wait_all</span></code> does not provide any ordering and visibility\nguarantees for any other memory operation apart from <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// Example of .wait_all:\ncp.async.ca.shared.global [shrd1], [gbl1], 4;\ncp.async.cg.shared.global [shrd2], [gbl2], 16;\ncp.async.wait_all;  // waits for all prior cp.async to complete\n\n// Example of .wait_group :\ncp.async.ca.shared.global [shrd3], [gbl3], 8;\ncp.async.commit_group;  // End of group 1\n\ncp.async.cg.shared.global [shrd4], [gbl4], 16;\ncp.async.commit_group;  // End of group 2\n\ncp.async.cg.shared.global [shrd5], [gbl5], 16;\ncp.async.commit_group;  // End of group 3\n\ncp.async.wait_group 1;  // waits for group 1 and group 2 to complete\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.reduce.async.bulk</h1><section id=\"data-movement-and-conversion-instructions-cp-reduce-async-bulk\">\n\n\n<p>Initiates an asynchronous reduction operation.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.reduce.async.bulk.dst.src.completion_mechanism.redOp.type\n              [dstMem], [srcMem], size, [mbar]\n\n.dst =                  { .shared::cluster }\n.src =                  { .shared::cta 
}\n.completion_mechanism = { .mbarrier::complete_tx::bytes }\n.redOp=                 { .and, .or, .xor,\n                          .add, .inc, .dec,\n                          .min, .max }\n.type =                 { .b32, .u32, .s32, .b64, .u64 }\n\n\ncp.reduce.async.bulk.dst.src.completion_mechanism{.level::cache_hint}.redOp.type\n               [dstMem], [srcMem], size{, cache-policy}\n\n.dst =                  { .global      }\n.src =                  { .shared::cta }\n.completion_mechanism = { .bulk_group }\n.level::cache_hint    = { .L2::cache_hint }\n.redOp=                 { .and, .or, .xor,\n                          .add, .inc, .dec,\n                          .min, .max }\n.type =                 { .f16, .bf16, .b32, .u32, .s32, .b64, .u64, .s64, .f32, .f64 }\n\n\ncp.reduce.async.bulk.dst.src.completion_mechanism{.level::cache_hint}.add.noftz.type\n               [dstMem], [srcMem], size{, cache-policy}\n.dst  =                 { .global }\n.src  =                 { .shared::cta }\n.completion_mechanism = { .bulk_group }\n.type =                 { .f16, .bf16 }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk</span></code> is a non-blocking instruction which initiates an asynchronous reduction\noperation on an array of memory locations specified by the destination address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">dstMem</span></code>\nwith the source array whose location is specified by the source address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">srcMem</span></code>. 
The size\nof the source and the destination array must be the same and is specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code>.</p>\n<p>Each data element in the destination array is reduced inline with the corresponding data element in\nthe source array with the reduction operation specified by the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.redOp</span></code>. The type of each\ndata element in the source and the destination array is specified by the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.type</span></code>.</p>\n<p>The source address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">srcMem</span></code> is located in the state space specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> and the\ndestination address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">dstMem</span></code> is located in the state specified by the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code>.</p>\n<p>The 32-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> specifies the amount of memory to be copied from the source location and\nused in the reduction operation, in terms of number of bytes. <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> must be a multiple of 16. If\nthe value is not a multiple of 16, then the behavior is undefined. 
The memory range <code class=\"docutils literal notranslate\"><span class=\"pre\">[dstMem,</span>\n<span class=\"pre\">dstMem</span> <span class=\"pre\">+</span> <span class=\"pre\">size</span> <span class=\"pre\">-</span> <span class=\"pre\">1]</span></code> must not overflow the destination memory space and the memory range <code class=\"docutils literal notranslate\"><span class=\"pre\">[srcMem,</span>\n<span class=\"pre\">srcMem</span> <span class=\"pre\">+</span> <span class=\"pre\">size</span> <span class=\"pre\">-</span> <span class=\"pre\">1]</span></code> must not overflow the source memory space. Otherwise, the behavior is\nundefined. The addresses <code class=\"docutils literal notranslate\"><span class=\"pre\">dstMem</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">srcMem</span></code> must be aligned to 16 bytes.</p>\n<p>The operations supported by <code class=\"docutils literal notranslate\"><span class=\"pre\">.redOp</span></code> are classified as follows:</p>\n<ul class=\"simple\">\n<li><p>The bit-size operations are <code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code>.</p></li>\n<li><p>The integer operations are <code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code>. 
The <code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code> operations return a result in the range <code class=\"docutils literal notranslate\"><span class=\"pre\">[0..x]</span></code> where <code class=\"docutils literal notranslate\"><span class=\"pre\">x</span></code> is the value at the source\nstate space.</p></li>\n<li><p>The floating point operation <code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code> rounds to the nearest even. The current implementation of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk.add.f32</span></code> flushes subnormal inputs and results to sign-preserving zero. The\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk.add.f16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk.add.bf16</span></code> operations require\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.noftz</span></code> qualifier. 
It preserves input and result subnormals, and does not flush them to zero.</p></li>\n</ul>\n<p>The following table describes the valid combinations of <code class=\"docutils literal notranslate\"><span class=\"pre\">.redOp</span></code> and element type:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 18%\"/>\n<col style=\"width: 24%\"/>\n<col style=\"width: 58%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code></p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.redOp</span></code></p></th>\n<th class=\"head\"><p>Element type</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td rowspan=\"4\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code></p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code></p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span 
class=\"pre\">.u32</span></code></p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code></p></td>\n</tr>\n<tr class=\"row-even\"><td rowspan=\"4\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code></p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal 
notranslate\"><span class=\"pre\">.bf16</span></code></p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code></p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code></p></td>\n</tr>\n</tbody>\n</table>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.completion_mechanism</span></code> specifies the completion mechanism that is supported on the\ninstruction variant. 
The completion mechanisms that are supported for different variants are\nsummarized in the following table:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 20%\"/>\n<col style=\"width: 20%\"/>\n<col style=\"width: 17%\"/>\n<col style=\"width: 43%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Completion mechanism</p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code></p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code></p></th>\n<th class=\"head\"><p>Description</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td rowspan=\"2\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.mbarrier::...</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code></p></td>\n<td rowspan=\"2\"><p>mbarrier based completion mechanism</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code></p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.bulk_group</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code></p></td>\n<td><p><em>Bulk async-group</em> based completion mechanism</p></td>\n</tr>\n</tbody>\n</table>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.mbarrier::complete_tx::bytes</span></code> specifies that the <code class=\"docutils 
literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk</span></code> variant\nuses mbarrier based completion mechanism. The <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation, with <code class=\"docutils literal notranslate\"><span class=\"pre\">completeCount</span></code> argument equal to amount of data copied in bytes, will be\nperformed on the mbarrier object specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code>.</p>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.bulk_group</span></code> specifies that the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk</span></code> variant uses <em>bulk\nasync-group</em> based completion mechanism.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program. 
The\nqualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is only supported when at least one of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code>\nstatespaces is <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space.</p>\n<p>Each reduction operation performed by the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk</span></code> has individually <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed.gpu</span></code>\nmemory ordering semantics. The load operations in <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk</span></code> are treated as weak\nmemory operation and the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation on the mbarrier has <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> semantics at the <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope as described in the\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.add.u64\n                                                                  [dstMem], [srcMem], size, 
[mbar];\n\ncp.reduce.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes.min.s32\n                                                                  [dstMem], [srcMem], size, [mbar];\n\ncp.reduce.async.bulk.global.shared::cta.bulk_group.min.f16 [dstMem], [srcMem], size;\n\ncp.reduce.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.xor.s32 [dstMem], [srcMem], size, policy;\n\ncp.reduce.async.bulk.global.shared::cta.bulk_group.add.noftz.f16 [dstMem], [srcMem], size;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cp.reduce.async.bulk.tensor</h1><section id=\"data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor\">\n\n\n<p>Initiates an asynchronous reduction operation on the tensor data.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// shared::cta -&gt; global:\ncp.reduce.async.bulk.tensor.dim.dst.src.redOp{.load_mode}.completion_mechanism{.level::cache_hint}\n                                          [tensorMap, tensorCoords], [srcMem] {,cache-policy}\n\n.dst =                  { .global }\n.src =                  { .shared::cta }\n.dim =                  { .1d, .2d, .3d, .4d, .5d }\n.completion_mechanism = { .bulk_group }\n.load_mode =            { .tile, .im2col_no_offs }\n.redOp =                { .add, .min, .max, .inc, .dec, .and, .or, .xor}\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk.tensor</span></code> is a non-blocking instruction which initiates an asynchronous\nreduction operation of tensor data in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code> state space with tensor data in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code>\nstate space.</p>\n<p>The operand <code class=\"docutils literal notranslate\"><span 
class=\"pre\">srcMem</span></code> specifies the location of the tensor data in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> state space using\nwhich the reduction operation has to be performed.</p>\n<p>The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorMap</span></code> is the generic address of the opaque tensor-map object which resides\neither in <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> space or <code class=\"docutils literal notranslate\"><span class=\"pre\">.const</span></code> space. The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorMap</span></code> specifies the properties\nof the tensor reduce operation, as described in <a class=\"reference external\" href=\"#tensor-tensormap\">Tensor-map</a>. Refer\nto the <em>CUDA programming guide</em> for creating the tensor-map objects on the host side.</p>\n<p>Each element of the tensor data in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code> state space is reduced inline with the corresponding\nelement from the tensor data in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> state space. The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.redOp</span></code> specifies the\nreduction operation used for the inline reduction. 
The type of each tensor data element in the\nsource and the destination tensor is specified in <a class=\"reference external\" href=\"#tensor-tensormap\">Tensor-map</a>.</p>\n<p>The dimension of the tensor is specified by the <code class=\"docutils literal notranslate\"><span class=\"pre\">.dim</span></code> modifier.</p>\n<p>The vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorCoords</span></code> specifies the starting coordinates of the tensor data in the\nglobal memory on which the reduce operation is to be performed. The number of tensor coordinates in\nthe vector argument <code class=\"docutils literal notranslate\"><span class=\"pre\">tensorCoords</span></code> should be equal to the dimension specified by the modifier\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.dim</span></code>. The individual tensor coordinates are of the type <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>.</p>\n<p>The following table describes the valid combinations of <code class=\"docutils literal notranslate\"><span class=\"pre\">.redOp</span></code> and element type:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 32%\"/>\n<col style=\"width: 68%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.redOp</span></code></p></th>\n<th class=\"head\"><p>Element type</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>, 
<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code></p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code></p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code></p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code></p></td>\n</tr>\n</tbody>\n</table>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.completion_mechanism</span></code> specifies the completion mechanism that is supported on the\ninstruction variant. 
Value <code class=\"docutils literal notranslate\"><span class=\"pre\">.bulk_group</span></code> of the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.completion_mechanism</span></code> specifies that\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk.tensor</span></code> instruction uses <em>bulk async-group</em> based completion mechanism.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.load_mode</span></code> specifies how the data in the source location is copied into the\ndestination location. If <code class=\"docutils literal notranslate\"><span class=\"pre\">.load_mode</span></code> is not specified, it defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.tile</span></code>. In <code class=\"docutils literal notranslate\"><span class=\"pre\">.tile</span></code>\nmode, the multi-dimensional layout of the source tensor is preserved at the destination. In\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.im2col_no_offs</span></code> mode, some dimensions of the source tensors are unrolled in a single dimensional\ncolumn at the destination. Details of the <code class=\"docutils literal notranslate\"><span class=\"pre\">im2col</span></code> mode are described in <a class=\"reference external\" href=\"#tensor-im2col-mode\">Im2col mode</a>. In <code class=\"docutils literal notranslate\"><span class=\"pre\">.im2col</span></code> mode, the tensor has to be at least\n3-dimensional.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. 
The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program. The\nqualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is only supported when at least one of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.src</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.dst</span></code>\nstatespaces is <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space.</p>\n<p>Each reduction operation performed by <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk.tensor</span></code> has individually\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed.gpu</span></code> memory ordering semantics. 
The load operations in <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.reduce.async.bulk.tensor</span></code>\nare treated as weak memory operations and the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation on the mbarrier has <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> semantics at the <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope as described in the\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group\n                                             [tensorMap0, {tc0}], [sMem0];\n\ncp.reduce.async.bulk.tensor.2d.global.shared::cta.and.bulk_group.L2::cache_hint\n                                             [tensorMap1, {tc0, tc1}], [sMem1] , policy;\n\ncp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.im2col.bulk_group\n                                             [tensorMap2, {tc0, tc1, tc2}], [sMem2]\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Data Movement and Conversion Instructions: cp.async\n\n\n\nInitiates an asynchronous copy operation from one state space to another.\n\nSyntax\n\ncp.async.ca.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n\n                         [dst], [src], cp-size{, src-size}{, cache-policy} ;\n\ncp.async.cg.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size}\n\n                         [dst], [src], 16{, src-size}{, cache-policy} ;\n\ncp.async.ca.shared...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk\n\n\n\nInitiates an asynchronous copy operation from one state space to another.\n\nSyntax\n\ncp.async.bulk.dst.src.completion_mechanism{.multicast}{.level::cache_hint}\n\n                      [dstMem], [srcMem], size, [mbar] {, ctaMask} {, cache-policy}\n\n.dst =                  { .shared::cluster }\n\n.src =                  { .global }\n\n.completion_mechanism = { .mbarrier::complete_tx::bytes }\n\n.level::cache_hint = ...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk.commit_group\n\n\n\nCommits all prior initiated but uncommitted cp.async.bulk instructions into a\n\ncp.async.bulk-group.\n\nSyntax\n\ncp.async.bulk.commit_group;\n\nDescription\n\ncp.async.bulk.commit_group instruction creates a new per-thread bulk async-group and batches\n\nall prior cp{.reduce}.async.bulk.{.prefetch}{.tensor} instructions satisfying the following\n\nconditions into the new bulk async-group:\n\nThe prior cp{.reduce}.async...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk.prefetch\n\n\n\nProvides a hint to the system to initiate the asynchronous prefetch of data to the cache.\n\nSyntax\n\ncp.async.bulk.prefetch.L2.src{.level::cache_hint}   [srcMem], size {, cache-policy}\n\n.src =                { .global }\n\n.level::cache_hint =  { .L2::cache_hint }\n\nDescription\n\ncp.async.bulk.prefetch is a non-blocking instruction which may initiate an asynchronous prefetch\n\nof data 
from the location specifie...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk.prefetch.tensor\n\n\n\nProvides a hint to the system to initiate the asynchronous prefetch of tensor data to the cache.\n\nSyntax\n\n// global -> shared::cluster:\n\ncp.async.bulk.prefetch.tensor.dim.L2.src{.load_mode}{.level::cache_hint} [tensorMap, tensorCoords]\n\n                                                             {, im2colOffsets } {, cache-policy}\n\n.src =                { .global }\n\n.dim =                { .1d, .2d, .3...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk.tensor\n\n\n\nInitiates an asynchronous copy operation on the tensor data from one state space to another.\n\nSyntax\n\n// global -> shared::cluster:\n\ncp.async.bulk.tensor.dim.dst.src{.load_mode}.completion_mechanism{.multicast}{.level::cache_hint}\n\n                                   [dstMem], [tensorMap, tensorCoords], [mbar]{, im2colOffsets}\n\n                                   {, ctaMask} {, cache-policy}\n\n.dst =      ...\n\n=====Data Movement and Conversion Instructions: cp.async.bulk.wait_group\n\n\n\nWait for completion of bulk async-groups.\n\nSyntax\n\ncp.async.bulk.wait_group{.read} N;\n\nDescription\n\ncp.async.bulk.wait_group instruction will cause the executing thread to wait until only N or\n\nfewer of the most recent bulk async-groups are pending and all the prior bulk async-groups\n\ncommitted by the executing threads are complete. For example, when N is 0, the executing thread\n\nwaits on all the prior b...\n\n=====Data Movement and Conversion Instructions: cp.async.commit_group\n\n\n\nCommits all prior initiated but uncommitted cp.async instructions into a cp.async-group.\n\nSyntax\n\ncp.async.commit_group ;\n\nDescription\n\ncp.async.commit_group instruction creates a new cp.async-group per thread and batches all\n\nprior cp.async instructions initiated by the executing thread but not committed to any\n\ncp.async-group into the new cp.async-group. 
If there are no uncommitted cp.async\n\ninstructio...\n\n=====Parallel Synchronization and Communication Instructions: cp.async.mbarrier.arrive\n\n\n\nMakes the mbarrier object track all prior cp.async operations initiated by the\n\nexecuting thread.\n\nSyntax\n\ncp.async.mbar ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async"
            };

        case "createpolicy":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-createpolicy\" target=\"_blank\" rel=\"noopener noreferrer\">createpolicy <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: createpolicy</h1><section id=\"data-movement-and-conversion-instructions-createpolicy\">\n\n\n<p>Create a cache eviction policy for the specified cache level.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// Range-based policy\ncreatepolicy.range{.global}.level::primary_priority{.level::secondary_priority}.b64\n                                   cache-policy, [a], primary-size, total-size;\n\n// Fraction-based policy\ncreatepolicy.fractional.level::primary_priority{.level::secondary_priority}.b64\n                                   cache-policy{, fraction};\n\n// Converting the access property from CUDA APIs\ncreatepolicy.cvt.L2.b64            cache-policy, access-property;\n\n.level::primary_priority =   { .L2::evict_last, .L2::evict_normal,\n                               .L2::evict_first, .L2::evict_unchanged };\n.level::secondary_priority = { .L2::evict_first, .L2::evict_unchanged };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">createpolicy</span></code> instruction creates a cache eviction policy for the specified cache level in an\nopaque 64-bit register specified by the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code>. 
The cache eviction\npolicy specifies how cache eviction priorities are applied to global memory addresses used in memory\noperations with <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> qualifier.</p>\n<p>There are two types of cache eviction policies:</p>\n<ul>\n<li><p>Range-based policy</p>\n<p>The cache eviction policy created using <code class=\"docutils literal notranslate\"><span class=\"pre\">createpolicy.range</span></code> specifies the cache eviction\nbehaviors for the following three address ranges:</p>\n<ul class=\"simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">[a</span> <span class=\"pre\">..</span> <span class=\"pre\">a</span> <span class=\"pre\">+</span> <span class=\"pre\">(primary-size</span> <span class=\"pre\">-</span> <span class=\"pre\">1)]</span></code> referred to as primary range.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">[a</span> <span class=\"pre\">+</span> <span class=\"pre\">primary-size</span> <span class=\"pre\">..</span> <span class=\"pre\">a</span> <span class=\"pre\">+</span> <span class=\"pre\">(total-size</span> <span class=\"pre\">-</span> <span class=\"pre\">1)]</span></code> referred to as trailing secondary range.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">[a</span> <span class=\"pre\">-</span> <span class=\"pre\">(total-size</span> <span class=\"pre\">-</span> <span class=\"pre\">primary-size)</span> <span class=\"pre\">..</span> <span class=\"pre\">(a</span> <span class=\"pre\">-</span> <span class=\"pre\">1)]</span></code> referred to as preceding secondary range.</p></li>\n</ul>\n<p>When a range-based cache eviction policy is used in a memory operation with\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> qualifier, the eviction priorities are applied as follows:</p>\n<ul class=\"simple\">\n<li><p>If the memory 
address falls in the primary range, the eviction priority specified by\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::primary_priority</span></code> is applied.</p></li>\n<li><p>If the memory address falls in any of the secondary ranges, the eviction priority specified by\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::secondary_priority</span></code> is applied.</p></li>\n<li><p>If the memory address does not fall in either of the above ranges, then the applied eviction\npriority is unspecified.</p></li>\n</ul>\n<p>The 32-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">primary-size</span></code> specifies the size, in bytes, of the primary range. The\n32-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">total-size</span></code> specifies the combined size, in bytes, of the address range\nincluding primary and secondary ranges. The value of <code class=\"docutils literal notranslate\"><span class=\"pre\">primary-size</span></code> must be less than or equal\nto the value of <code class=\"docutils literal notranslate\"><span class=\"pre\">total-size</span></code>. Maximum allowed value of <code class=\"docutils literal notranslate\"><span class=\"pre\">total-size</span></code> is 4GB.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::secondary_priority</span></code> is not specified, then it defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::evict_unchanged</span></code>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. 
If the specified address does not fall within the address window of <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space\nthen the behavior is undefined.</p>\n</li>\n<li><p>Fraction-based policy</p>\n<p>A memory operation with <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> qualifier can use the fraction-based cache\neviction policy to request the cache eviction priority specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.L2:primary_priority</span></code> to\nbe applied to a fraction of cache accesses specified by the 32-bit floating point operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">fraction</span></code>. The remainder of the cache accesses get the eviction priority specified by\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::secondary_priority</span></code>. This implies that in a memory operation that uses a fraction-based\ncache policy, the memory access has a probability specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">fraction</span></code> of\ngetting the cache eviction priority specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::primary_priority</span></code>.</p>\n<p>The valid range of values for the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">fraction</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">(0.0,..,</span> <span class=\"pre\">1.0]</span></code>. 
If the operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">fraction</span></code> is not specified, it defaults to 1.0.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::secondary_priority</span></code> is not specified, then it defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::evict_unchanged</span></code>.</p>\n</li>\n</ul>\n<p>The access property created using the CUDA APIs can be converted into cache eviction policy by the\ninstruction <code class=\"docutils literal notranslate\"><span class=\"pre\">createpolicy.cvt</span></code>. The source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">access-property</span></code> is a 64-bit opaque\nregister. Refer to <em>CUDA programming guide</em> for more details.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.4.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>createpolicy.fractional.L2::evict_last.b64                      policy, 1.0;\ncreatepolicy.fractional.L2::evict_last.L2::evict_unchanged.b64  policy, 0.5;\n\ncreatepolicy.range.L2::evict_last.L2::evict_first.b64\n                                            policy, [ptr], 0x100000, 0x200000;\n\n// access-prop is created by CUDA APIs.\ncreatepolicy.cvt.L2.b64 policy, access-prop;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Create a cache eviction policy for the specified cache level.\n\nSyntax\n\n// Range-based policy\n\ncreatepolicy.range{.global}.level::primary_priority{.level::secondary_priority}.b64\n\n                                   cache-policy, [a], primary-size, total-size;\n\n// Fraction-based policy\n\ncreatepolicy.fractional.level::primary_priority{.level::secondary_priority}.b64\n\n                                   cache-policy{, fraction};\n\n// Converting the access property from CUDA APIs\n\ncreatepolicy.cvt.L2.b64            cache-policy, access-property;\n\n.level::primary_priority =   { .L2::evict_last, .L2::evict_normal,\n\n                               .L2::evict_first, .L2::evict_unchanged };\n\n.level::secondary_priority = { .L2::evict_first, .L2::evict_unchanged };\n\nDescription\n\nThe createpolicy instruction creates a cache eviction policy for the specified cache level in an\n\nopaque 64-bit register specified by the destination operand cache-policy. The cache eviction\n\npolicy specifies how cache eviction priorities are applied to global memory addresses used in memory\n\noperations with .level::cache_hint qualifier.\n\nThere are two types of cache eviction policies:\n\nRange-based policy\n\nThe cache eviction policy created using createpolicy.range specifies the cache eviction\n\nbehaviors for the following three address ranges:\n\n[a .. a + (primary-size - 1)] referred to as primary range.\n\n[a + primary-size .. a + (total-size - 1)] referred to as trailing secondary range.\n\n[a - (total-size - primary-size) .. 
(a - 1)] referred to as preceding secondary range.\n\nWhen a range-based cache eviction policy is used in a memory operation with\n\n.level::cache_hint qualifier, the eviction priorities are applied as follows:\n\nIf the memory address falls in the primary range, the eviction priority specified by\n\n.L2::primary_priority is applied.\n\nIf the memory address falls in any of the secondary ranges, the eviction priority specified by\n\n.L2::secondary_priority is applied.\n\nIf the memory address does not fall in either of the above ranges, then the applied eviction\n\npriority is unspecified.\n\nThe 32-bit operand primary-size specifies the size, in bytes, of the primary range. The\n\n32-bit operand total-size specifies the combined size, in bytes, of the address range\n\nincluding primary and secondary ranges. The value of primary-size must be less than or equal\n\nto the value of total-size. Maximum allowed value of total-size is 4GB.\n\nIf .L2::secondary_priority is not specified, then it defaults to .L2::evict_unchanged.\n\nIf no state space is specified then Generic Addressing is\n\nused. If the specified address does not fall within the address window of .global state space\n\nthen the behavior is undefined.\n\nFraction-based policy\n\nA memory operation with .level::cache_hint qualifier can use the fraction-based cache\n\neviction policy to request the cache eviction priority specified by .L2:primary_priority to\n\nbe applied to a fraction of cache accesses specified by the 32-bit floating point operand\n\nfraction. The remainder of the cache accesses get the eviction priority specified by\n\n.L2::secondary_priority. This implies that in a memory operation that uses a fraction-based\n\ncache policy, the memory access has a probability specified by the operand fraction of\n\ngetting the cache eviction priority specified by .L2::primary_priority.\n\nThe valid range of values for the operand fraction is (0.0,.., 1.0]. 
If the operand\n\nfraction is not specified, it defaults to 1.0.\n\nIf .L2::secondary_priority is not specified, then it defaults to .L2::evict_unchanged.\n\nThe access property created using the CUDA APIs can be converted into cache eviction policy by the\n\ninstruction createpolicy.cvt. The source operand access-property is a 64-bit opaque\n\nregister. Refer to CUDA programming guide for more details.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.4.\n\nTarget ISA Notes\n\nRequires sm_80 or higher.\n\nExamples\n\ncreatepolicy.fractional.L2::evict_last.b64                      policy, 1.0;\n\ncreatepolicy.fractional.L2::evict_last.L2::evict_unchanged.b64  polic ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-createpolicy"
            };

        case "ctaid":
            // Generated documentation entry for the PTX special register %ctaid.
            // "html" holds the reference section markup, prefixed with a link that opens the NVIDIA page in a new window.
            // "tooltip" holds a plain-text rendering of the same section, terminated by " ...".
            // "url" is the anchor of this entry in the NVIDIA PTX ISA reference.
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-ctaid\" target=\"_blank\" rel=\"noopener noreferrer\">ctaid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %ctaid</h1><section id=\"special-registers-ctaid\">\n\n\n<p>CTA identifier within a grid.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .v4 .u32 %ctaid;                      // CTA id vector\n.sreg .u32 %ctaid.x, %ctaid.y, %ctaid.z;    // CTA id components\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the CTA identifier within the CTA\ngrid. The <code class=\"docutils literal notranslate\"><span class=\"pre\">%ctaid</span></code> special register contains a 1D, 2D, or 3D vector, depending on the shape and\nrank of the CTA grid. The fourth element is unused and always returns zero.</p>\n<p>It is guaranteed that:</p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>0  &lt;=  %ctaid.x &lt;  %nctaid.x\n0  &lt;=  %ctaid.y &lt;  %nctaid.y\n0  &lt;=  %ctaid.z &lt;  %nctaid.z\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0 with type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.u16</span></code>.</p>\n<p>Redefined as type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.u32</span></code> in PTX ISA version 2.0. 
For compatibility with legacy PTX code, 16-bit\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mov</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> instructions may be used to read the lower 16-bits of each component of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">%ctaid</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32  %r0,%ctaid.x;\nmov.u16  %rh,%ctaid.y;   // legacy code\n</pre></div>\n</div>\n</section>",
                "tooltip": "CTA identifier within a grid.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %ctaid;                      // CTA id vector\n\n.sreg .u32 %ctaid.x, %ctaid.y, %ctaid.z;    // CTA id components\n\nDescription\n\nA predefined, read-only special register initialized with the CTA identifier within the CTA\n\ngrid. The %ctaid special register contains a 1D, 2D, or 3D vector, depending on the shape and\n\nrank of the CTA grid. The fourth element is unused and always returns zero.\n\nIt is guaranteed that:\n\n0  <=  %ctaid.x <  %nctaid.x\n\n0  <=  %ctaid.y <  %nctaid.y\n\n0  <=  %ctaid.z <  %nctaid.z\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0 with type .v4.u16.\n\nRedefined as type .v4.u32 in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\n\nmov and cvt instructions may be used to read the lower 16-bits of each component of\n\n%ctaid.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u32  %r0,%ctaid.x;\n\nmov.u16  %rh,%ctaid.y;   // legacy code\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-ctaid"
            };

        case "current_graph_exec":
            // Generated documentation entry for the PTX special register %current_graph_exec.
            // "html" holds the reference section markup, prefixed with a link that opens the NVIDIA page in a new window.
            // "tooltip" holds a plain-text rendering of the same section, terminated by " ...".
            // "url" is the anchor of this entry in the NVIDIA PTX ISA reference.
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-current-graph-exec\" target=\"_blank\" rel=\"noopener noreferrer\">current_graph_exec <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %current_graph_exec</h1><section id=\"special-registers-current-graph-exec\">\n\n\n<p>An Identifier for currently executing CUDA device graph.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u64 %current_graph_exec;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the identifier referring to the CUDA\ndevice graph being currently executed. This register is 0 if the executing kernel is not part of a\nCUDA device graph.</p>\n<p>Refer to the <em>CUDA Programming Guide</em> for more details on CUDA device graphs.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_50</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u64  r1, %current_graph_exec;\n</pre></div>\n</div>\n</section>",
                "tooltip": "An Identifier for currently executing CUDA device graph.\n\nSyntax (predefined)\n\n.sreg .u64 %current_graph_exec;\n\nDescription\n\nA predefined, read-only special register initialized with the identifier referring to the CUDA\n\ndevice graph being currently executed. This register is 0 if the executing kernel is not part of a\n\nCUDA device graph.\n\nRefer to the CUDA Programming Guide for more details on CUDA device graphs.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 8.0.\n\nTarget ISA Notes\n\nRequires sm_50 or higher.\n\nExamples\n\nmov.u64  r1, %current_graph_exec;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-current-graph-exec"
            };

        case "cvt":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt\" target=\"_blank\" rel=\"noopener noreferrer\">cvt <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack\" target=\"_blank\" rel=\"noopener noreferrer\">cvt.pack <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: cvt</h1><section id=\"data-movement-and-conversion-instructions-cvt\">\n\n\n<p>Convert a value from one type to another.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cvt{.irnd}{.ftz}{.sat}.dtype.atype         d, a;  // integer rounding\ncvt{.frnd}{.ftz}{.sat}.dtype.atype         d, a;  // fp rounding\ncvt.frnd2{.relu}{.satfinite}.f16.f32       d, a;\ncvt.frnd2{.relu}{.satfinite}.f16x2.f32     d, a, b;\ncvt.frnd2{.relu}{.satfinite}.bf16.f32      d, a;\ncvt.frnd2{.relu}{.satfinite}.bf16x2.f32    d, a, b;\ncvt.rna{.satfinite}.tf32.f32               d, a;\ncvt.frnd2{.relu}.tf32.f32                  d, a;\ncvt.rn.satfinite{.relu}.f8x2type.f32       d, a, b;\ncvt.rn.satfinite{.relu}.f8x2type.f16x2     d, a;\ncvt.rn.{.relu}.f16x2.f8x2type              d, a;\n\n.irnd   = { .rni, .rzi, .rmi, .rpi };\n.frnd   = { .rn,  .rz,  .rm,  .rp  };\n.frnd2  = { .rn,  .rz };\n.dtype = .atype = { .u8,   .u16, .u32, .u64,\n                    .s8,   .s16, .s32, .s64,\n                    .bf16, .f16, .f32, .f64 };\n.f8x2type = { .e4m3x2, .e5m2x2 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Convert between different types and sizes.</p>\n<p>For <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.f16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, two inputs <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> of <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> type are\nconverted into <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> type and the converted values are packed in the destination\nregister <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, such that the value converted from input <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is stored in the upper half of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>\nand the value converted from input <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is stored in the lower half of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code></p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> instruction type, destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> instruction type, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. 
For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type,\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type. For <code class=\"docutils literal notranslate\"><span class=\"pre\">.tf32</span></code> instruction type, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p>When converting to <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code> data formats, the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code>\ntype. When converting two <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> inputs to <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code>, each input is converted to the\nspecified format, and the converted values are packed in the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> such that the\nvalue converted from input <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is stored in the upper 8 bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and the value converted from\ninput <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is stored in the lower 8 bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. 
When converting an <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> input to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>/ <code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code>, each <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> input from operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is converted to the specified\nformat. The converted values are packed in the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> such that the value\nconverted from the upper 16 bits of input <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is stored in the upper 8 bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and the value\nconverted from the lower 16 bits of input <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is stored in the lower 8 bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>When converting from <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code> to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> has <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code>\ntype. Each 8-bit input value in operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is converted to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> type. 
The converted values\nare packed in the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> such that the value converted from the upper 8 bits of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is stored in the upper 16 bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and the value converted from the lower 8 bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>\nis stored in the lower 16 bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>Rounding modifier is mandatory in all of the following cases:</p>\n<ul class=\"simple\">\n<li><p>float-to-float conversions, when destination type is smaller than source type</p></li>\n<li><p>All float-to-int conversions</p></li>\n<li><p>All int-to-float conversions</p></li>\n<li><p>All conversions involving <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2,</span> <span class=\"pre\">.e5m2x2,</span></code><code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.tf32</span></code> instruction\ntypes.</p></li>\n</ul>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.satfinite</span></code> modifier is only supported for conversions involving the following types:</p>\n<ul class=\"simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code> destination types. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">.satfinite</span></code> modifier is mandatory for such\nconversions.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> as destination types.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.tf32</span></code> as destination type with rounding mode specified as round to nearest, ties away from\nzero.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (/* inst type is .f16x2 or .bf16x2 */) {\n    d[31:16] = convert(a);\n    d[15:0]  = convert(b);\n} else {\n    d = convert(a);\n}\n</pre></div>\n</div>\n<p><strong>Integer Notes</strong></p>\n<p>Integer rounding is required for float-to-integer conversions, and for same-size float-to-float\nconversions where the value is rounded to an integer. 
Integer rounding is illegal in all other\ninstances.</p>\n<p>Integer rounding modifiers:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rni</span></code></dt><dd><p>round to nearest integer, choosing even integer if source is equidistant between two integers</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rzi</span></code></dt><dd><p>round to nearest integer in the direction of zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rmi</span></code></dt><dd><p>round to nearest integer in direction of negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rpi</span></code></dt><dd><p>round to nearest integer in direction of positive infinity</p>\n</dd>\n</dl>\n<p>In float-to-integer conversion, <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> inputs are converted to 0.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.ftz.dtype.f32</span></code> float-to-integer conversions and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.ftz.f32.f32</span></code> float-to-float\nconversions with integer rounding, subnormal inputs are flushed to sign-preserving zero. 
Modifier\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code> can only be specified when either <code class=\"docutils literal notranslate\"><span class=\"pre\">.dtype</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.atype</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> and applies only\nto single precision (<code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>) inputs and results.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.ftz.dtype.f32</span></code> float-to-integer conversions and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.ftz.f32.f32</span></code>\nfloat-to-float conversions with integer rounding, subnormal inputs are flushed to sign-preserving\nzero. The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code> modifier may be specified in these cases for clarity.</p>\n<p><strong>Note:</strong> In PTX ISA versions 1.4 and earlier, the <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> instruction did not flush single-precision\nsubnormal inputs or results to zero if the destination type size was 64-bits. The compiler will\npreserve this behavior for legacy PTX code.</p>\n</dd>\n</dl>\n<p>Saturation modifier:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.sat</span></code></dt><dd><p>For integer destination types, <code class=\"docutils literal notranslate\"><span class=\"pre\">.sat</span></code> limits the result to <code class=\"docutils literal notranslate\"><span class=\"pre\">MININT..MAXINT</span></code> for the size of\nthe operation. 
Note that saturation applies to both signed and unsigned integer types.</p>\n<p>The saturation modifier is allowed only in cases where the destination type\u2019s value range is not\na superset of the source type\u2019s value range; i.e., the <code class=\"docutils literal notranslate\"><span class=\"pre\">.sat</span></code> modifier is illegal in cases\nwhere saturation is not possible based on the source and destination types.</p>\n<p>For float-to-integer conversions, the result is clamped to the destination range by default; i.e,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.sat</span></code> is redundant.</p>\n</dd>\n</dl>\n<p><strong>Floating Point Notes</strong></p>\n<p>Floating-point rounding is required for float-to-float conversions that result in loss of precision,\nand for integer-to-float conversions. Floating-point rounding is illegal in all other instances.</p>\n<p>Floating-point rounding modifiers:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt><dd><p>mantissa LSB rounds to nearest even</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rna</span></code></dt><dd><p>mantissa LSB rounds to nearest, ties away from zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt><dd><p>mantissa LSB rounds towards zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code></dt><dd><p>mantissa LSB rounds towards negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt><dd><p>mantissa LSB rounds towards positive infinity</p>\n</dd>\n</dl>\n<p>A floating-point value may be rounded to an integral value using the integer rounding modifiers (see\nInteger Notes). The operands must be of the same size. 
The result is an integral value, stored in\nfloating-point format.</p>\n<p>Subnormal numbers:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported. Modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code> may be specified to flush\nsingle-precision subnormal inputs and results to sign-preserving zero. Modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code> can only\nbe specified when either <code class=\"docutils literal notranslate\"><span class=\"pre\">.dtype</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.atype</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> and applies only to single\nprecision (<code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>) inputs and results.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p>Single-precision subnormal inputs and results are flushed to sign-preserving zero. The optional\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code> modifier may be specified in these cases for clarity.</p>\n</dd>\n</dl>\n<p><strong>Note:</strong> In PTX ISA versions 1.4 and earlier, the <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> instruction did not flush\nsingle-precision subnormal inputs or results to zero if either source or destination type was\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code>. The compiler will preserve this behavior for legacy PTX code. 
Specifically, if the PTX\nISA version is 1.4 or earlier, single-precision subnormal inputs and results are flushed to\nsign-preserving zero only for <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.f32.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.f16.f32</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.f32.f32</span></code> instructions.</p>\n<p>Saturation modifier:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.sat</span></code>:</dt><dd><p>For floating-point destination types, <code class=\"docutils literal notranslate\"><span class=\"pre\">.sat</span></code> limits the result to the range [0.0, 1.0]. <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>\nresults are flushed to positive zero. Applies to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> types.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.relu</span></code>:</dt><dd><p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.tf32</span></code>\ndestination types, <code class=\"docutils literal notranslate\"><span class=\"pre\">.relu</span></code> 
clamps the result to 0 if negative. <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> results are converted to\ncanonical <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.satfinite</span></code>:</dt><dd><p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.tf32</span></code>\ndestination formats, if the input value is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>, then the result is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> in the specified\ndestination format. If the absolute value of input (ignoring sign) is greater than <em>MAX_NORM</em> of\nthe specified destination format, then the result is sign-preserved <em>MAX_NORM</em> of the destination\nformat.</p>\n</dd>\n</dl>\n<p><strong>Notes</strong></p>\n<p>A source register wider than the specified type may be used, except when the source operand has\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> format. The lower <code class=\"docutils literal notranslate\"><span class=\"pre\">n</span></code> bits corresponding to the instruction-type width\nare used in the conversion. 
See <a class=\"reference external\" href=\"#operand-size-exceeding-instruction-type-size\">Operand Size Exceeding Instruction-Type Size</a> for a description of these relaxed\ntype-checking rules.</p>\n<p>A destination register wider than the specified type may be used, except when the destination\noperand has <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.tf32</span></code> format. The result of conversion is sign-extended to\nthe destination register width for signed integers, and is zero-extended to the destination register\nwidth for unsigned, bit-size, and floating-point types. See <a class=\"reference external\" href=\"#operand-size-exceeding-instruction-type-size\">Operand Size Exceeding Instruction-Type\nSize</a> for a description of these relaxed\ntype-checking rules.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.f32.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> input yields unspecified <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.relu</span></code> modifier and {<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.tf32</span></code>} destination formats\nintroduced in PTX ISA version 7.0.</p>\n<p><code class=\"docutils literal notranslate\"><span 
class=\"pre\">cvt.bf16.{u8/s8/u16/s16/u32/s32/u64/s64/f16/f64/bf16}</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.{u8/s8/u16/s16/u32/s32/u64/s64/f16/f64}.bf16</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.tf32.f32.{relu}.{rn/rz}</span></code> introduced\nin PTX ISA 7.8.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> with <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code> for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher introduced in PTX ISA version 7.8.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.satfinite.{e4m3x2,</span> <span class=\"pre\">e5m2x2}.{f32,</span> <span class=\"pre\">f16x2}</span></code> for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher introduced in PTX ISA version 7.8.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> with <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code> for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_89</span></code> introduced in PTX ISA version 8.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.satfinite.{e4m3x2,</span> <span class=\"pre\">e5m2x2}.{f32,</span> <span class=\"pre\">f16x2}</span></code> for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_89</span></code> introduced in PTX ISA version 8.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.satfinite.{f16,</span> <span class=\"pre\">bf16,</span> <span class=\"pre\">f16x2,</span> <span class=\"pre\">bf16x2,</span> <span 
class=\"pre\">tf32}.f32</span></code> introduced in PTX ISA version 8.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> to or from <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.relu</span></code> modifier and {<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.tf32</span></code>} destination formats require\n<code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.bf16.{u8/s8/u16/s16/u32/s32/u64/s64/f16/f64/bf16}</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.{u8/s8/u16/s16/u32/s32/u64/s64/f16/f64}.bf16</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.tf32.f32.{relu}.{rn/rz}</span></code> require\n<code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> with <code class=\"docutils literal notranslate\"><span class=\"pre\">.e4m3x2</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.e5m2x2</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm89</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.satfinite.{e4m3x2,</span> <span class=\"pre\">e5m2x2}.{f32,</span> <span 
class=\"pre\">f16x2}</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_89</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cvt.f32.s32 f,i;\ncvt.s32.f64 j,r;     // float-to-int saturates by default\ncvt.rni.f32.f32 x,y; // round to nearest int, result is fp\ncvt.f32.f32 x,y;     // note .ftz behavior for sm_1x targets\ncvt.rn.relu.f16.f32      b, f;        // result is saturated with .relu saturation mode\ncvt.rz.f16x2.f32         b1, f, f1;   // convert two fp32 values to packed fp16 outputs\ncvt.rn.relu.satfinite.f16x2.f32    b1, f, f1;   // convert two fp32 values to packed fp16 outputs with .relu saturation on each output\ncvt.rn.bf16.f32          b, f;        // convert fp32 to bf16\ncvt.rz.relu.satfinite.bf16.f3 2    b, f;        // convert fp32 to bf16 with .relu and .satfinite saturation\ncvt.rz.satfinite.bf16x2.f32        b1, f, f1;   // convert two fp32 values to packed bf16 outputs\ncvt.rn.relu.bf16x2.f32   b1, f, f1;   // convert two fp32 values to packed bf16 outputs with .relu saturation on each output\ncvt.rna.satfinite.tf32.f32         b1, f;       // convert fp32 to tf32 format\ncvt.rn.relu.tf32.f32     d, a;        // convert fp32 to tf32 format\ncvt.f64.bf16.rp          f, b;        // convert bf16 to f64 format\ncvt.bf16.f16.rz          b, f         // convert f16 to bf16 format\ncvt.bf16.u64.rz          b, u         // convert u64 to bf16 format\ncvt.s8.bf16.rpi          s, b         // convert bf16 to s8 format\ncvt.bf16.bf16.rpi        b1, b2       // convert bf16 to corresponding int represented in bf16 format\ncvt.rn.satfinite.e4m3x2.f32 d, a, b;  // convert a, b to .e4m3 and pack as .e4m3x2 output\ncvt.rn.relu.satfinite.e5m2x2.f16x2 d, a; // unpack a and convert the values to .e5m2 outputs with .relu\n                                         // saturation on each output and pack as 
.e5m2x2\ncvt.rn.f16x2.e4m3x2 d, a;             // unpack a, convert two .e4m3 values to packed f16x2 output\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: cvt.pack</h1><section id=\"data-movement-and-conversion-instructions-cvt-pack\">\n\n\n<p>Convert two integer values from one integer type to another and pack the results.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cvt.pack.sat.convertType.abType  d, a, b;\n    .convertType  = { .u16, .s16 }\n    .abType       = { .s32 }\n\ncvt.pack.sat.convertType.abType.cType  d, a, b, c;\n    .convertType  = { .u2, .s2, .u4, .s4, .u8, .s8 }\n    .abType       = { .s32 }\n    .cType        = { .b32 }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Convert two 32-bit integers <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> into specified type and pack the results into <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is an unsigned 32-bit integer. 
Source operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> are integers of\ntype <code class=\"docutils literal notranslate\"><span class=\"pre\">.abType</span></code> and the source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is an integer of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.cType</span></code>.</p>\n<p>The inputs <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> are converted to values of type specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.convertType</span></code> with\nsaturation and the results after conversion are packed into lower bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>If operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is specified then remaining bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> are copied from lower bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>ta = a &lt; MIN(convertType) ? MIN(convertType) : a;\nta = a &gt; MAX(convertType) ? MAX(convertType) : a;\ntb = b &lt; MIN(convertType) ? MIN(convertType) : b;\ntb = b &gt; MAX(convertType) ? 
MAX(convertType) : b;\n\nsize = sizeInBits(convertType);\ntd = tb ;\nfor (i = size; i &lt;= 2 * size - 1; i++) {\n    td[i] = ta[i - size];\n}\n\nif (isU16(convertType) || isS16(convertType)) {\n    d = td;\n} else {\n    for (i = 0; i &lt; 2 * size; i++) {\n        d[i] = td[i];\n    }\n    for (i = 2 * size; i &lt;= 31; i++) {\n        d[i] = c[i - 2 * size];\n    }\n}\n</pre></div>\n</div>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.sat</span></code> modifier limits the converted values to <code class=\"docutils literal notranslate\"><span class=\"pre\">MIN(convertType)</span></code>..<code class=\"docutils literal notranslate\"><span class=\"pre\">MAX(convertedType)</span></code> (no\noverflow) if the corresponding inputs are not in the range of datatype specified as\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.convertType</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.5.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_72</span></code> or higher.</p>\n<p>Sub byte types (<code class=\"docutils literal notranslate\"><span class=\"pre\">.u4</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.s4</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.u2</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.s2</span></code>) requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_75</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cvt.pack.sat.s16.s32      %r1, %r2, %r3;           // 32-bit to 16-bit conversion\ncvt.pack.sat.u8.s32.b32   %r4, %r5, %r6, 0;        // 32-bit to 8-bit conversion\ncvt.pack.sat.u8.s32.b32   %r7, %r8, %r9, %r4;      // %r7 = { %r5, %r6, %r8, %r9 
}\ncvt.pack.sat.u4.s32.b32   %r10, %r12, %r13, %r14;  // 32-bit to 4-bit conversion\ncvt.pack.sat.s2.s32.b32   %r15, %r16, %r17, %r18;  // 32-bits to 2-bit conversion\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Data Movement and Conversion Instructions: cvt\n\n\n\nConvert a value from one type to another.\n\nSyntax\n\ncvt{.irnd}{.ftz}{.sat}.dtype.atype         d, a;  // integer rounding\n\ncvt{.frnd}{.ftz}{.sat}.dtype.atype         d, a;  // fp rounding\n\ncvt.frnd2{.relu}{.satfinite}.f16.f32       d, a;\n\ncvt.frnd2{.relu}{.satfinite}.f16x2.f32     d, a, b;\n\ncvt.frnd2{.relu}{.satfinite}.bf16.f32      d, a;\n\ncvt.frnd2{.relu}{.satfinite}.bf16x2.f32    d, a, b;\n\ncvt.rna{.satfi...\n\n=====Data Movement and Conversion Instructions: cvt.pack\n\n\n\nConvert two integer values from one integer type to another and pack the results.\n\nSyntax\n\ncvt.pack.sat.convertType.abType  d, a, b;\n\n    .convertType  = { .u16, .s16 }\n\n    .abType       = { .s32 }\n\ncvt.pack.sat.convertType.abType.cType  d, a, b, c;\n\n    .convertType  = { .u2, .s2, .u4, .s4, .u8, .s8 }\n\n    .abType       = { .s32 }\n\n    .cType        = { .b32 }\n\nDescription\n\nConvert two 32-bit integers a a... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt"
            };

        // PTX `cvta`: converts an address between a named state space
        // (.const, .param, .global, .local, .shared) and the generic address space.
        // `html` holds the rendered reference section, `tooltip` a plain-text
        // rendering of the same content, and `url` the matching anchor in
        // NVIDIA's online PTX documentation.
        case "cvta":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta\" target=\"_blank\" rel=\"noopener noreferrer\">cvta <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: cvta</h1><section id=\"data-movement-and-conversion-instructions-cvta\">\n\n\n<p>Convert address from <code class=\"docutils literal notranslate\"><span class=\"pre\">.const</span></code>, <a class=\"reference external\" href=\"#kernel-function-parameters\">Kernel Function Parameters</a> (<code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code>), <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.local</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code>\nstate space to generic, or vice-versa. 
Take the generic address of a variable declared in\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.const</span></code>, <a class=\"reference external\" href=\"#kernel-function-parameters\">Kernel Function Parameters</a> (<code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code>),\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.local</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state space.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// convert const, global, local, or shared address to generic address\ncvta.space.size  p, a;        // source address in register a\ncvta.space.size  p, var;      // get generic address of var\ncvta.space.size  p, var+imm;  // generic address of var+offset\n\n// convert generic address to const, global, local, or shared address\ncvta.to.space.size  p, a;\n\n.space = { .const, .global, .local, .shared{::cta, ::cluster}, .param };\n.size  = { .u32, .u64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Convert a <code class=\"docutils literal notranslate\"><span class=\"pre\">const</span></code>, <a class=\"reference external\" href=\"#kernel-function-parameters\">Kernel Function Parameters</a>\n(<code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code>), <code class=\"docutils literal notranslate\"><span class=\"pre\">global</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">local</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">shared</span></code> address to a generic address, or vice-versa. The\nsource and destination addresses must be the same size. 
Use <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.u32.u64</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt.u64.u32</span></code> to\ntruncate or zero-extend addresses.</p>\n<p>For variables declared in <code class=\"docutils literal notranslate\"><span class=\"pre\">.const</span></code>, <a class=\"reference external\" href=\"#kernel-function-parameters\">Kernel Function Parameters</a> (<code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code>), <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.local</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code>\nstate space, the generic address of the variable may be taken using <code class=\"docutils literal notranslate\"><span class=\"pre\">cvta</span></code>. The source is either a\nregister or a variable defined in <code class=\"docutils literal notranslate\"><span class=\"pre\">const</span></code>, <a class=\"reference external\" href=\"#kernel-function-parameters\">Kernel Function Parameters</a> (<code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code>), <code class=\"docutils literal notranslate\"><span class=\"pre\">global</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">local</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">shared</span></code> memory\nwith an optional offset.</p>\n<p>When converting a generic address into a <code class=\"docutils literal notranslate\"><span class=\"pre\">const</span></code>, <a class=\"reference external\" href=\"#kernel-function-parameters\">Kernel Function Parameters</a> (<code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code>), <code class=\"docutils literal notranslate\"><span class=\"pre\">global</span></code>, 
<code class=\"docutils literal notranslate\"><span class=\"pre\">local</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">shared</span></code>\naddress, the resulting address is undefined in cases where the generic address does not fall within\nthe address window of the specified state space. A program may use <code class=\"docutils literal notranslate\"><span class=\"pre\">isspacep</span></code> to guard against\nsuch incorrect behavior.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">cvta</span></code> with <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state space, the address must belong to the space specified by\n<code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> sub-qualifier, otherwise the behavior is undefined. If no sub-qualifier\nis specified with <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state space, then <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> is assumed by default.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvta.const</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvta.to.const</span></code> introduced in PTX ISA version 3.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvta.param</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvta.to.param</span></code> introduced in PTX ISA version 7.7.</p>\n<p><strong>Note:</strong> The current implementation does not allow generic pointers to <code class=\"docutils literal notranslate\"><span class=\"pre\">const</span></code> space variables in\nprograms that contain pointers to constant buffers 
passed as kernel parameters.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> sub-qualifiers introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvta</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cvta.param</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvta.to.param</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cvta.const.u32   ptr,cvar;\ncvta.local.u32   ptr,lptr;\ncvta.shared::cta.u32  p,As+4;\ncvta.shared::cluster.u32 ptr, As;\ncvta.to.global.u32  p,gptr;\ncvta.param.u64   ptr,pvar;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Convert address from .const, Kernel Function Parameters (.param), .global, .local, or .shared\n\nstate space to generic, or vice-versa. Take the generic address of a variable declared in\n\n.const, Kernel Function Parameters (.param),\n\n.global, .local, or .shared state space.\n\nSyntax\n\n// convert const, global, local, or shared address to generic address\n\ncvta.space.size  p, a;        // source address in register a\n\ncvta.space.size  p, var;      // get generic address of var\n\ncvta.space.size  p, var+imm;  // generic address of var+offset\n\n// convert generic address to const, global, local, or shared address\n\ncvta.to.space.size  p, a;\n\n.space = { .const, .global, .local, .shared{::cta, ::cluster}, .param };\n\n.size  = { .u32, .u64 };\n\nDescription\n\nConvert a const, Kernel Function Parameters\n\n(.param), global, local, or shared address to a generic address, or vice-versa. The\n\nsource and destination addresses must be the same size. Use cvt.u32.u64 or cvt.u64.u32 to\n\ntruncate or zero-extend addresses.\n\nFor variables declared in .const, Kernel Function Parameters (.param), .global, .local, or .shared\n\nstate space, the generic address of the variable may be taken using cvta. The source is either a\n\nregister or a variable defined in const, Kernel Function Parameters (.param), global, local, or shared memory\n\nwith an optional offset.\n\nWhen converting a generic address into a const, Kernel Function Parameters (.param), global, local, or shared\n\naddress, the resulting address is undefined in cases where the generic address does not fall within\n\nthe address window of the specified state space. A program may use isspacep to guard against\n\nsuch incorrect behavior.\n\nFor cvta with .shared state space, the address must belong to the space specified by\n\n::cta or ::cluster sub-qualifier, otherwise the behavior is undefined. 
If no sub-qualifier\n\nis specified with .shared state space, then ::cta is assumed by default.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\ncvta.const and cvta.to.const introduced in PTX ISA version 3.1.\n\ncvta.param and cvta.to.param introduced in PTX ISA version 7.7.\n\nNote: The current implementation does not allow generic pointers to const space variables in\n\nprograms that contain pointers to constant buffers passed as kernel parameters.\n\nSupport for ::cta and ::cluster sub-qualifiers introduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\ncvta requires sm_20 or higher.\n\ncvta.param and cvta.to.param requires sm_70 or higher.\n\nSub-qualifier ::cta requires sm_30 or higher.\n\nSub-qualifier ::cluster requires sm_90 or higher.\n\nExamples\n\ncvta.const.u32   ptr,cvar;\n\ncvta.local.u32   ptr,lptr;\n\ncvta.shared::cta.u32  p,As+4;\n\ncvta.shared::cluster.u32 ptr, As;\n\ncvta.to.global.u32  p,gptr;\n\ncvta.param.u64   ptr,pvar;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta"
            };

        // PTX `discard`: invalidates the data in cache for an address range at the
        // given cache level without writing it back to memory.
        // `html` holds the rendered reference section, `tooltip` a plain-text
        // rendering of the same content, and `url` the matching documentation anchor.
        case "discard":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-discard\" target=\"_blank\" rel=\"noopener noreferrer\">discard <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: discard</h1><section id=\"data-movement-and-conversion-instructions-discard\">\n\n\n<p>Invalidate the data in cache at the specified address and cache level.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>discard{.global}.level  [a], size;\n\n.level = { .L2 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">discard</span></code> instruction invalidates the data at the address range <code class=\"docutils literal notranslate\"><span class=\"pre\">[a</span> <span class=\"pre\">..</span> <span class=\"pre\">a</span> <span class=\"pre\">+</span> <span class=\"pre\">(size</span> <span class=\"pre\">-</span> <span class=\"pre\">1)]</span></code> in\nthe cache level specified by the <code class=\"docutils literal notranslate\"><span class=\"pre\">.level</span></code> qualifier without writing back the data in the cache to\nthe memory. 
Therefore after the discard operation, the data at the address range <code class=\"docutils literal notranslate\"><span class=\"pre\">[a</span> <span class=\"pre\">..</span> <span class=\"pre\">a+</span> <span class=\"pre\">(size</span> <span class=\"pre\">-</span>\n<span class=\"pre\">1)]</span></code> has undetermined value.</p>\n<p>The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> is an integer constant that specifies the amount of data, in bytes, in the\ncache level specified by the <code class=\"docutils literal notranslate\"><span class=\"pre\">.level</span></code> qualifier to be discarded. The only supported value for the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">size</span></code> operand is 128.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the specified address does not fall within the address window of <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space\nthen the behavior is undefined.</p>\n<p>Supported addressing modes for address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> must be aligned to 128 bytes.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.4.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>discard.global.L2 [ptr], 128;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Invalidate the data in cache at the specified address and cache level.\n\nSyntax\n\ndiscard{.global}.level  [a], size;\n\n.level = { .L2 };\n\nDescription\n\nThe discard instruction invalidates the data at the address range [a .. a + (size - 1)] in\n\nthe cache level specified by the .level qualifier without writing back the data in the cache to\n\nthe memory. Therefore after the discard operation, the data at the address range [a .. a+ (size -\n\n1)] has undetermined value.\n\nThe operand size is an integer constant that specifies the amount of data, in bytes, in the\n\ncache level specified by the .level qualifier to be discarded. The only supported value for the\n\nsize operand is 128.\n\nIf no state space is specified then Generic Addressing is\n\nused. If the specified address does not fall within the address window of .global state space\n\nthen the behavior is undefined.\n\nSupported addressing modes for address operand a are described in Addresses as Operands. a must be aligned to 128 bytes.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.4.\n\nTarget ISA Notes\n\nRequires sm_80 or higher.\n\nExamples\n\ndiscard.global.L2 [ptr], 128;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-discard"
            };

        // PTX `div`: one mnemonic shared by two instruction forms, floating-point
        // divide and integer divide.
        // `html` concatenates both reference sections (and links to both anchors);
        // `tooltip` concatenates both as plain text, abbreviating the floating-point
        // part. NOTE: `url` points only at the floating-point section's anchor.
        case "div":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div\" target=\"_blank\" rel=\"noopener noreferrer\">div(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-div\" target=\"_blank\" rel=\"noopener noreferrer\">div(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: div</h1><section id=\"floating-point-instructions-div\">\n\n\n<p>Divide one value by another.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>div.approx{.ftz}.f32  d, a, b;  // fast, approximate divide\ndiv.full{.ftz}.f32    d, a, b;  // full-range approximate divide\ndiv.rnd{.ftz}.f32     d, a, b;  // IEEE 754 compliant rounding\ndiv.rnd.f64           d, a, b;  // IEEE 754 compliant rounding\n\n.rnd = { .rn, .rz, .rm, .rp };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Divides <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> by <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, stores result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a / b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><strong>Fast, approximate single-precision divides:</strong></p>\n<ul class=\"simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.approx.f32</span></code> implements a fast approximation to divide, computed as <code class=\"docutils literal notranslate\"><span 
class=\"pre\">d</span> <span class=\"pre\">=</span> <span class=\"pre\">a</span> <span class=\"pre\">*</span> <span class=\"pre\">(1/b)</span></code>. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">|b|</span></code> in [2<sup>-126</sup>, 2<sup>126</sup>], the maximum <code class=\"docutils literal notranslate\"><span class=\"pre\">ulp</span></code> error is 2. For 2<sup>126</sup> &lt;\n<code class=\"docutils literal notranslate\"><span class=\"pre\">|b|</span></code> &lt; 2<sup>128</sup>, if <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is infinity, <code class=\"docutils literal notranslate\"><span class=\"pre\">div.approx.f32</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>, otherwise it\nreturns 0.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.full.f32</span></code> implements a relatively fast, full-range approximation that scales operands to\nachieve better accuracy, but is not fully IEEE 754 compliant and does not support rounding\nmodifiers. The maximum <code class=\"docutils literal notranslate\"><span class=\"pre\">ulp</span></code> error is 2 across the full range of inputs.</p></li>\n<li><p>Subnormal inputs and results are flushed to sign-preserving zero. 
Fast, approximate division by\nzero creates a value of infinity (with same sign as <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>).</p></li>\n</ul>\n<p><strong>Divide with IEEE 754 compliant rounding:</strong></p>\n<p>Rounding modifiers (no default):</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt><dd><p>mantissa LSB rounds to nearest even</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt><dd><p>mantissa LSB rounds towards zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code></dt><dd><p>mantissa LSB rounds towards negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt><dd><p>mantissa LSB rounds towards positive infinity</p>\n</dd>\n</dl>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.f32</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">div.f64</span></code> introduced in PTX ISA version 1.0.</p>\n<p>Explicit modifiers <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.approx</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.full</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code>, and rounding introduced in PTX ISA version 1.4.</p>\n<p>For PTX ISA version 1.4 and later, one of <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.full</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">.rnd</span></code> is required.</p>\n<p>For PTX ISA versions 1.0 through 1.3, <code class=\"docutils literal notranslate\"><span class=\"pre\">div.f32</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">div.approx.ftz.f32</span></code>, and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">div.f64</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">div.rn.f64</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.approx.f32</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">div.full.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.rnd.f32</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.rn.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher, or <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span> <span class=\"pre\">map_f64_to_f32</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">div.{rz,rm,rp}.f64</span></code> requires <code class=\"docutils literal 
notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>div.approx.ftz.f32  diam,circum,3.14159;\ndiv.full.ftz.f32    x, y, z;\ndiv.rn.f64          xd, yd, zd;\n</pre></div>\n</div>\n</section>\n<h1>Integer Arithmetic Instructions: div</h1><section id=\"integer-arithmetic-instructions-div\">\n\n\n<p>Divide one value by another.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>div.type  d, a, b;\n\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Divides <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> by <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, stores result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a / b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Division by zero yields an unspecified, machine-specific value.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>div.s32  b,n,i;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: div\n\n\n\nDivide one value by another.\n\nSyntax\n\ndiv.approx{.ftz}.f32  d, a, b;  // fast, approximate divide\n\ndiv.full{.ftz}.f32    d, a, b;  // full-range approximate divide\n\ndiv.rnd{.ftz}.f32     d, a, b;  // IEEE 754 compliant rounding\n\ndiv.rnd.f64           d, a, b;  // IEEE 754 compliant rounding\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nDivides a by b, stores result in d.\n\nSemantics\n\nd = a / b;\n\nNotes\n\nFast, a...\n\n=====Integer Arithmetic Instructions: div\n\n\n\nDivide one value by another.\n\nSyntax\n\ndiv.type  d, a, b;\n\n.type = { .u16, .u32, .u64,\n\n          .s16, .s32, .s64 };\n\nDescription\n\nDivides a by b, stores result in d.\n\nSemantics\n\nd = a / b;\n\nNotes\n\nDivision by zero yields an unspecified, machine-specific value.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\ndiv.s32  b,n,i;\n\n... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div"
            };

        case "dp2a":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp2a\" target=\"_blank\" rel=\"noopener noreferrer\">dp2a(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: dp2a</h1><section id=\"integer-arithmetic-instructions-dp2a\">\n\n\n<p>Two-way dot product-accumulate.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>dp2a.mode.atype.btype  d, a, b, c;\n\n.atype = .btype = { .u32, .s32 };\n.mode = { .lo, .hi };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit result.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> are 32-bit inputs. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> holds two 16-bits inputs in packed form and\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> holds 4 byte inputs in packed form for dot product.</p>\n<p>Depending on the <code class=\"docutils literal notranslate\"><span class=\"pre\">.mode</span></code> specified, either lower half or upper half of operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> will be used\nfor dot product.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> if both <code class=\"docutils literal notranslate\"><span class=\"pre\">.atype</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.btype</span></code> are <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> else operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>\nhas type <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = c;\n// Extract two 16-bit values from a 32-bit input and sign or zero extend\n// based on input type.\nVa = extractAndSignOrZeroExt_2(a, .atype);\n\n// Extract four 8-bit values from a 32-bit input and sign or zer extend\n// based on input type.\nVb = extractAndSignOrZeroExt_4(b, .btype);\n\nb_select = (.mode == .lo) ? 
0 : 2;\n\nfor (i = 0; i &lt; 2; ++i) {\n    d += Va[i] * Vb[b_select + i];\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 5.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_61</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>dp2a.lo.u32.u32           d0, a0, b0, c0;\ndp2a.hi.u32.s32           d1, a1, b1, c1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Two-way dot product-accumulate.\n\nSyntax\n\ndp2a.mode.atype.btype  d, a, b, c;\n\n.atype = .btype = { .u32, .s32 };\n\n.mode = { .lo, .hi };\n\nDescription\n\nTwo-way 16-bit to 8-bit dot product which is accumulated in 32-bit result.\n\nOperand a and b are 32-bit inputs. Operand a holds two 16-bits inputs in packed form and\n\noperand b holds 4 byte inputs in packed form for dot product.\n\nDepending on the .mode specified, either lower half or upper half of operand b will be used\n\nfor dot product.\n\nOperand c has type .u32 if both .atype and .btype are .u32 else operand c\n\nhas type .s32.\n\nSemantics\n\nd = c;\n\n// Extract two 16-bit values from a 32-bit input and sign or zero extend\n\n// based on input type.\n\nVa = extractAndSignOrZeroExt_2(a, .atype);\n\n// Extract four 8-bit values from a 32-bit input and sign or zer extend\n\n// based on input type.\n\nVb = extractAndSignOrZeroExt_4(b, .btype);\n\nb_select = (.mode == .lo) ? 0 : 2;\n\nfor (i = 0; i < 2; ++i) {\n\n    d += Va[i] * Vb[b_select + i];\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 5.0.\n\nTarget ISA Notes\n\nRequires sm_61 or higher.\n\nExamples\n\ndp2a.lo.u32.u32           d0, a0, b0, c0;\n\ndp2a.hi.u32.s32           d1, a1, b1, c1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp2a"
            };

        case "dp4a":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp4a\" target=\"_blank\" rel=\"noopener noreferrer\">dp4a(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: dp4a</h1><section id=\"integer-arithmetic-instructions-dp4a\">\n\n\n<p>Four-way byte dot product-accumulate.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>dp4a.atype.btype  d, a, b, c;\n\n.atype = .btype = { .u32, .s32 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Four-way byte dot product which is accumulated in 32-bit result.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> are 32-bit inputs which hold 4 byte inputs in packed form for dot product.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> if both <code class=\"docutils literal notranslate\"><span class=\"pre\">.atype</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.btype</span></code> are <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> else operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>\nhas type <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = c;\n\n// Extract 4 bytes from a 32bit input and sign or zero extend\n// based on input type.\nVa = extractAndSignOrZeroExt_4(a, .atype);\nVb = 
extractAndSignOrZeroExt_4(b, .btype);\n\nfor (i = 0; i &lt; 4; ++i) {\n    d += Va[i] * Vb[i];\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 5.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_61</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>dp4a.u32.u32           d0, a0, b0, c0;\ndp4a.u32.s32           d1, a1, b1, c1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Four-way byte dot product-accumulate.\n\nSyntax\n\ndp4a.atype.btype  d, a, b, c;\n\n.atype = .btype = { .u32, .s32 };\n\nDescription\n\nFour-way byte dot product which is accumulated in 32-bit result.\n\nOperand a and b are 32-bit inputs which hold 4 byte inputs in packed form for dot product.\n\nOperand c has type .u32 if both .atype and .btype are .u32 else operand c\n\nhas type .s32.\n\nSemantics\n\nd = c;\n\n// Extract 4 bytes from a 32bit input and sign or zero extend\n\n// based on input type.\n\nVa = extractAndSignOrZeroExt_4(a, .atype);\n\nVb = extractAndSignOrZeroExt_4(b, .btype);\n\nfor (i = 0; i < 4; ++i) {\n\n    d += Va[i] * Vb[i];\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 5.0.\n\nTarget ISA Notes\n\nRequires sm_61 or higher.\n\nExamples\n\ndp4a.u32.u32           d0, a0, b0, c0;\n\ndp4a.u32.s32           d1, a1, b1, c1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-dp4a"
            };

        case "dynamic_smem_size":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-dynamic-smem-size\" target=\"_blank\" rel=\"noopener noreferrer\">dynamic_smem_size <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %dynamic_smem_size</h1><section id=\"special-registers-dynamic-smem-size\">\n\n\n<p>Size of shared memory allocated dynamically at kernel launch.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %dynamic_smem_size;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Size of shared memory allocated dynamically at kernel launch.</p>\n<p>A predefined, read-only special register initialized with size of shared memory allocated dynamically for the CTA of a kernel at launch time.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 4.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32  %r, %dynamic_smem_size;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Size of shared memory allocated dynamically at kernel launch.\n\nSyntax (predefined)\n\n.sreg .u32 %dynamic_smem_size;\n\nDescription\n\nSize of shared memory allocated dynamically at kernel launch.\n\nA predefined, read-only special register initialized with size of shared memory allocated dynamically for the CTA of a kernel at launch time.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 4.1.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\nExamples\n\nmov.u32  %r, %dynamic_smem_size;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-dynamic-smem-size"
            };

        case "elect":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync\" target=\"_blank\" rel=\"noopener noreferrer\">elect.sync <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: elect.sync</h1><section id=\"parallel-synchronization-and-communication-instructions-elect-sync\">\n\n\n<p>Elect a leader thread from a set of threads.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>elect.sync d|p, membermask;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">elect.sync</span></code> elects one predicated active leader thread from among a set of threads specified by\n<code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>. <code class=\"docutils literal notranslate\"><span class=\"pre\">laneid</span></code> of the elected thread is returned in the 32-bit destination operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. The sink symbol \u2018_\u2019 can be used for destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. The predicate destination\n<code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is set to <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> for the leader thread, and <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code> for all other threads.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> specifies a 32-bit integer indicating the set of threads from which a leader\nis to be elected. 
The behavior is undefined if the executing thread is not in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>.</p>\n<p>Election of a leader thread happens deterministically, i.e. the same leader thread is elected for\nthe same <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> every time.</p>\n<p>The mandatory <code class=\"docutils literal notranslate\"><span class=\"pre\">.sync</span></code> qualifier indicates that <code class=\"docutils literal notranslate\"><span class=\"pre\">elect</span></code> causes the executing thread to wait until\nall threads in the <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> execute the <code class=\"docutils literal notranslate\"><span class=\"pre\">elect</span></code> instruction before resuming execution.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>elect.sync    %r0|%p0, 0xffffffff;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Elect a leader thread from a set of threads.\n\nSyntax\n\nelect.sync d|p, membermask;\n\nDescription\n\nelect.sync elects one predicated active leader thread from among a set of threads specified by\n\nmembermask. laneid of the elected thread is returned in the 32-bit destination operand\n\nd. The sink symbol \u2018_\u2019 can be used for destination operand d. The predicate destination\n\np is set to True for the leader thread, and False for all other threads.\n\nOperand membermask specifies a 32-bit integer indicating the set of threads from which a leader\n\nis to be elected. The behavior is undefined if the executing thread is not in membermask.\n\nElection of a leader thread happens deterministically, i.e. the same leader thread is elected for\n\nthe same membermask every time.\n\nThe mandatory .sync qualifier indicates that elect causes the executing thread to wait until\n\nall threads in the membermask execute the elect instruction before resuming execution.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 8.0.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\nelect.sync    %r0|%p0, 0xffffffff;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync"
            };

        case "entry":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#kernel-and-function-directives-entry\" target=\"_blank\" rel=\"noopener noreferrer\">entry <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Kernel and Function Directives: .entry</h1><section id=\"kernel-and-function-directives-entry\">\n\n\n<p>Kernel entry point and body, with optional parameters.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.entry kernel-name ( param-list )  kernel-body\n.entry kernel-name  kernel-body\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Defines a kernel entry point name, parameters, and body for the kernel function.</p>\n<p>Parameters are passed via <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> space memory and are listed within an optional parenthesized\nparameter list. Parameters may be referenced by name within the kernel body and loaded into\nregisters using <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.param</span></code> instructions.</p>\n<p>In addition to normal parameters, opaque <code class=\"docutils literal notranslate\"><span class=\"pre\">.texref</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.samplerref</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">.surfref</span></code> variables\nmay be passed as parameters. 
These parameters can only be referenced by name within texture and\nsurface load, store, and query instructions and cannot be accessed via <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.param</span></code> instructions.</p>\n<p>The shape and size of the CTA executing the kernel are available in special registers.</p>\n<p><strong>Semantics</strong></p>\n<p>Specify the entry point for a kernel program.</p>\n<p>At kernel launch, the kernel dimensions and properties are established and made available via\nspecial registers, e.g., <code class=\"docutils literal notranslate\"><span class=\"pre\">%ntid</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">%nctaid</span></code>, etc.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>For PTX ISA version 1.4 and later, parameter variables are declared in the kernel parameter\nlist. For PTX ISA versions 1.0 through 1.3, parameter variables are declared in the kernel body.</p>\n<p>The maximum memory size supported by PTX for normal (non-opaque type) parameters is 32764\nbytes. Depending upon the PTX ISA version, the parameter size limit varies. 
The following table\nshows the allowed parameter size for a PTX ISA version:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 48%\"/>\n<col style=\"width: 52%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>PTX ISA Version</p></th>\n<th class=\"head\"><p>Maximum parameter size (In bytes)</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>PTX ISA version 8.1 and above</p></td>\n<td><p>32764</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>PTX ISA version 1.5 and above</p></td>\n<td><p>4352</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>PTX ISA version 1.4 and above</p></td>\n<td><p>256</p></td>\n</tr>\n</tbody>\n</table>\n<p>The CUDA and OpenCL drivers support the following limits for parameter memory:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 13%\"/>\n<col style=\"width: 88%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Driver</p></th>\n<th class=\"head\"><p>Parameter memory size</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>CUDA</p></td>\n<td><p>256 bytes for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code>, 4096 bytes for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_2x</span> <span class=\"pre\">and</span> <span class=\"pre\">higher</span></code>,\n32764 bytes fo <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> and higher</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>OpenCL</p></td>\n<td><p>32764 bytes for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> and higher, 4352 bytes on <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_6x</span></code>\nand lower</p></td>\n</tr>\n</tbody>\n</table>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div 
class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.entry cta_fft\n.entry filter ( .param .b32 x, .param .b32 y, .param .b32 z )\n{\n    .reg .b32 %r&lt;99&gt;;\n    ld.param.b32  %r1, [x];\n    ld.param.b32  %r2, [y];\n    ld.param.b32  %r3, [z];\n    ...\n}\n\n.entry prefix_sum ( .param .align 4 .s32 pitch[8000] )\n{\n    .reg .s32 %t;\n    ld.param.s32  %t, [pitch];\n    ...\n}\n</pre></div>\n</div>\n</section>",
                "tooltip": "Kernel entry point and body, with optional parameters.\n\nSyntax\n\n.entry kernel-name ( param-list )  kernel-body\n\n.entry kernel-name  kernel-body\n\nDescription\n\nDefines a kernel entry point name, parameters, and body for the kernel function.\n\nParameters are passed via .param space memory and are listed within an optional parenthesized\n\nparameter list. Parameters may be referenced by name within the kernel body and loaded into\n\nregisters using ld.param instructions.\n\nIn addition to normal parameters, opaque .texref, .samplerref, and .surfref variables\n\nmay be passed as parameters. These parameters can only be referenced by name within texture and\n\nsurface load, store, and query instructions and cannot be accessed via ld.param instructions.\n\nThe shape and size of the CTA executing the kernel are available in special registers.\n\nSemantics\n\nSpecify the entry point for a kernel program.\n\nAt kernel launch, the kernel dimensions and properties are established and made available via\n\nspecial registers, e.g., %ntid, %nctaid, etc.\n\nPTX ISA Notes\n\nFor PTX ISA version 1.4 and later, parameter variables are declared in the kernel parameter\n\nlist. For PTX ISA versions 1.0 through 1.3, parameter variables are declared in the kernel body.\n\nThe maximum memory size supported by PTX for normal (non-opaque type) parameters is 32764\n\nbytes. Depending upon the PTX ISA version, the parameter size limit varies. 
The following table\n\nshows the allowed parameter size for a PTX ISA version:\n\n\n\nPTX ISA Version\n\nMaximum parameter size (In bytes)\n\nPTX ISA version 8.1 and above\n\n32764\n\nPTX ISA version 1.5 and above\n\n4352\n\nPTX ISA version 1.4 and above\n\n256\n\nThe CUDA and OpenCL drivers support the following limits for parameter memory:\n\n\n\nDriver\n\nParameter memory size\n\nCUDA\n\n256 bytes for sm_1x, 4096 bytes for sm_2x and higher,\n\n32764 bytes fo sm_70 and higher\n\nOpenCL\n\n32764 bytes for sm_70 and higher, 4352 bytes on sm_6x\n\nand lower\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.entry cta_fft\n\n.entry filter ( .param .b32 x, .param .b32 y, .param .b32 z )\n\n{\n\n    .reg .b32 %r<99>;\n\n    ld.param.b32  %r1, [x];\n\n    ld.param.b32  %r2, [y];\n\n    ld.param.b32  %r3, [z];\n\n    ...\n\n}\n\n.entry prefix_sum ( .param .align 4 .s32 pitch[8000] )\n\n{\n\n    .reg .s32 %t;\n\n    ld.param.s32  %t, [pitch];\n\n    ...\n\n}\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#kernel-and-function-directives-entry"
            };

        case "envreg<32>":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-envreg-32\" target=\"_blank\" rel=\"noopener noreferrer\">envreg<32> <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %envreg<32></h1><section id=\"special-registers-envreg-32\">\n\n\n<p>Driver-defined read-only registers.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .b32 %envreg&lt;32&gt;;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A set of 32 pre-defined read-only registers used to capture execution environment of PTX program\noutside of PTX virtual machine. These registers are initialized by the driver prior to kernel launch\nand can contain cta-wide or grid-wide values.</p>\n<p>Precise semantics of these registers is defined in the driver documentation.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.b32      %r1,%envreg0;  // move envreg0 to %r1\n</pre></div>\n</div>\n</section>",
                "tooltip": "Driver-defined read-only registers.\n\nSyntax (predefined)\n\n.sreg .b32 %envreg<32>;\n\nDescription\n\nA set of 32 pre-defined read-only registers used to capture execution environment of PTX program\n\noutside of PTX virtual machine. These registers are initialized by the driver prior to kernel launch\n\nand can contain cta-wide or grid-wide values.\n\nPrecise semantics of these registers is defined in the driver documentation.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.1.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.b32      %r1,%envreg0;  // move envreg0 to %r1\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-envreg-32"
            };

        case "ex2":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2\" target=\"_blank\" rel=\"noopener noreferrer\">ex2(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-ex2\" target=\"_blank\" rel=\"noopener noreferrer\">ex2(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: ex2</h1><section id=\"floating-point-instructions-ex2\">\n\n\n<p>Find the base-2 exponential of a value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>ex2.approx{.ftz}.f32  d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Raise 2 to the power <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = 2 ^ a;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.approx.f32</span></code> implements a fast approximation to 2<sup>a</sup>.</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 63%\"/>\n<col style=\"width: 38%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Input</p></th>\n<th class=\"head\"><p>Result</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>-Inf</p></td>\n<td><p>+0.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>-subnormal</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>-0.0</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr 
class=\"row-odd\"><td><p>+0.0</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>+subnormal</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+Inf</p></td>\n<td><p>+Inf</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p>The maximum absolute error is 2<sup>-22.5</sup> for fraction in the primary range.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p>Subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.f32</span></code> introduced in PTX ISA version 1.0. 
Explicit modifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code>\nintroduced in PTX ISA version 1.4.</p>\n<p>For PTX ISA version 1.4 and later, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> modifier is required.</p>\n<p>For PTX ISA versions 1.0 through 1.3, <code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.f32</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.approx.ftz.f32</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>ex2.approx.ftz.f32  xa, a;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: ex2</h1><section id=\"half-precision-floating-point-instructions-ex2\">\n\n\n<p>Find the base-2 exponent of input.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>ex2.approx.atype     d, a;\nex2.approx.ftz.btype d, a;\n\n.atype = { .f16,  .f16x2}\n.btype = { .bf16, .bf16x2}\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Raise 2 to the power <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>The type of operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> are as specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.type</span></code>.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, each of the half-word 
operands are operated in\nparallel and the results are packed appropriately into a <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (.type == .f16 || .type == .bf16) {\n  d = 2 ^ a\n} else if (.type == .f16x2 || .type == .bf16x2) {\n  fA[0] = a[0:15];\n  fA[1] = a[16:31];\n  d[0] = 2 ^ fA[0]\n  d[1] = 2 ^ fA[1]\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.approx.{f16,</span> <span class=\"pre\">f16x2,</span> <span class=\"pre\">bf16,</span> <span class=\"pre\">bf16x2}</span></code> implement a fast approximation to 2<sup>a</sup>.</p>\n<p>For the <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> type, subnormal inputs are supported. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.approx.ftz.bf16</span></code> flushes subnormal\ninputs and results to sign-preserving zero.</p>\n<p>Results of <code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.approx.ftz.bf16</span></code> for various corner-case inputs are as follows:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 63%\"/>\n<col style=\"width: 38%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Input</p></th>\n<th class=\"head\"><p>Result</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>-Inf</p></td>\n<td><p>+0.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>-subnormal</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>-0.0</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+0.0</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>+subnormal</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+Inf</p></td>\n<td><p>+Inf</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p>Results of <code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.approx.f16</span></code> for various corner-case inputs are as follows:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 45%\"/>\n<col style=\"width: 55%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Input</p></th>\n<th class=\"head\"><p>Result</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>-Inf</p></td>\n<td><p>+0.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>-0.0</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>+0.0</p></td>\n<td><p>+1.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+Inf</p></td>\n<td><p>+Inf</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p>The maximum relative 
error for <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> type is 2-9.9. The maximum relative error for <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> type\nis 2-7.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.approx.ftz.{bf16/bf16x2}</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_75</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">ex2.approx.ftz.{bf16/bf16x2}</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>ex2.approx.f16         h1, h0;\nex2.approx.f16x2       hd1, hd0;\nex2.approx.ftz.bf16    b1, b2;\nex2.approx.ftz.bf16x2  hb1, hb2;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: ex2\n\n\n\nFind the base-2 exponential of a value.\n\nSyntax\n\nex2.approx{.ftz}.f32  d, a;\n\nDescription\n\nRaise 2 to the power a.\n\nSemantics\n\nd = 2 ^ a;\n\nNotes\n\nex2.approx.f32 implements a fast approximation to 2a.\n\n\n\nInput\n\nResult\n\n-Inf\n\n+0.0\n\n-subnormal\n\n+1.0\n\n-0.0\n\n+1.0\n\n+0.0\n\n+1.0\n\n+subnormal\n\n+1.0\n\n+Inf\n\n+Inf\n\nNaN\n\nNaN\n\nThe maximum absolute error is 2-22.5 for fraction in the primary range.\n\nSubnormal numbers:\n\nsm_20+By default, subno...\n\n=====Half Precision Floating Point Instructions: ex2\n\n\n\nFind the base-2 exponent of input.\n\nSyntax\n\nex2.approx.atype     d, a;\n\nex2.approx.ftz.btype d, a;\n\n.atype = { .f16,  .f16x2}\n\n.btype = { .bf16, .bf16x2}\n\nDescription\n\nRaise 2 to the power a.\n\nThe type of operands d and a are as specified by .type.\n\nFor .f16x2 or .bf16x2 instruction type, each of the half-word operands are operated in\n\nparallel and the results are packed appropriately into a .f16x2 or .bf16... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2"
            };

        case "exit":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit\" target=\"_blank\" rel=\"noopener noreferrer\">exit <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Control Flow Instructions: exit</h1><section id=\"control-flow-instructions-exit\">\n\n\n<p>Terminate a thread.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>exit;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Ends execution of a thread.</p>\n<p>As threads exit, barriers waiting on all threads are checked to see if the exiting threads are the\nonly threads that have not yet made it to a barrier{.cta} for all threads in the CTA. If the exiting threads are holding up the\nbarrier, the barrier is released.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>    exit;\n@p  exit;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Terminate a thread.\n\nSyntax\n\nexit;\n\nDescription\n\nEnds execution of a thread.\n\nAs threads exit, barriers waiting on all threads are checked to see if the exiting threads are the\n\nonly threads that have not yet made it to a barrier{.cta} for all threads in the CTA. If the exiting threads are holding up the\n\nbarrier, the barrier is released.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n    exit;\n\n@p  exit;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-exit"
            };

        case "explicitcluster":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-explicitcluster\" target=\"_blank\" rel=\"noopener noreferrer\">explicitcluster <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Cluster Dimension Directives: .explicitcluster</h1><section id=\"cluster-dimension-directives-explicitcluster\">\n\n\n<p>Declare that Kernel must be launched with cluster dimensions explicitly specified.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.explicitcluster\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declares that this Kernel should be launched with cluster dimension explicitly specified.</p>\n<p><strong>Semantics</strong></p>\n<p>Kernels with <code class=\"docutils literal notranslate\"><span class=\"pre\">.explicitcluster</span></code> directive must be launched with cluster dimension explicitly\nspecified (either at launch time or via <code class=\"docutils literal notranslate\"><span class=\"pre\">.reqnctapercluster</span></code>), otherwise program will fail with\nruntime error or kernel launch failure.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.entry foo .explicitcluster         { . . . }\n</pre></div>\n</div>\n</section>",
                "tooltip": "Declare that Kernel must be launched with cluster dimensions explicitly specified.\n\nSyntax\n\n.explicitcluster\n\nDescription\n\nDeclares that this Kernel should be launched with cluster dimension explicitly specified.\n\nSemantics\n\nKernels with .explicitcluster directive must be launched with cluster dimension explicitly\n\nspecified (either at launch time or via .reqnctapercluster), otherwise program will fail with\n\nruntime error or kernel launch failure.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.entry foo .explicitcluster         { . . . }\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-explicitcluster"
            };

        case "extern":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#linking-directives-extern\" target=\"_blank\" rel=\"noopener noreferrer\">extern <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Linking Directives: .extern</h1><section id=\"linking-directives-extern\">\n\n\n<p>External symbol declaration.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.extern identifier\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declares identifier to be defined external to the current module. The identifier must be declared\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.visible</span></code> in the module where it is defined.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.extern .global .b32 foo;  // foo is defined in another module\n</pre></div>\n</div>\n</section>",
                "tooltip": "External symbol declaration.\n\nSyntax\n\n.extern identifier\n\nDescription\n\nDeclares identifier to be defined external to the current module. The identifier must be declared\n\n.visible in the module where it is defined.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.extern .global .b32 foo;  // foo is defined in another module\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#linking-directives-extern"
            };

        case "file":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-file\" target=\"_blank\" rel=\"noopener noreferrer\">file <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Debugging Directives: .file</h1><section id=\"debugging-directives-file\">\n\n\n<p>Source file name.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.file file_index \"filename\" {, timestamp, file_size}\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Associates a source filename with an integer index. <code class=\"docutils literal notranslate\"><span class=\"pre\">.loc</span></code> directives reference source files by\nindex.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.file</span></code> directive allows optionally specifying an unsigned number representing time of last\nmodification and an unsigned integer representing size in bytes of source file. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">timestamp</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">file_size</span></code> value can be 0 to indicate this information is not available.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">timestamp</span></code> value is in format of C and C++ data type <code class=\"docutils literal notranslate\"><span class=\"pre\">time_t</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">file_size</span></code> is an unsigned 64-bit integer.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.file</span></code> directive is allowed only in the outermost scope, i.e., at the same level as kernel\nand device function declarations.</p>\n<p><strong>Semantics</strong></p>\n<p>If timestamp and file size are not specified, they default to 0.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p>Timestamp and file size introduced in PTX ISA version 3.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.file 1 \"example.cu\"\n.file 2 \"kernel.cu\"\n.file 1 \u201ckernel.cu\u201d, 1339013327, 64118\n</pre></div>\n</div>\n</section>",
                "tooltip": "Source file name.\n\nSyntax\n\n.file file_index \"filename\" {, timestamp, file_size}\n\nDescription\n\nAssociates a source filename with an integer index. .loc directives reference source files by\n\nindex.\n\n.file directive allows optionally specifying an unsigned number representing time of last\n\nmodification and an unsigned integer representing size in bytes of source file. timestamp and\n\nfile_size value can be 0 to indicate this information is not available.\n\ntimestamp value is in format of C and C++ data type time_t.\n\nfile_size is an unsigned 64-bit integer.\n\nThe .file directive is allowed only in the outermost scope, i.e., at the same level as kernel\n\nand device function declarations.\n\nSemantics\n\nIf timestamp and file size are not specified, they default to 0.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTimestamp and file size introduced in PTX ISA version 3.2.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.file 1 \"example.cu\"\n\n.file 2 \"kernel.cu\"\n\n.file 1 \u201ckernel.cu\u201d, 1339013327, 64118\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-file"
            };

        case "fma":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma\" target=\"_blank\" rel=\"noopener noreferrer\">fma(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-fma\" target=\"_blank\" rel=\"noopener noreferrer\">fma(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: fma</h1><section id=\"floating-point-instructions-fma\">\n\n\n<p>Fused multiply-add.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>fma.rnd{.ftz}{.sat}.f32  d, a, b, c;\nfma.rnd.f64              d, a, b, c;\n\n.rnd = { .rn, .rz, .rm, .rp };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs a fused multiply-add with no loss of precision in the intermediate product and addition.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a*b + c;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f32</span></code> computes the product of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to infinite precision and then adds <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> to\nthis product, again in infinite precision. 
The resulting value is then rounded to single precision\nusing the rounding mode specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.rnd</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f64</span></code> computes the product of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to infinite precision and then adds <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> to\nthis product, again in infinite precision. The resulting value is then rounded to double precision\nusing the rounding mode specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.rnd</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f64</span></code> is the same as <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code>.</p>\n<p>Rounding modifiers (no default):</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt><dd><p>mantissa LSB rounds to nearest even</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt><dd><p>mantissa LSB rounds towards zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code></dt><dd><p>mantissa LSB rounds towards negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt><dd><p>mantissa LSB rounds towards positive infinity</p>\n</dd>\n</dl>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.ftz.f32</span></code> flushes subnormal inputs and results to 
sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f32</span></code> is unimplemented for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code> targets.</p>\n</dd>\n</dl>\n<p>Saturation:</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.sat.f32</span></code> clamps the result to [0.0, 1.0]. <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> results are flushed to <code class=\"docutils literal notranslate\"><span class=\"pre\">+0.0f</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f64</span></code> introduced in PTX ISA version 1.4.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f32</span></code> introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f32</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>    fma.rn.ftz.f32  w,x,y,z;\n@p  fma.rn.f64      d,a,b,c;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: fma</h1><section id=\"half-precision-floating-point-instructions-fma\">\n\n\n<p>Fused multiply-add</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text 
notranslate\"><div class=\"highlight\"><pre><span></span>fma.rnd{.ftz}{.sat}.f16     d, a, b, c;\nfma.rnd{.ftz}{.sat}.f16x2   d, a, b, c;\nfma.rnd{.ftz}.relu.f16      d, a, b, c;\nfma.rnd{.ftz}.relu.f16x2    d, a, b, c;\nfma.rnd{.relu}.bf16         d, a, b, c;\nfma.rnd{.relu}.bf16x2       d, a, b, c;\nfma.rnd.oob.{relu}.type     d, a, b, c;\n\n.rnd = { .rn };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs a fused multiply-add with no loss of precision in the intermediate product and addition.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, forms input vectors by half word values from source\noperands. Half-word operands are then operated in parallel to produce <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>\nresult in destination.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code>\ntype. 
For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>\ntype. For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. 
For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (type == f16 || type == bf16) {\n    d = a * b + c;\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    fC[0] = c[0:15];\n    fC[1] = c[16:31];\n    for (i = 0; i &lt; 2; i++) {\n         d[i] = fA[i] * fB[i] + fC[i];\n    }\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Rounding modifiers (default is <code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>):</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt><dd><p>mantissa LSB rounds to nearest even</p>\n</dd>\n<dt>Subnormal numbers:</dt><dd><p>By default, subnormal numbers are supported.\n<code class=\"docutils literal notranslate\"><span class=\"pre\">fma.ftz.{f16,</span> <span class=\"pre\">f16x2}</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt>Saturation modifier:</dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.sat.{f16,</span> <span class=\"pre\">f16x2}</span></code> clamps the result to [0.0, 1.0]. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> results are flushed to <code class=\"docutils literal notranslate\"><span class=\"pre\">+0.0f</span></code>.\n<code class=\"docutils literal notranslate\"><span class=\"pre\">fma.relu.{f16,</span> <span class=\"pre\">f16x2,</span> <span class=\"pre\">bf16,</span> <span class=\"pre\">bf16x2}</span></code> clamps the result to 0 if negative. <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> result is\nconverted to canonical <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n</dd>\n<dt>Out Of Bounds modifier:</dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.oob.{f16,</span> <span class=\"pre\">f16x2,</span> <span class=\"pre\">bf16,</span> <span class=\"pre\">bf16x2}</span></code> clamps the result to 0 if either of the operands\nis <code class=\"docutils literal notranslate\"><span class=\"pre\">OOB</span> <span class=\"pre\">NaN</span></code> (defined under <a class=\"reference external\" href=\"#tensors\">Tensors</a>) value. 
The test for the special <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> value\nand resultant forcing of the result to +0.0 is performed independently for each of the\ntwo SIMD operations.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 4.2.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.relu.{f16,</span> <span class=\"pre\">f16x2}</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">fma{.relu}.{bf16,</span> <span class=\"pre\">bf16x2}</span></code> introduced in PTX ISA version 7.0.</p>\n<p>Support for modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.oob</span></code> introduced in PTX ISA version 8.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_53</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma.relu.{f16,</span> <span class=\"pre\">f16x2}</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">fma{.relu}.{bf16,</span> <span class=\"pre\">bf16x2}</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fma{.oob}.{f16,</span> <span class=\"pre\">f16x2,</span> <span class=\"pre\">bf16,</span> <span class=\"pre\">bf16x2}</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// scalar f16 fused multiply-add\nfma.rn.f16         d0, a0, b0, c0;\nfma.rn.f16         d1, a1, b1, c1;\nfma.rn.relu.f16    d1, a1, b1, c1;\nfma.rn.oob.f16      d1, a1, b1, c1;\nfma.rn.oob.relu.f16 d1, a1, b1, c1;\n\n// scalar bf16 fused 
multiply-add\nfma.rn.bf16        d1, a1, b1, c1;\nfma.rn.relu.bf16   d1, a1, b1, c1;\nfma.rn.oob.bf16       d1, a1, b1, c1;\nfma.rn.oob.relu.bf16  d1, a1, b1, c1;\n\n// SIMD f16 fused multiply-add\ncvt.rn.f16.f32 h0, f0;\ncvt.rn.f16.f32 h1, f1;\ncvt.rn.f16.f32 h2, f2;\ncvt.rn.f16.f32 h3, f3;\nmov.b32  p1, {h0, h1}; // pack two f16 to 32bit f16x2\nmov.b32  p2, {h2, h3}; // pack two f16 to 32bit f16x2\nfma.rn.f16x2  p3, p1, p2, p2;   // SIMD f16x2 fused multiply-add\nfma.rn.relu.f16x2  p3, p1, p2, p2; // SIMD f16x2 fused multiply-add with relu saturation mode\nfma.rn.oob.f16x2  p3, p1, p2, p2; // SIMD f16x2 fused multiply-add with oob modifier\nfma.rn.oob.relu.f16x2 p3, p1, p2, p2; // SIMD f16x2 fused multiply-add with oob modifier and relu saturation mode\n\n// SIMD fp16 fused multiply-add\nld.global.b32   f0, [addr];     // load 32 bit which hold packed f16x2\nld.global.b32   f1, [addr + 4]; // load 32 bit which hold packed f16x2\nfma.rn.f16x2    f2, f0, f1, f1; // SIMD f16x2 fused multiply-add\n\n// SIMD bf16 fused multiply-add\nfma.rn.bf16x2       f2, f0, f1, f1; // SIMD bf16x2 fused multiply-add\nfma.rn.relu.bf16x2  f2, f0, f1, f1; // SIMD bf16x2 fused multiply-add with relu saturation mode\nfma.rn.oob.bf16x2  f2, f0, f1, f1; // SIMD bf16x2 fused multiply-add with oob modifier\nfma.rn.oob.relu.bf16x2  f2, f0, f1, f1; // SIMD bf16x2 fused multiply-add with oob modifier and relu saturation mode\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: fma\n\n\n\nFused multiply-add.\n\nSyntax\n\nfma.rnd{.ftz}{.sat}.f32  d, a, b, c;\n\nfma.rnd.f64              d, a, b, c;\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nPerforms a fused multiply-add with no loss of precision in the intermediate product and addition.\n\nSemantics\n\nd = a*b + c;\n\nNotes\n\nfma.f32 computes the product of a and b to infinite precision and then adds c to\n\nthis product, again in infinite precision. The r...\n\n=====Half Precision Floating Point Instructions: fma\n\n\n\nFused multiply-add\n\nSyntax\n\nfma.rnd{.ftz}{.sat}.f16     d, a, b, c;\n\nfma.rnd{.ftz}{.sat}.f16x2   d, a, b, c;\n\nfma.rnd{.ftz}.relu.f16      d, a, b, c;\n\nfma.rnd{.ftz}.relu.f16x2    d, a, b, c;\n\nfma.rnd{.relu}.bf16         d, a, b, c;\n\nfma.rnd{.relu}.bf16x2       d, a, b, c;\n\nfma.rnd.oob.{relu}.type     d, a, b, c;\n\n.rnd = { .rn };\n\nDescription\n\nPerforms a fused multiply-add with no loss of precision in the int... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma"
            };

        case "fns":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns\" target=\"_blank\" rel=\"noopener noreferrer\">fns(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: fns</h1><section id=\"integer-arithmetic-instructions-fns\">\n\n\n<p>Find the n-th set bit</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>fns.b32 d, mask, base, offset;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Given a 32-bit value <code class=\"docutils literal notranslate\"><span class=\"pre\">mask</span></code> and an integer value <code class=\"docutils literal notranslate\"><span class=\"pre\">base</span></code> (between 0 and 31), find the n-th (given\nby offset) set bit in <code class=\"docutils literal notranslate\"><span class=\"pre\">mask</span></code> from the <code class=\"docutils literal notranslate\"><span class=\"pre\">base</span></code> bit, and store the bit position in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. If not\nfound, store 0xffffffff in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">mask</span></code> has a 32-bit type. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">base</span></code> has <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>\ntype. Operand offset has <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> type. 
Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32.</span></code></p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">base</span></code> must be &lt;= 31, otherwise behavior is undefined.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = 0xffffffff;\nif (offset == 0) {\n    if (mask[base] == 1) {\n        d = base;\n    }\n} else {\n    pos = base;\n    count = |offset| - 1;\n    inc = (offset &gt; 0) ? 1 : -1;\n\n    while ((pos &gt;= 0) &amp;&amp; (pos &lt; 32)) {\n        if (mask[pos] == 1) {\n            if (count == 0) {\n              d = pos;\n              break;\n           } else {\n               count = count \u2013 1;\n           }\n        }\n        pos = pos + inc;\n    }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fns</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>fns.b32 d, 0xaaaaaaaa, 3, 1;   // d = 3\nfns.b32 d, 0xaaaaaaaa, 3, -1;  // d = 3\nfns.b32 d, 0xaaaaaaaa, 2, 1;   // d = 3\nfns.b32 d, 0xaaaaaaaa, 2, -1;  // d = 1\n</pre></div>\n</div>\n</section>",
                "tooltip": "Find the n-th set bit\n\nSyntax\n\nfns.b32 d, mask, base, offset;\n\nDescription\n\nGiven a 32-bit value mask and an integer value base (between 0 and 31), find the n-th (given\n\nby offset) set bit in mask from the base bit, and store the bit position in d. If not\n\nfound, store 0xffffffff in d.\n\nOperand mask has a 32-bit type. Operand base has .b32, .u32 or .s32\n\ntype. Operand offset has .s32 type. Destination d has type .b32.\n\nOperand base must be <= 31, otherwise behavior is undefined.\n\nSemantics\n\nd = 0xffffffff;\n\nif (offset == 0) {\n\n    if (mask[base] == 1) {\n\n        d = base;\n\n    }\n\n} else {\n\n    pos = base;\n\n    count = |offset| - 1;\n\n    inc = (offset > 0) ? 1 : -1;\n\n    while ((pos >= 0) && (pos < 32)) {\n\n        if (mask[pos] == 1) {\n\n            if (count == 0) {\n\n              d = pos;\n\n              break;\n\n           } else {\n\n               count = count \u2013 1;\n\n           }\n\n        }\n\n        pos = pos + inc;\n\n    }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.0.\n\nTarget ISA Notes\n\nfns requires sm_30 or higher.\n\nExamples\n\nfns.b32 d, 0xaaaaaaaa, 3, 1;   // d = 3\n\nfns.b32 d, 0xaaaaaaaa, 3, -1;  // d = 3\n\nfns.b32 d, 0xaaaaaaaa, 2, 1;   // d = 3\n\nfns.b32 d, 0xaaaaaaaa, 2, -1;  // d = 1\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns"
            };

        case "func":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#kernel-and-function-directives-func\" target=\"_blank\" rel=\"noopener noreferrer\">func <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Kernel and Function Directives: .func</h1><section id=\"kernel-and-function-directives-func\">\n\n\n<p>Function definition.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.func {.attribute(attr-list)} fname {.noreturn} function-body\n.func {.attribute(attr-list)} fname (param-list) {.noreturn} function-body\n.func {.attribute(attr-list)} (ret-param) fname (param-list) function-body\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Defines a function, including input and return parameters and optional function body.</p>\n<p>An optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive indicates that the function does not return to the caller\nfunction. <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive cannot be specified on functions which have return parameters. See\nthe description of <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive in <a class=\"reference external\" href=\"#performance-tuning-directives-noreturn\">Performance-Tuning Directives: .noreturn</a>.</p>\n<p>An optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.attribute</span></code> directive specifies additional information associated with the\nfunction. 
See the description of <a class=\"reference external\" href=\"#variable-and-function-attribute-directive-attribute\">Variable and Function Attribute Directive: .attribute</a> for allowed attributes.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">.func</span></code> definition with no body provides a function prototype.</p>\n<p>The parameter lists define locally-scoped variables in the function body. Parameters must be base\ntypes in either the register or parameter state space. Parameters in register state space may be\nreferenced directly within instructions in the function body. Parameters in <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> space are\naccessed using <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.param</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">st.param</span></code> instructions in the body. Parameter passing is\ncall-by-value.</p>\n<p>The last parameter in the parameter list may be a <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> array of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.b8</span></code> with no size\nspecified. It is used to pass an arbitrary number of parameters to the function packed into a single\narray object.</p>\n<p>When calling a function with such an unsized last argument, the last argument may be omitted from\nthe <code class=\"docutils literal notranslate\"><span class=\"pre\">call</span></code> instruction if no parameter is passed through it. Accesses to this array parameter must\nbe within the bounds of the array. 
The result of an access is undefined if no array was passed, or\nif the access was outside the bounds of the actual array being passed.</p>\n<p><strong>Semantics</strong></p>\n<p>The PTX syntax hides all details of the underlying calling convention and ABI.</p>\n<p>The implementation of parameter passing is left to the optimizing translator, which may use a\ncombination of registers and stack locations to pass parameters.</p>\n<p><strong>Release Notes</strong></p>\n<p>For PTX ISA version 1.x code, parameters must be in the register state space, there is no stack, and\nrecursion is illegal.</p>\n<p>PTX ISA versions 2.0 and later with target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher allow parameters in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code>\nstate space, implements an ABI with stack, and supports recursion.</p>\n<p>PTX ISA versions 2.0 and later with target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher support at most one return value.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p>Support for unsized array parameter introduced in PTX ISA version 6.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive introduced in PTX ISA version 6.4.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.attribute</span></code> directive introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Functions without unsized array parameter supported on all target architectures.</p>\n<p>Unsized array parameter requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive requires <code class=\"docutils literal 
notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.attribute</span></code> directive requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.func (.reg .b32 rval) foo (.reg .b32 N, .reg .f64 dbl)\n{\n.reg .b32 localVar;\n\n... use N, dbl;\nother code;\n\nmov.b32 rval,result;\nret;\n}\n\n...\ncall (fooval), foo, (val0, val1);  // return value in fooval\n...\n\n.func foo (.reg .b32 N, .reg .f64 dbl) .noreturn\n{\n.reg .b32 localVar;\n... use N, dbl;\nother code;\nmov.b32 rval, result;\nret;\n}\n...\ncall foo, (val0, val1);\n...\n\n.func (.param .u32 rval) bar(.param .u32 N, .param .align 4 .b8 numbers[])\n{\n    .reg .b32 input0, input1;\n    ld.param.b32   input0, [numbers + 0];\n    ld.param.b32   input1, [numbers + 4];\n    ...\n    other code;\n    ret;\n}\n...\n\n.param .u32 N;\n.param .align 4 .b8 numbers[8];\nst.param.u32    [N], 2;\nst.param.b32    [numbers + 0], 5;\nst.param.b32    [numbers + 4], 10;\ncall (rval), bar, (N, numbers);\n...\n</pre></div>\n</div>\n</section>",
                "tooltip": "Function definition.\n\nSyntax\n\n.func {.attribute(attr-list)} fname {.noreturn} function-body\n\n.func {.attribute(attr-list)} fname (param-list) {.noreturn} function-body\n\n.func {.attribute(attr-list)} (ret-param) fname (param-list) function-body\n\nDescription\n\nDefines a function, including input and return parameters and optional function body.\n\nAn optional .noreturn directive indicates that the function does not return to the caller\n\nfunction. .noreturn directive cannot be specified on functions which have return parameters. See\n\nthe description of .noreturn directive in Performance-Tuning Directives: .noreturn.\n\nAn optional .attribute directive specifies additional information associated with the\n\nfunction. See the description of Variable and Function Attribute Directive: .attribute for allowed attributes.\n\nA .func definition with no body provides a function prototype.\n\nThe parameter lists define locally-scoped variables in the function body. Parameters must be base\n\ntypes in either the register or parameter state space. Parameters in register state space may be\n\nreferenced directly within instructions in the function body. Parameters in .param space are\n\naccessed using ld.param and st.param instructions in the body. Parameter passing is\n\ncall-by-value.\n\nThe last parameter in the parameter list may be a .param array of type .b8 with no size\n\nspecified. It is used to pass an arbitrary number of parameters to the function packed into a single\n\narray object.\n\nWhen calling a function with such an unsized last argument, the last argument may be omitted from\n\nthe call instruction if no parameter is passed through it. Accesses to this array parameter must\n\nbe within the bounds of the array. 
The result of an access is undefined if no array was passed, or\n\nif the access was outside the bounds of the actual array being passed.\n\nSemantics\n\nThe PTX syntax hides all details of the underlying calling convention and ABI.\n\nThe implementation of parameter passing is left to the optimizing translator, which may use a\n\ncombination of registers and stack locations to pass parameters.\n\nRelease Notes\n\nFor PTX ISA version 1.x code, parameters must be in the register state space, there is no stack, and\n\nrecursion is illegal.\n\nPTX ISA versions 2.0 and later with target sm_20 or higher allow parameters in the .param\n\nstate space, implements an ABI with stack, and supports recursion.\n\nPTX ISA versions 2.0 and later with target sm_20 or higher support at most one return value.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nSupport for unsized array parameter introduced in PTX ISA version 6.0.\n\nSupport for .noreturn directive introduced in PTX ISA version 6.4.\n\nSupport for .attribute directive introduced in PTX ISA version 8.0.\n\nTarget ISA Notes\n\nFunctions without unsized array parameter supported on all target architectures.\n\nUnsized array parameter requires sm_30 or higher.\n\n.noreturn directive requires sm_30 or higher.\n\n.attribute directive requires sm_90 or higher.\n\nExamples\n\n.func (.reg .b32 rval) foo (.reg .b32 N, .reg .f64 dbl)\n\n{\n\n.reg .b32 localVar;\n\n... use N, dbl;\n\nother code;\n\nmov.b32 rval,result;\n\nret;\n\n}\n\n...\n\ncall (fooval), foo, (val0, val1);  // return value in fooval\n\n...\n\n.func foo (.reg .b32 N, .reg .f64 dbl) .noreturn\n\n{\n\n.reg .b32 localVar;\n\n... 
use N, dbl;\n\nother code;\n\nmov.b32 rval, result;\n\nret;\n\n}\n\n...\n\ncall foo, (val0, val1);\n\n...\n\n.func (.param .u32 rval) bar(.param .u32 N, .param .align 4 .b8 numbers[])\n\n{\n\n    .reg .b32 input0, input1;\n\n    ld.param.b32   input0, [numbers + 0];\n\n    ld.param.b32   input1, [numbers + 4];\n\n    ...\n\n    other code;\n\n    ret;\n\n}\n\n...\n\n.param .u32 N;\n\n.param .align 4 .b8 numbers[8];\n\nst.param.u32    [N], 2;\n\nst.param.b32    [numbers + 0], 5;\n\nst.param.b32    [numbers + 4], 10;\n\ncall (rval), bar, (N, numbers);\n\n...\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#kernel-and-function-directives-func"
            };

        case "getctarank":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank\" target=\"_blank\" rel=\"noopener noreferrer\">getctarank <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: getctarank</h1><section id=\"data-movement-and-conversion-instructions-getctarank\">\n\n\n<p>Generate the CTA rank of the address.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>getctarank{.space}.type d, a;\n\n// Get cta rank from source shared memory address in register a.\ngetctarank.shared::cluster.type d, a;\n\n// Get cta rank from shared memory variable.\ngetctarank.shared::cluster.type d, var;\n\n// Get cta rank from shared memory variable+offset.\ngetctarank.shared::cluster.type d, var + imm;\n\n// Get cta rank from generic address of shared memory variable in register a.\ngetctarank.type d, a;\n\n.space = { .shared::cluster }\n.type  = { .u32, .u64 }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Write the destination register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> with the rank of the CTA which contains the address specified\nin operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>Instruction type <code class=\"docutils literal notranslate\"><span class=\"pre\">.type</span></code> indicates the type of source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>When space is <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code>, source <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is either a shared memory variable or a register\ncontaining a valid shared memory 
address. When the optional qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.space</span></code> is not specified,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is a register containing a generic addresses pointing to shared memory. Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is\nalways a 32-bit register which holds the rank of the CTA.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>getctarank.shared::cluster.u32 d1, addr;\ngetctarank.shared::cluster.u64 d2, sh + 4;\ngetctarank.u64                 d3, src;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Generate the CTA rank of the address.\n\nSyntax\n\ngetctarank{.space}.type d, a;\n\n// Get cta rank from source shared memory address in register a.\n\ngetctarank.shared::cluster.type d, a;\n\n// Get cta rank from shared memory variable.\n\ngetctarank.shared::cluster.type d, var;\n\n// Get cta rank from shared memory variable+offset.\n\ngetctarank.shared::cluster.type d, var + imm;\n\n// Get cta rank from generic address of shared memory variable in register a.\n\ngetctarank.type d, a;\n\n.space = { .shared::cluster }\n\n.type  = { .u32, .u64 }\n\nDescription\n\nWrite the destination register d with the rank of the CTA which contains the address specified\n\nin operand a.\n\nInstruction type .type indicates the type of source operand a.\n\nWhen space is .shared::cluster, source a is either a shared memory variable or a register\n\ncontaining a valid shared memory address. When the optional qualifier .space is not specified,\n\na is a register containing a generic addresses pointing to shared memory. Destination d is\n\nalways a 32-bit register which holds the rank of the CTA.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\ngetctarank.shared::cluster.u32 d1, addr;\n\ngetctarank.shared::cluster.u64 d2, sh + 4;\n\ngetctarank.u64                 d3, src;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank"
            };

        case "globaltimer":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-globaltimer-globaltimer-lo-globaltimer-hi\" target=\"_blank\" rel=\"noopener noreferrer\">globaltimer <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %globaltimer, %globaltimer_lo, %globaltimer_hi</h1><section id=\"special-registers-globaltimer-globaltimer-lo-globaltimer-hi\">\n<span id=\"special-registers-globaltimer\"></span>\n\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer</span></code></dt><dd><p>A predefined, 64-bit global nanosecond timer.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer_lo</span></code></dt><dd><p>The lower 32-bits of %globaltimer.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer_hi</span></code></dt><dd><p>The upper 32-bits of %globaltimer.</p>\n</dd>\n</dl>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u64 %globaltimer;\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Special registers intended for use by NVIDIA tools. The behavior is target-specific and may change\nor be removed in future GPUs. When JIT-compiled to other targets, the value of these registers is\nunspecified.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u64  r1,%globaltimer;\n</pre></div>\n</div>\n</section>",
                "tooltip": "%globaltimerA predefined, 64-bit global nanosecond timer.\n\n%globaltimer_loThe lower 32-bits of %globaltimer.\n\n%globaltimer_hiThe upper 32-bits of %globaltimer.\n\nSyntax (predefined)\n\n.sreg .u64 %globaltimer;\n\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n\nDescription\n\nSpecial registers intended for use by NVIDIA tools. The behavior is target-specific and may change\n\nor be removed in future GPUs. When JIT-compiled to other targets, the value of these registers is\n\nunspecified.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.1.\n\nTarget ISA Notes\n\nRequires target sm_30 or higher.\n\nExamples\n\nmov.u64  r1,%globaltimer;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-globaltimer-globaltimer-lo-globaltimer-hi"
            };

        case "globaltimer_hi":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-globaltimer-globaltimer-lo-globaltimer-hi\" target=\"_blank\" rel=\"noopener noreferrer\">globaltimer_hi <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %globaltimer, %globaltimer_lo, %globaltimer_hi</h1><section id=\"special-registers-globaltimer-globaltimer-lo-globaltimer-hi\">\n<span id=\"special-registers-globaltimer\"></span>\n\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer</span></code></dt><dd><p>A predefined, 64-bit global nanosecond timer.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer_lo</span></code></dt><dd><p>The lower 32-bits of %globaltimer.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer_hi</span></code></dt><dd><p>The upper 32-bits of %globaltimer.</p>\n</dd>\n</dl>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u64 %globaltimer;\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Special registers intended for use by NVIDIA tools. The behavior is target-specific and may change\nor be removed in future GPUs. When JIT-compiled to other targets, the value of these registers is\nunspecified.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u64  r1,%globaltimer;\n</pre></div>\n</div>\n</section>",
                "tooltip": "%globaltimerA predefined, 64-bit global nanosecond timer.\n\n%globaltimer_loThe lower 32-bits of %globaltimer.\n\n%globaltimer_hiThe upper 32-bits of %globaltimer.\n\nSyntax (predefined)\n\n.sreg .u64 %globaltimer;\n\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n\nDescription\n\nSpecial registers intended for use by NVIDIA tools. The behavior is target-specific and may change\n\nor be removed in future GPUs. When JIT-compiled to other targets, the value of these registers is\n\nunspecified.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.1.\n\nTarget ISA Notes\n\nRequires target sm_30 or higher.\n\nExamples\n\nmov.u64  r1,%globaltimer;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-globaltimer-globaltimer-lo-globaltimer-hi"
            };

        case "globaltimer_lo":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-globaltimer-globaltimer-lo-globaltimer-hi\" target=\"_blank\" rel=\"noopener noreferrer\">globaltimer_lo <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %globaltimer, %globaltimer_lo, %globaltimer_hi</h1><section id=\"special-registers-globaltimer-globaltimer-lo-globaltimer-hi\">\n<span id=\"special-registers-globaltimer\"></span>\n\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer</span></code></dt><dd><p>A predefined, 64-bit global nanosecond timer.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer_lo</span></code></dt><dd><p>The lower 32-bits of %globaltimer.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%globaltimer_hi</span></code></dt><dd><p>The upper 32-bits of %globaltimer.</p>\n</dd>\n</dl>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u64 %globaltimer;\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Special registers intended for use by NVIDIA tools. The behavior is target-specific and may change\nor be removed in future GPUs. When JIT-compiled to other targets, the value of these registers is\nunspecified.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u64  r1,%globaltimer;\n</pre></div>\n</div>\n</section>",
                "tooltip": "%globaltimerA predefined, 64-bit global nanosecond timer.\n\n%globaltimer_loThe lower 32-bits of %globaltimer.\n\n%globaltimer_hiThe upper 32-bits of %globaltimer.\n\nSyntax (predefined)\n\n.sreg .u64 %globaltimer;\n\n.sreg .u32 %globaltimer_lo, %globaltimer_hi;\n\nDescription\n\nSpecial registers intended for use by NVIDIA tools. The behavior is target-specific and may change\n\nor be removed in future GPUs. When JIT-compiled to other targets, the value of these registers is\n\nunspecified.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.1.\n\nTarget ISA Notes\n\nRequires target sm_30 or higher.\n\nExamples\n\nmov.u64  r1,%globaltimer;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-globaltimer-globaltimer-lo-globaltimer-hi"
            };

        case "griddepcontrol":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol\" target=\"_blank\" rel=\"noopener noreferrer\">griddepcontrol <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: griddepcontrol</h1><section id=\"parallel-synchronization-and-communication-instructions-griddepcontrol\">\n\n\n<p>Control execution of dependent grids.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>griddepcontrol.action;\n\n.action   = { .launch_dependents, .wait }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">griddepcontrol</span></code> instruction allows the dependent grids and prerequisite grids as defined by\nthe runtime, to control execution in the following way:</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.launch_dependents</span></code> modifier signals that specific dependents the runtime system designated to\nreact to this instruction can be scheduled as soon as all other CTAs in the grid issue the same\ninstruction or have completed. The dependent may launch before the completion of the current\ngrid. There is no guarantee that the dependent will launch before the completion of the current\ngrid. 
Repeated invocations of this instruction by threads in the current CTA will have no additional\nside effects past that of the first invocation.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.wait</span></code> modifier causes the executing thread to wait until all prerequisite grids in flight have\ncompleted and all the memory operations from the prerequisite grids are performed and made visible\nto the current grid.</p>\n<div class=\"admonition note\">\n<p class=\"admonition-title\">Note</p>\n<p>If the prerequisite grid is using <code class=\"docutils literal notranslate\"><span class=\"pre\">griddepcontrol.launch_dependents</span></code>, then the dependent grid\nmust use <code class=\"docutils literal notranslate\"><span class=\"pre\">griddepcontrol.wait</span></code> to ensure correct functional execution.</p>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>griddepcontrol.launch_dependents;\ngriddepcontrol.wait;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Control execution of dependent grids.\n\nSyntax\n\ngriddepcontrol.action;\n\n.action   = { .launch_dependents, .wait }\n\nDescription\n\nThe griddepcontrol instruction allows the dependent grids and prerequisite grids as defined by\n\nthe runtime, to control execution in the following way:\n\n.launch_dependents modifier signals that specific dependents the runtime system designated to\n\nreact to this instruction can be scheduled as soon as all other CTAs in the grid issue the same\n\ninstruction or have completed. The dependent may launch before the completion of the current\n\ngrid. There is no guarantee that the dependent will launch before the completion of the current\n\ngrid. Repeated invocations of this instruction by threads in the current CTA will have no additional\n\nside effects past that of the first invocation.\n\n.wait modifier causes the executing thread to wait until all prerequisite grids in flight have\n\ncompleted and all the memory operations from the prerequisite grids are performed and made visible\n\nto the current grid.\n\nNote\n\nIf the prerequisite grid is using griddepcontrol.launch_dependents, then the dependent grid\n\nmust use griddepcontrol.wait to ensure correct functional execution.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\ngriddepcontrol.launch_dependents;\n\ngriddepcontrol.wait;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol"
            };

        case "gridid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-gridid\" target=\"_blank\" rel=\"noopener noreferrer\">gridid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %gridid</h1><section id=\"special-registers-gridid\">\n\n\n<p>Grid identifier.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u64 %gridid;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the per-grid temporal grid identifier. The\n<code class=\"docutils literal notranslate\"><span class=\"pre\">%gridid</span></code> is used by debuggers to distinguish CTAs and clusters within concurrent (small) grids.</p>\n<p>During execution, repeated launches of programs may occur, where each launch starts a\ngrid-of-CTAs. This variable provides the temporal grid launch number for this context.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code> targets, <code class=\"docutils literal notranslate\"><span class=\"pre\">%gridid</span></code> is limited to the range [0..2<sup>16</sup>-1]. For <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">%gridid</span></code> is limited to the range [0..2<sup>32</sup>-1]. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> supports the entire 64-bit range.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0 as type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u16</span></code>.</p>\n<p>Redefined as type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> in PTX ISA version 1.3.</p>\n<p>Redefined as type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> in PTX ISA version 3.0.</p>\n<p>For compatibility with legacy PTX code, 16-bit and 32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">mov</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> instructions may be\nused to read the lower 16-bits or 32-bits of each component of <code class=\"docutils literal notranslate\"><span class=\"pre\">%gridid</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u64  %s, %gridid;  // 64-bit read of %gridid\nmov.u32  %r, %gridid;  // legacy code with 32-bit %gridid\n</pre></div>\n</div>\n</section>",
                "tooltip": "Grid identifier.\n\nSyntax (predefined)\n\n.sreg .u64 %gridid;\n\nDescription\n\nA predefined, read-only special register initialized with the per-grid temporal grid identifier. The\n\n%gridid is used by debuggers to distinguish CTAs and clusters within concurrent (small) grids.\n\nDuring execution, repeated launches of programs may occur, where each launch starts a\n\ngrid-of-CTAs. This variable provides the temporal grid launch number for this context.\n\nFor sm_1x targets, %gridid is limited to the range [0..216-1]. For sm_20,\n\n%gridid is limited to the range [0..232-1]. sm_30 supports the entire 64-bit range.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0 as type .u16.\n\nRedefined as type .u32 in PTX ISA version 1.3.\n\nRedefined as type .u64 in PTX ISA version 3.0.\n\nFor compatibility with legacy PTX code, 16-bit and 32-bit mov and cvt instructions may be\n\nused to read the lower 16-bits or 32-bits of each component of %gridid.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u64  %s, %gridid;  // 64-bit read of %gridid\n\nmov.u32  %r, %gridid;  // legacy code with 32-bit %gridid\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-gridid"
            };

        case "is_explicit_cluster": {
            // Documentation record for the %is_explicit_cluster special register:
            // rendered HTML body, plain-text tooltip, and the link to the upstream section.
            const url = "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-is-explicit-cluster";
            return {
                html: "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-is-explicit-cluster\" target=\"_blank\" rel=\"noopener noreferrer\">is_explicit_cluster <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %is_explicit_cluster</h1><section id=\"special-registers-is-explicit-cluster\">\n\n\n<p>Checks if user has explicitly specified cluster launch.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .pred %is_explicit_cluster;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the predicate value of whether the cluster\nlaunch is explicitly specified by user.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .pred p;\n\nmov.pred  p, %is_explicit_cluster;\n</pre></div>\n</div>\n</section>",
                tooltip: "Checks if user has explicitly specified cluster launch.\n\nSyntax (predefined)\n\n.sreg .pred %is_explicit_cluster;\n\nDescription\n\nA predefined, read-only special register initialized with the predicate value of whether the cluster\n\nlaunch is explicitly specified by user.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .pred p;\n\nmov.pred  p, %is_explicit_cluster;\n\n ...",
                url,
            };
        }

        // Documentation record for the `isspacep` instruction (Data Movement and
        // Conversion Instructions): rendered HTML body, plain-text tooltip, upstream URL.
        // NOTE: each string literal is kept on one physical line, because a double-quoted
        // literal cannot contain a raw line break. Three slips in the scraped text are
        // corrected in both `html` and `tooltip`: `.const` in the `.space` list, the
        // `isspacep` spelling inside the Note, and the operand comma in the last example.
        // Re-apply these if the file is regenerated from upstream.
        case "isspacep":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep\" target=\"_blank\" rel=\"noopener noreferrer\">isspacep <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: isspacep</h1><section id=\"data-movement-and-conversion-instructions-isspacep\">\n\n\n<p>Query whether a generic address falls within a specified state space window.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>isspacep.space  p, a;    // result is .pred\n\n.space = { .const, .global, .local, .shared{::cta, ::cluster}, .param };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Write predicate register <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> with <code class=\"docutils literal notranslate\"><span class=\"pre\">1</span></code> if generic address a falls within the specified state\nspace window and with <code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code> otherwise. Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.pred</span></code>; the source address\noperand must be of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">isspacep.param</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">1</span></code> if the generic address falls within the window of <a class=\"reference external\" href=\"#kernel-function-parameters\">Kernel Function\nParameters</a>, otherwise returns <code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">isspacep.global</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">1</span></code> for <a class=\"reference external\" href=\"#kernel-function-parameters\">Kernel Function Parameters</a> as <code class=\"docutils literal notranslate\"><span class=\"pre\">.param</span></code> window is contained within the <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code>\nwindow.</p>\n<p>If no sub-qualifier is specified with <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state space, then <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> is assumed by default.</p>\n<div class=\"admonition note\">\n<p class=\"admonition-title\">Note</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">isspacep.shared::cluster</span></code> will return 1 for every shared memory address that is accessible to\nthe threads in the cluster, whereas <code class=\"docutils literal notranslate\"><span class=\"pre\">isspacep.shared::cta</span></code> will return 1 only if the address is\nof a variable declared in the executing CTA.</p>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">isspacep.const</span></code> introduced in PTX ISA version 3.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">isspacep.param</span></code> introduced in PTX ISA version 7.7.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> sub-qualifiers introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">isspacep</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">isspacep.param</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>isspacep.const           iscnst, cptr;\nisspacep.global          isglbl, gptr;\nisspacep.local           islcl,  lptr;\nisspacep.shared          isshrd, sptr;\nisspacep.param           isparam, pptr;\nisspacep.shared::cta     isshrdcta, sptr;\nisspacep.shared::cluster ishrdany, sptr;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Query whether a generic address falls within a specified state space window.\n\nSyntax\n\nisspacep.space  p, a;    // result is .pred\n\n.space = { .const, .global, .local, .shared{::cta, ::cluster}, .param };\n\nDescription\n\nWrite predicate register p with 1 if generic address a falls within the specified state\n\nspace window and with 0 otherwise. Destination p has type .pred; the source address\n\noperand must be of type .u32 or .u64.\n\nisspacep.param returns 1 if the generic address falls within the window of Kernel Function\n\nParameters, otherwise returns 0.\n\nisspacep.global returns 1 for Kernel Function Parameters as .param window is contained within the .global\n\nwindow.\n\nIf no sub-qualifier is specified with .shared state space, then ::cta is assumed by default.\n\nNote\n\nisspacep.shared::cluster will return 1 for every shared memory address that is accessible to\n\nthe threads in the cluster, whereas isspacep.shared::cta will return 1 only if the address is\n\nof a variable declared in the executing CTA.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nisspacep.const introduced in PTX ISA version 3.1.\n\nisspacep.param introduced in PTX ISA version 7.7.\n\nSupport for ::cta and ::cluster sub-qualifiers introduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nisspacep requires sm_20 or higher.\n\nisspacep.param requires sm_70 or higher.\n\nSub-qualifier ::cta requires sm_30 or higher.\n\nSub-qualifier ::cluster requires sm_90 or higher.\n\nExamples\n\nisspacep.const           iscnst, cptr;\n\nisspacep.global          isglbl, gptr;\n\nisspacep.local           islcl,  lptr;\n\nisspacep.shared          isshrd, sptr;\n\nisspacep.param           isparam, pptr;\n\nisspacep.shared::cta     isshrdcta, sptr;\n\nisspacep.shared::cluster ishrdany, sptr;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep"
            };

        case "istypep": {
            // Documentation record for the `istypep` texture instruction:
            // rendered HTML body, plain-text tooltip, and the link to the upstream section.
            const url = "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-istypep";
            return {
                html: "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-istypep\" target=\"_blank\" rel=\"noopener noreferrer\">istypep <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Texture Instructions: istypep</h1><section id=\"texture-instructions-istypep\">\n\n\n<p>Query whether a register points to an opaque variable of a specified type.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>istypep.type   p, a;  // result is .pred\n\n.type = { .texref, .samplerref, .surfref };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Write predicate register <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> with 1 if register <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> points to an opaque variable of the\nspecified type, and with 0 otherwise. Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.pred</span></code>; the source address\noperand must be of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 4.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>istypep requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>istypep.texref istex, tptr;\nistypep.samplerref issampler, sptr;\nistypep.surfref issurface, surfptr;\n</pre></div>\n</div>\n</section>",
                tooltip: "Query whether a register points to an opaque variable of a specified type.\n\nSyntax\n\nistypep.type   p, a;  // result is .pred\n\n.type = { .texref, .samplerref, .surfref };\n\nDescription\n\nWrite predicate register p with 1 if register a points to an opaque variable of the\n\nspecified type, and with 0 otherwise. Destination p has type .pred; the source address\n\noperand must be of type .u64.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 4.0.\n\nTarget ISA Notes\n\nistypep requires sm_30 or higher.\n\nExamples\n\nistypep.texref istex, tptr;\n\nistypep.samplerref issampler, sptr;\n\nistypep.surfref issurface, surfptr;\n\n ...",
                url,
            };
        }

        case "laneid": {
            // Documentation record for the %laneid special register:
            // rendered HTML body, plain-text tooltip, and the link to the upstream section.
            const url = "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-laneid";
            return {
                html: "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-laneid\" target=\"_blank\" rel=\"noopener noreferrer\">laneid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %laneid</h1><section id=\"special-registers-laneid\">\n\n\n<p>Lane Identifier.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %laneid;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register that returns the thread\u2019s lane within the warp. The lane\nidentifier ranges from zero to <code class=\"docutils literal notranslate\"><span class=\"pre\">WARP_SZ-1</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32  %r, %laneid;\n</pre></div>\n</div>\n</section>",
                tooltip: "Lane Identifier.\n\nSyntax (predefined)\n\n.sreg .u32 %laneid;\n\nDescription\n\nA predefined, read-only special register that returns the thread\u2019s lane within the warp. The lane\n\nidentifier ranges from zero to WARP_SZ-1.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u32  %r, %laneid;\n\n ...",
                url,
            };
        }

        case "lanemask_eq": {
            // Documentation record for the %lanemask_eq special register:
            // rendered HTML body, plain-text tooltip, and the link to the upstream section.
            const url = "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-eq";
            return {
                html: "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-eq\" target=\"_blank\" rel=\"noopener noreferrer\">lanemask_eq <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %lanemask_eq</h1><section id=\"special-registers-lanemask-eq\">\n\n\n<p>32-bit mask with bit set in position equal to the thread\u2019s lane number in the warp.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %lanemask_eq;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with a 32-bit mask with a bit set in the\nposition equal to the thread\u2019s lane number in the warp.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%lanemask_eq</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32     %r, %lanemask_eq;\n</pre></div>\n</div>\n</section>",
                tooltip: "32-bit mask with bit set in position equal to the thread\u2019s lane number in the warp.\n\nSyntax (predefined)\n\n.sreg .u32 %lanemask_eq;\n\nDescription\n\nA predefined, read-only special register initialized with a 32-bit mask with a bit set in the\n\nposition equal to the thread\u2019s lane number in the warp.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%lanemask_eq requires sm_20 or higher.\n\nExamples\n\nmov.u32     %r, %lanemask_eq;\n\n ...",
                url,
            };
        }

        case "lanemask_ge": {
            // Documentation record for the %lanemask_ge special register:
            // rendered HTML body, plain-text tooltip, and the link to the upstream section.
            const url = "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-ge";
            return {
                html: "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-ge\" target=\"_blank\" rel=\"noopener noreferrer\">lanemask_ge <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %lanemask_ge</h1><section id=\"special-registers-lanemask-ge\">\n\n\n<p>32-bit mask with bits set in positions greater than or equal to the thread\u2019s lane number in the warp.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %lanemask_ge;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with a 32-bit mask with bits set in positions\ngreater than or equal to the thread\u2019s lane number in the warp.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%lanemask_ge</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32     %r, %lanemask_ge;\n</pre></div>\n</div>\n</section>",
                tooltip: "32-bit mask with bits set in positions greater than or equal to the thread\u2019s lane number in the warp.\n\nSyntax (predefined)\n\n.sreg .u32 %lanemask_ge;\n\nDescription\n\nA predefined, read-only special register initialized with a 32-bit mask with bits set in positions\n\ngreater than or equal to the thread\u2019s lane number in the warp.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%lanemask_ge requires sm_20 or higher.\n\nExamples\n\nmov.u32     %r, %lanemask_ge;\n\n ...",
                url,
            };
        }

        case "lanemask_gt": {
            // Documentation record for the %lanemask_gt special register:
            // rendered HTML body, plain-text tooltip, and the link to the upstream section.
            const url = "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-gt";
            return {
                html: "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-gt\" target=\"_blank\" rel=\"noopener noreferrer\">lanemask_gt <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %lanemask_gt</h1><section id=\"special-registers-lanemask-gt\">\n\n\n<p>32-bit mask with bits set in positions greater than the thread\u2019s lane number in the warp.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %lanemask_gt;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with a 32-bit mask with bits set in positions\ngreater than the thread\u2019s lane number in the warp.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%lanemask_gt</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32     %r, %lanemask_gt;\n</pre></div>\n</div>\n</section>",
                tooltip: "32-bit mask with bits set in positions greater than the thread\u2019s lane number in the warp.\n\nSyntax (predefined)\n\n.sreg .u32 %lanemask_gt;\n\nDescription\n\nA predefined, read-only special register initialized with a 32-bit mask with bits set in positions\n\ngreater than the thread\u2019s lane number in the warp.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%lanemask_gt requires sm_20 or higher.\n\nExamples\n\nmov.u32     %r, %lanemask_gt;\n\n ...",
                url,
            };
        }

        // Documentation record for the %lanemask_le special register:
        // rendered HTML body, plain-text tooltip, and the link to the upstream section.
        // NOTE: the example statement carries its terminating `;` in both `html` and
        // `tooltip`, matching the other %lanemask_* records; the scraped text omits it,
        // so re-apply if this file is regenerated.
        case "lanemask_le":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-le\" target=\"_blank\" rel=\"noopener noreferrer\">lanemask_le <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %lanemask_le</h1><section id=\"special-registers-lanemask-le\">\n\n\n<p>32-bit mask with bits set in positions less than or equal to the thread\u2019s lane number in the warp.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %lanemask_le;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with a 32-bit mask with bits set in positions\nless than or equal to the thread\u2019s lane number in the warp.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%lanemask_le</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32     %r, %lanemask_le;\n</pre></div>\n</div>\n</section>",
                "tooltip": "32-bit mask with bits set in positions less than or equal to the thread\u2019s lane number in the warp.\n\nSyntax (predefined)\n\n.sreg .u32 %lanemask_le;\n\nDescription\n\nA predefined, read-only special register initialized with a 32-bit mask with bits set in positions\n\nless than or equal to the thread\u2019s lane number in the warp.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%lanemask_le requires sm_20 or higher.\n\nExamples\n\nmov.u32     %r, %lanemask_le;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-le"
            };

        case "lanemask_lt": {
            // Documentation record for the %lanemask_lt special register:
            // rendered HTML body, plain-text tooltip, and the link to the upstream section.
            const url = "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-lt";
            return {
                html: "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-lanemask-lt\" target=\"_blank\" rel=\"noopener noreferrer\">lanemask_lt <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %lanemask_lt</h1><section id=\"special-registers-lanemask-lt\">\n\n\n<p>32-bit mask with bits set in positions less than the thread\u2019s lane number in the warp.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %lanemask_lt;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with a 32-bit mask with bits set in positions\nless than the thread\u2019s lane number in the warp.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%lanemask_lt</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32     %r, %lanemask_lt;\n</pre></div>\n</div>\n</section>",
                tooltip: "32-bit mask with bits set in positions less than the thread\u2019s lane number in the warp.\n\nSyntax (predefined)\n\n.sreg .u32 %lanemask_lt;\n\nDescription\n\nA predefined, read-only special register initialized with a 32-bit mask with bits set in positions\n\nless than the thread\u2019s lane number in the warp.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%lanemask_lt requires sm_20 or higher.\n\nExamples\n\nmov.u32     %r, %lanemask_lt;\n\n ...",
                url,
            };
        }

        case "ld":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld\" target=\"_blank\" rel=\"noopener noreferrer\">ld <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld-global-nc\" target=\"_blank\" rel=\"noopener noreferrer\">ld.global.nc <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: ld</h1><section id=\"data-movement-and-conversion-instructions-ld\">\n\n\n<p>Load a register variable from an addressable state space variable.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>ld{.weak}{.ss}{.cop}{.level::cache_hint}{.level::prefetch_size}{.vec}.type  d, [a]{.unified}{, cache-policy};\n\nld{.weak}{.ss}{.level::eviction_priority}{.level::cache_hint}{.level::prefetch_size}{.vec}.type  d, [a]{.unified}{, cache-policy};\n\nld.volatile{.ss}{.level::prefetch_size}{.vec}.type  d, [a];\n\nld.relaxed.scope{.ss}{.level::eviction_priority}{.level::cache_hint}{.level::prefetch_size}{.vec}.type  d, [a]{, cache-policy};\n\nld.acquire.scope{.ss}{.level::eviction_priority}{.level::cache_hint}{.level::prefetch_size}{.vec}.type  d, [a]{, cache-policy};\n\n.ss =                       { .const, .global, .local, .param, .shared{::cta, ::cluster} };\n.cop =                      { .ca, .cg, .cs, .lu, .cv };\n.level::eviction_priority = { .L1::evict_normal, .L1::evict_unchanged,\n                              .L1::evict_first, .L1::evict_last, .L1::no_allocate };\n.level::cache_hint =        { .L2::cache_hint };\n.level::prefetch_size =     { .L2::64B, .L2::128B, .L2::256B }\n.scope =                    { .cta, 
.cluster, .gpu, .sys };\n.vec =                      { .v2, .v4 };\n.type =                     { .b8, .b16, .b32, .b64,\n                              .u8, .u16, .u32, .u64,\n                              .s8, .s16, .s32, .s64,\n                              .f32, .f64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Load register variable <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> from the location specified by the source address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> in\nspecified state space. If no state space is given, perform the load using <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a>.</p>\n<p>If no sub-qualifier is specified with <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state space, then <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> is assumed by default.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses\nas Operands</a></p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.param</span></code> used for reading value returned from device function call cannot be\npredicated. 
See <a class=\"reference external\" href=\"#parameter-state-space\">Parameter State Space</a> and <a class=\"reference external\" href=\"#function-declarations-and-definitions\">Function\nDeclarations and Definitions</a> for descriptions\nof the proper use of <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.param</span></code>.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code> qualifiers indicate memory synchronization as described in the\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. The <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier\nindicates the set of threads with which an <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.relaxed</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.acquire</span></code> instruction can directly\nsynchronize<sup>1</sup>. The <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> qualifier indicates a memory instruction with no\nsynchronization. The effects of this instruction become visible to other threads only when\nsynchronization is established by other means.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.volatile</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code> qualifiers are mutually exclusive. 
When\nnone of these is specified, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> qualifier is assumed by default.</p>\n<p>An <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.volatile</span></code> operation is always performed and it will not be reordered with respect to other\n<code class=\"docutils literal notranslate\"><span class=\"pre\">volatile</span></code> operations to the same memory location. <code class=\"docutils literal notranslate\"><span class=\"pre\">volatile</span></code> and non-volatile load operations\nto the same memory location may be reordered. <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.volatile</span></code> has the same memory synchronization\nsemantics as <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.relaxed.sys</span></code>.</p>\n<p>The qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.volatile</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code> may be used only with <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> spaces and with generic addressing, where the address points to <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> space. 
Cache operations are not permitted with these qualifiers.</p>\n<p>The optional qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.unified</span></code> must be specified on operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> if <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is the address of a\nvariable declared with <code class=\"docutils literal notranslate\"><span class=\"pre\">.unified</span></code> attribute as described in <a class=\"reference external\" href=\"#variable-and-function-attribute-directive-attribute\">Variable and Function Attribute\nDirective: .attribute</a>.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> specifies the eviction policy that will be used during\nmemory access.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> qualifier is a hint to fetch additional data of the specified size\ninto the respective cache level.The sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch_size</span></code> can be set to either of <code class=\"docutils literal notranslate\"><span class=\"pre\">64B</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">128B</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">256B</span></code> thereby allowing the prefetch size to be 64 Bytes, 128 Bytes or 256 Bytes\nrespectively.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> may only be used with <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space and with\ngeneric addressing where the address points to <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space. 
If the generic address does\nnot fall within the address window of the global memory, then the prefetching behavior is undefined.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> qualifier is treated as a performance hint only.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p>The qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.unified</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> are only supported for <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state\nspace and for generic addressing where the address points to the <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. 
It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.</p>\n<p><sup>1</sup> This synchronization is further extended to other threads through the transitive nature of\n<em>causality order</em>, as described in the memory consistency model.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a;             // named variable a\nd = *(&amp;a+immOff)   // variable-plus-offset\nd = *a;            // register\nd = *(a+immOff);   // register-plus-offset\nd = *(immAddr);    // immediate address\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> must be in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.reg</span></code> state space.</p>\n<p>A destination register wider than the specified type may be used. The value loaded is sign-extended\nto the destination register width for signed integers, and is zero-extended to the destination\nregister width for unsigned and bit-size types. 
See\n<a class=\"reference internal\" href=\"#operand-size-exceeding-instruction-type-size-relaxed-type-checking-rules-destination-operands\"><span class=\"std std-numref\">Table 25</span></a>\nfor a description of these relaxed type-checking rules.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> data may be loaded using <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.b16</span></code>, and then converted to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> using\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> or can be used in half precision floating point instructions.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> data may be loaded using <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.b32</span></code> and then used in half precision floating point\ninstructions.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>ld introduced in PTX ISA version 1.0. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">ld.volatile</span></code> introduced in PTX ISA version 1.1.</p>\n<p>Generic addressing and cache operations introduced in PTX ISA version 2.0.</p>\n<p>Support for scope qualifier, <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.acquire</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> qualifiers introduced in PTX ISA\nversion 6.0.</p>\n<p>Support for generic addressing of .const space added in PTX ISA version 3.1.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code>\nqualifiers introduced in PTX ISA version 7.4.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope qualifier introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> sub-qualifiers introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.unified</span></code> qualifier introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">ld.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>Support for scope qualifier, <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code>, <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.acquire</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> qualifiers require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or\nhigher.</p>\n<p>Generic addressing requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Cache operations require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::prefetch_size</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_75</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::256B</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.L2::cache_hint</span></code> qualifiers requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils 
literal notranslate\"><span class=\"pre\">.unified</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>ld.global.f32    d,[a];\nld.shared.v4.b32 Q,[p];\nld.const.s32     d,[p+4];\nld.local.b32     x,[p+-8]; // negative offset\nld.local.b64     x,[240];  // immediate address\n\nld.global.b16    %r,[fs];  // load .f16 data into 32-bit reg\ncvt.f32.f16      %r,%r;    // up-convert f16 data to f32\n\nld.global.b32    %r0, [fs];     // load .f16x2 data in 32-bit reg\nld.global.b32    %r1, [fs + 4]; // load .f16x2 data in 32-bit reg\nadd.rn.f16x2     %d0, %r0, %r1; // addition of f16x2 data\nld.global.relaxed.gpu.u32 %r0, [gbl];\nld.shared.acquire.gpu.u32 %r1, [sh];\nld.global.relaxed.cluster.u32 %r2, [gbl];\nld.shared::cta.acquire.gpu.u32 %r2, [sh + 4];\nld.shared::cluster.u32 %r3, [sh + 8];\n\nld.global.f32    d,[ugbl].unified;\nld.b32           %r0, [%r1].unified;\n\nld.global.L1::evict_last.u32  d, [p];\n\nld.global.L2::64B.b32   %r0, [gbl]; // Prefetch 64B to L2\nld.L2::128B.f64         %r1, [gbl]; // Prefetch 128B to L2\nld.global.L2::256B.f64  %r2, [gbl]; // Prefetch 256B to L2\n\ncreatepolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 cache-policy, 1;\nld.global.L2::cache_hint.b64  x, [p], cache-policy;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: ld.global.nc</h1><section id=\"data-movement-and-conversion-instructions-ld-global-nc\">\n\n\n<p>Load a register variable from global state space via non-coherent cache.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>ld.global{.cop}.nc{.level::cache_hint}.type                 d, [a]{, cache-policy};\nld.global{.cop}.nc{.level::cache_hint}.vec.type             d, [a]{, 
cache-policy};\n\nld.global.nc{.level::eviction_priority}{.level::cache_hint}.type      d, [a]{, cache-policy};\nld.global.nc{.level::eviction_priority}{.level::cache_hint}.vec.type  d, [a]{, cache-policy};\n\n.cop  =                     { .ca, .cg, .cs };     // cache operation\n.level::eviction_priority = { .L1::evict_normal, .L1::evict_unchanged,\n                              .L1::evict_first, .L1::evict_last, .L1::no_allocate};\n.level::cache_hint =        { .L2::cache_hint };\n.vec  =                     { .v2, .v4 };\n.type =                     { .b8, .b16, .b32, .b64,\n                              .u8, .u16, .u32, .u64,\n                              .s8, .s16, .s32, .s64,\n                              .f32, .f64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Load register variable <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> from the location specified by the source address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> in the\nglobal state space, and optionally cache in non-coherent read-only cache.</p>\n<div class=\"admonition note\">\n<p class=\"admonition-title\">Note</p>\n<p>On some architectures, the texture cache is larger, has higher bandwidth, and longer latency than\nthe global memory cache. 
For applications with sufficient parallelism to cover the longer\nlatency, <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.global.nc</span></code> should offer better performance than <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.global</span></code> on such\narchitectures.</p>\n</div>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses\nas Operands</a></p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> specifies the eviction policy that will be used during\nmemory access.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. 
It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a;             // named variable a\nd = *(&amp;a+immOff)   // variable-plus-offset\nd = *a;            // register\nd = *(a+immOff);   // register-plus-offset\nd = *(immAddr);    // immediate address\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> must be in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.reg</span></code> state space.</p>\n<p>A destination register wider than the specified type may be used. The value loaded is sign-extended\nto the destination register width for signed integers, and is zero-extended to the destination\nregister width for unsigned and bit-size types.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> data may be loaded using <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.b16</span></code>, and then converted to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> using <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.1.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> qualifiers introduced in PTX\nISA version 7.4.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_32</span></code> or higher.</p>\n<p>Support for <code class=\"docutils 
literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>ld.global.nc.f32           d, [a];\nld.global.nc.L1::evict_last.u32 d, [a];\n\ncreatepolicy.fractional.L2::evict_last.b64 cache-policy, 0.5;\nld.global.nc.L2::cache_hint.f32  d, [a], cache-policy;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Data Movement and Conversion Instructions: ld\n\n\n\nLoad a register variable from an addressable state space variable.\n\nSyntax\n\nld{.weak}{.ss}{.cop}{.level::cache_hint}{.level::prefetch_size}{.vec}.type  d, [a]{.unified}{, cache-policy};\n\nld{.weak}{.ss}{.level::eviction_priority}{.level::cache_hint}{.level::prefetch_size}{.vec}.type  d, [a]{.unified}{, cache-policy};\n\nld.volatile{.ss}{.level::prefetch_size}{.vec}.type  d, [a];\n\nld.relaxed.scope{.ss}{.le...\n\n=====Data Movement and Conversion Instructions: ld.global.nc\n\n\n\nLoad a register variable from global state space via non-coherent cache.\n\nSyntax\n\nld.global{.cop}.nc{.level::cache_hint}.type                 d, [a]{, cache-policy};\n\nld.global{.cop}.nc{.level::cache_hint}.vec.type             d, [a]{, cache-policy};\n\nld.global.nc{.level::eviction_priority}{.level::cache_hint}.type      d, [a]{, cache-policy};\n\nld.global.nc{.level::eviction_priority}{.level::cache_hint... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld"
            };

        case "ldu":
            // Documentation entry for the PTX `ldu` instruction.
            //   html    - rendered documentation section, prefixed with a link to the online page
            //   tooltip - the same text with the markup stripped
            //   url     - anchor of the instruction in the online PTX ISA documentation
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu\" target=\"_blank\" rel=\"noopener noreferrer\">ldu <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: ldu</h1><section id=\"data-movement-and-conversion-instructions-ldu\">\n\n\n<p>Load read-only data from an address that is common across threads in the warp.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>ldu{.ss}.type      d, [a];       // load from address\nldu{.ss}.vec.type  d, [a];       // vec load from address\n\n.ss   = { .global };             // state space\n.vec  = { .v2, .v4 };\n.type = { .b8, .b16, .b32, .b64,\n           .u8, .u16, .u32, .u64,\n           .s8, .s16, .s32, .s64,\n                      .f32, .f64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Load <em>read-only</em> data into register variable <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> from the location specified by the source address\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> in the global state space, where the address is guaranteed to be the same across all\nthreads in the warp. If no state space is given, perform the load using <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a>.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses\nas Operands</a></p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a;             // named variable a\nd = *(&amp;a+immOff)   // variable-plus-offset\nd = *a;            // register\nd = *(a+immOff);   // register-plus-offset\nd = *(immAddr);    // immediate address\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> must be in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.reg</span></code> state space.</p>\n<p>A destination register wider than the specified type may be used. The value loaded is sign-extended\nto the destination register width for signed integers, and is zero-extended to the destination\nregister width for unsigned and bit-size types. See\n<a class=\"reference internal\" href=\"#operand-size-exceeding-instruction-type-size-relaxed-type-checking-rules-destination-operands\"><span class=\"std std-numref\">Table 25</span></a>\nfor a description of these relaxed type-checking rules.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> data may be loaded using <code class=\"docutils literal notranslate\"><span class=\"pre\">ldu.b16</span></code>, and then converted to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> using\n<code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> or can be used in half precision floating point instructions.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> data may be loaded using <code class=\"docutils literal notranslate\"><span class=\"pre\">ldu.b32</span></code> and then used in half precision floating point\ninstructions.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">ldu.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>ldu.global.f32    d,[a];\nldu.global.b32    d,[p+4];\nldu.global.v4.f32 Q,[p];\n</pre></div>\n</div>\n</section>",
                "tooltip": "Load read-only data from an address that is common across threads in the warp.\n\nSyntax\n\nldu{.ss}.type      d, [a];       // load from address\n\nldu{.ss}.vec.type  d, [a];       // vec load from address\n\n.ss   = { .global };             // state space\n\n.vec  = { .v2, .v4 };\n\n.type = { .b8, .b16, .b32, .b64,\n\n           .u8, .u16, .u32, .u64,\n\n           .s8, .s16, .s32, .s64,\n\n                      .f32, .f64 };\n\nDescription\n\nLoad read-only data into register variable d from the location specified by the source address\n\noperand a in the global state space, where the address is guaranteed to be the same across all\n\nthreads in the warp. If no state space is given, perform the load using Generic Addressing.\n\nSupported addressing modes for operand a and alignment requirements are described in Addresses\n\nas Operands\n\nSemantics\n\nd = a;             // named variable a\n\nd = *(&a+immOff)   // variable-plus-offset\n\nd = *a;            // register\n\nd = *(a+immOff);   // register-plus-offset\n\nd = *(immAddr);    // immediate address\n\nNotes\n\nDestination d must be in the .reg state space.\n\nA destination register wider than the specified type may be used. The value loaded is sign-extended\n\nto the destination register width for signed integers, and is zero-extended to the destination\n\nregister width for unsigned and bit-size types. See\n\nTable 25\n\nfor a description of these relaxed type-checking rules.\n\n.f16 data may be loaded using ldu.b16, and then converted to .f32 or .f64 using\n\ncvt or can be used in half precision floating point instructions.\n\n.f16x2 data may be loaded using ldu.b32 and then used in half precision floating point\n\ninstructions.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nldu.f64 requires sm_13 or higher.\n\nExamples\n\nldu.global.f32    d,[a];\n\nldu.global.b32    d,[p+4];\n\nldu.global.v4.f32 Q,[p];\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ldu"
            };

        case "lg2":
            // Documentation entry for the PTX `lg2` instruction.
            //   html    - rendered documentation section, prefixed with a link to the online page
            //   tooltip - the same text with the markup stripped (exponent written as `2^-22.6`)
            //   url     - anchor of the instruction in the online PTX ISA documentation
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2\" target=\"_blank\" rel=\"noopener noreferrer\">lg2(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: lg2</h1><section id=\"floating-point-instructions-lg2\">\n\n\n<p>Find the base-2 logarithm of a value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>lg2.approx{.ftz}.f32  d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Determine the log<sub>2</sub> of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = log(a) / log(2);\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">lg2.approx.f32</span></code> implements a fast approximation to log<sub>2</sub>(a).</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 63%\"/>\n<col style=\"width: 38%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Input</p></th>\n<th class=\"head\"><p>Result</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>-Inf</p></td>\n<td><p>NaN</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>-subnormal</p></td>\n<td><p>-Inf</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>-0.0</p></td>\n<td><p>-Inf</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+0.0</p></td>\n<td><p>-Inf</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>+subnormal</p></td>\n<td><p>-Inf</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+Inf</p></td>\n<td><p>+Inf</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p>The maximum absolute error is 2<sup>-22.6</sup> for mantissa.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">lg2.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p>Subnormal inputs and results are flushed to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">lg2.f32</span></code> introduced in PTX ISA version 1.0. Explicit modifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code>\nintroduced in PTX ISA version 1.4.</p>\n<p>For PTX ISA version 1.4 and later, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> modifier is required.</p>\n<p>For PTX ISA versions 1.0 through 1.3, <code class=\"docutils literal notranslate\"><span class=\"pre\">lg2.f32</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">lg2.approx.ftz.f32</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>lg2.approx.ftz.f32  la, a;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Find the base-2 logarithm of a value.\n\nSyntax\n\nlg2.approx{.ftz}.f32  d, a;\n\nDescription\n\nDetermine the log2 of a.\n\nSemantics\n\nd = log(a) / log(2);\n\nNotes\n\nlg2.approx.f32 implements a fast approximation to log2(a).\n\n\n\nInput\n\nResult\n\n-Inf\n\nNaN\n\n-subnormal\n\n-Inf\n\n-0.0\n\n-Inf\n\n+0.0\n\n-Inf\n\n+subnormal\n\n-Inf\n\n+Inf\n\n+Inf\n\nNaN\n\nNaN\n\nThe maximum absolute error is 2^-22.6 for mantissa.\n\nSubnormal numbers:\n\nsm_20+By default, subnormal numbers are supported.\n\nlg2.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nsm_1xSubnormal inputs and results are flushed to sign-preserving zero.\n\nPTX ISA Notes\n\nlg2.f32 introduced in PTX ISA version 1.0. Explicit modifiers .approx and .ftz\n\nintroduced in PTX ISA version 1.4.\n\nFor PTX ISA version 1.4 and later, the .approx modifier is required.\n\nFor PTX ISA versions 1.0 through 1.3, lg2.f32 defaults to lg2.approx.ftz.f32.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nlg2.approx.ftz.f32  la, a;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2"
            };

        case "loc":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-loc\" target=\"_blank\" rel=\"noopener noreferrer\">loc <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Debugging Directives: .loc</h1><section id=\"debugging-directives-loc\">\n\n\n<p>Source file location.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.loc file_index line_number column_position\n.loc file_index line_number column_position,function_name label {+ immediate }, inlined_at file_index2 line_number2 column_position2\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declares the source file location (source file, line number, and column position) to be associated\nwith lexically subsequent PTX instructions. <code class=\"docutils literal notranslate\"><span class=\"pre\">.loc</span></code> refers to <code class=\"docutils literal notranslate\"><span class=\"pre\">file_index</span></code> which is defined by a\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.file</span></code> directive.</p>\n<p>To indicate PTX instructions that are generated from a function that got inlined, additional\nattribute <code class=\"docutils literal notranslate\"><span class=\"pre\">.inlined_at</span></code> can be specified as part of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.loc</span></code> directive. <code class=\"docutils literal notranslate\"><span class=\"pre\">.inlined_at</span></code>\nattribute specifies source location at which the specified function is inlined. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">file_index2</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">line_number2</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">column_position2</span></code> specify the location at which function is inlined. Source\nlocation specified as part of <code class=\"docutils literal notranslate\"><span class=\"pre\">.inlined_at</span></code> directive must lexically precede as source location in\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.loc</span></code> directive.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">function_name</span></code> attribute specifies an offset in the DWARF section named\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.debug_str</span></code>. Offset is specified as <code class=\"docutils literal notranslate\"><span class=\"pre\">label</span></code> expression or <code class=\"docutils literal notranslate\"><span class=\"pre\">label</span> <span class=\"pre\">+</span> <span class=\"pre\">immediate</span></code> expression\nwhere <code class=\"docutils literal notranslate\"><span class=\"pre\">label</span></code> is defined in <code class=\"docutils literal notranslate\"><span class=\"pre\">.debug_str</span></code> section. DWARF section <code class=\"docutils literal notranslate\"><span class=\"pre\">.debug_str</span></code> contains ASCII\nnull-terminated strings that specify the name of the function that is inlined.</p>\n<p>Note that a PTX instruction may have a single associated source location, determined by the nearest\nlexically preceding .loc directive, or no associated source location if there is no preceding .loc\ndirective. Labels in PTX inherit the location of the closest lexically following instruction. 
A\nlabel with no following PTX instruction has no associated source location.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">function_name</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">inlined_at</span></code> attributes are introduced in PTX ISA version 7.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>    .loc 2 4237 0\nL1:                        // line 4237, col 0 of file #2,\n                           // inherited from mov\n    mov.u32  %r1,%r2;      // line 4237, col 0 of file #2\n    add.u32  %r2,%r1,%r3;  // line 4237, col 0 of file #2\n...\nL2:                        // line 4239, col 5 of file #2,\n                           // inherited from sub\n    .loc 2 4239 5\n    sub.u32  %r2,%r1,%r3;  // line 4239, col 5 of file #2\n    .loc 1 21 3\n    .loc 1 9 3, function_name info_string0, inlined_at 1 21 3\n    ld.global.u32   %r1, [gg]; // Function at line 9\n    setp.lt.s32 %p1, %r1, 8;   // inlined at line 21\n    .loc 1 27 3\n    .loc 1 10 5, function_name info_string1, inlined_at 1 27 3\n    .loc 1 15 3, function_name .debug_str+16, inlined_at 1 10 5\n    setp.ne.s32 %p2, %r1, 18;\n    @%p2 bra    BB2_3;\n\n    .section .debug_str {\n    info_string0:\n     .b8 95  // _\n     .b8 90  // z\n     .b8 51  // 3\n     .b8 102 // f\n     .b8 111 // o\n     .b8 111 // o\n     .b8 118 // v\n     .b8 0\n\n    info_string1:\n     .b8 95  // _\n     .b8 90  // z\n     .b8 51  // 3\n     .b8 98  // b\n     .b8 97  // a\n     .b8 114 // r\n     .b8 118 // v\n     .b8 0\n     .b8 95  // _\n     .b8 90  // z\n     .b8 51  // 3\n     .b8 99  // c\n     .b8 97  // a\n     .b8 114 // r\n     .b8 118 // v\n     .b8 0\n    }\n</pre></div>\n</div>\n</section>",
                "tooltip": "Source file location.\n\nSyntax\n\n.loc file_index line_number column_position\n\n.loc file_index line_number column_position,function_name label {+ immediate }, inlined_at file_index2 line_number2 column_position2\n\nDescription\n\nDeclares the source file location (source file, line number, and column position) to be associated\n\nwith lexically subsequent PTX instructions. .loc refers to file_index which is defined by a\n\n.file directive.\n\nTo indicate PTX instructions that are generated from a function that got inlined, additional\n\nattribute .inlined_at can be specified as part of the .loc directive. .inlined_at\n\nattribute specifies source location at which the specified function is inlined. file_index2,\n\nline_number2, and column_position2 specify the location at which function is inlined. Source\n\nlocation specified as part of .inlined_at directive must lexically precede as source location in\n\n.loc directive.\n\nThe function_name attribute specifies an offset in the DWARF section named\n\n.debug_str. Offset is specified as label expression or label + immediate expression\n\nwhere label is defined in .debug_str section. DWARF section .debug_str contains ASCII\n\nnull-terminated strings that specify the name of the function that is inlined.\n\nNote that a PTX instruction may have a single associated source location, determined by the nearest\n\nlexically preceding .loc directive, or no associated source location if there is no preceding .loc\n\ndirective. Labels in PTX inherit the location of the closest lexically following instruction. 
A\n\nlabel with no following PTX instruction has no associated source location.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nfunction_name and inlined_at attributes are introduced in PTX ISA version 7.2.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n    .loc 2 4237 0\n\nL1:                        // line 4237, col 0 of file #2,\n\n                           // inherited from mov\n\n    mov.u32  %r1,%r2;      // line 4237, col 0 of file #2\n\n    add.u32  %r2,%r1,%r3;  // line 4237, col 0 of file #2\n\n...\n\nL2:                        // line 4239, col 5 of file #2,\n\n                           // inherited from sub\n\n    .loc 2 4239 5\n\n    sub.u32  %r2,%r1,%r3;  // line 4239, col 5 of file #2\n\n    .loc 1 21 3\n\n    .loc 1 9 3, function_name info_string0, inlined_at 1 21 3\n\n    ld.global.u32   %r1, [gg]; // Function at line 9\n\n    setp.lt.s32 %p1, %r1, 8;   // inlined at line 21\n\n    .loc 1 27 3\n\n    .loc 1 10 5, function_name info_string1, inlined_at 1 27 3\n\n    .loc 1 15 3, function_name .debug_str+16, inlined_at 1 10 5\n\n    setp.ne.s32 %p2, %r1, 18;\n\n    @%p2 bra    BB2_3;\n\n    .section .debug_str {\n\n    info_string0:\n\n     .b8 95  // _\n\n     .b8 90  // z\n\n     .b8 51  // 3\n\n     .b8 102 // f\n\n     .b8 111 // o\n\n     .b8 111 // o\n\n     .b8 118 // v\n\n     .b8 0\n\n    info_string1:\n\n     .b8 95  // _\n\n     .b8 90  // z\n\n     .b8 51  // 3\n\n     .b8 98  // b\n\n     .b8 97  // a\n\n     .b8 114 // r\n\n     .b8 118 // v\n\n     .b8 0\n\n     .b8 95  // _\n\n     .b8 90  // z\n\n     .b8 51  // 3\n\n     .b8 99  // c\n\n     .b8 97  // a\n\n     .b8 114 // r\n\n     .b8 118 // v\n\n     .b8 0\n\n    }\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-loc"
            };

        case "lop3":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3\" target=\"_blank\" rel=\"noopener noreferrer\">lop3 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Logic and Shift Instructions: lop3</h1><section id=\"logic-and-shift-instructions-lop3\">\n\n\n<p>Arbitrary logical operation on 3 inputs.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>lop3.b32 d, a, b, c, immLut;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute bitwise logical operation on inputs <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> and store the result in destination\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>The logical operation is defined by a look-up table which, for 3 inputs, can be represented as an\n8-bit value specified by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">immLut</span></code> as described below. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">immLut</span></code> is an integer constant\nthat can take values from 0 to 255, thereby allowing up to 256 distinct logical operations on inputs\n<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>.</p>\n<p>For a logical operation <code class=\"docutils literal notranslate\"><span class=\"pre\">F(a,</span> <span class=\"pre\">b,</span> <span class=\"pre\">c)</span></code> the value of <code class=\"docutils literal notranslate\"><span class=\"pre\">immLut</span></code> can be computed by applying the same\noperation to three predefined constant values as follows:</p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>ta = 0xF0;\ntb = 0xCC;\ntc = 0xAA;\n\nimmLut = F(ta, tb, tc);\n</pre></div>\n</div>\n<p>Examples:</p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>If F = (a &amp; b &amp; c);\nimmLut = 0xF0 &amp; 0xCC &amp; 0xAA = 0x80\n\nIf F = (a | b | c);\nimmLut = 0xF0 | 0xCC | 0xAA = 0xFE\n\nIf F = (a &amp; b &amp; ~c);\nimmLut = 0xF0 &amp; 0xCC &amp; (~0xAA) = 0x40\n\nIf F = ((a &amp; b | c) ^ a);\nimmLut = (0xF0 &amp; 0xCC | 0xAA) ^ 0xF0 = 0x1A\n</pre></div>\n</div>\n<p>The following table illustrates computation of <code class=\"docutils literal notranslate\"><span class=\"pre\">immLut</span></code> for various logical operations:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 9%\"/>\n<col style=\"width: 3%\"/>\n<col style=\"width: 3%\"/>\n<col style=\"width: 12%\"/>\n<col style=\"width: 17%\"/>\n<col style=\"width: 18%\"/>\n<col style=\"width: 7%\"/>\n<col style=\"width: 20%\"/>\n<col style=\"width: 13%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th 
class=\"head\"><p>ta</p></th>\n<th class=\"head\"><p>tb</p></th>\n<th class=\"head\"><p>tc</p></th>\n<th class=\"head\"><p>Oper 0 (False)</p></th>\n<th class=\"head\"><p>Oper 1 (ta &amp; tb &amp; tc)</p></th>\n<th class=\"head\"><p>Oper 2 (ta &amp; tb &amp; ~tc)</p></th>\n<th class=\"head\"><p>\u2026</p></th>\n<th class=\"head\"><p>Oper 254 (ta | tb | tc)</p></th>\n<th class=\"head\"><p>Oper 255 (True)</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td rowspan=\"8\"><p>\u2026</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>1</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>1</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-even\"><td 
colspan=\"3\"><p><strong>immLut</strong></p></td>\n<td><p><strong>0x0</strong></p></td>\n<td><p><strong>0x80</strong></p></td>\n<td><p><strong>0x40</strong></p></td>\n<td><p><strong>\u2026</strong></p></td>\n<td><p><strong>0xFE</strong></p></td>\n<td><p><strong>0xFF</strong></p></td>\n</tr>\n</tbody>\n</table>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>F = GetFunctionFromTable(immLut); // returns the function corresponding to immLut value\nd = F(a, b, c);\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 4.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_50</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>lop3.b32  d, a, b, c, 0x40;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Arbitrary logical operation on 3 inputs.\n\nSyntax\n\nlop3.b32 d, a, b, c, immLut;\n\nDescription\n\nCompute bitwise logical operation on inputs a, b, c and store the result in destination\n\nd.\n\nThe logical operation is defined by a look-up table which, for 3 inputs, can be represented as an\n\n8-bit value specified by operand immLut as described below. immLut is an integer constant\n\nthat can take values from 0 to 255, thereby allowing up to 256 distinct logical operations on inputs\n\na, b, c.\n\nFor a logical operation F(a, b, c) the value of immLut can be computed by applying the same\n\noperation to three predefined constant values as follows:\n\nta = 0xF0;\n\ntb = 0xCC;\n\ntc = 0xAA;\n\nimmLut = F(ta, tb, tc);\n\nExamples:\n\nIf F = (a & b & c);\n\nimmLut = 0xF0 & 0xCC & 0xAA = 0x80\n\nIf F = (a | b | c);\n\nimmLut = 0xF0 | 0xCC | 0xAA = 0xFE\n\nIf F = (a & b & ~c);\n\nimmLut = 0xF0 & 0xCC & (~0xAA) = 0x40\n\nIf F = ((a & b | c) ^ a);\n\nimmLut = (0xF0 & 0xCC | 0xAA) ^ 0xF0 = 0x1A\n\nThe following table illustrates computation of immLut for various logical operations:\n\n\n\n\n\n\n\nta\n\ntb\n\ntc\n\nOper 0 (False)\n\nOper 1 (ta & tb & tc)\n\nOper 2 (ta & tb & ~tc)\n\n\u2026\n\nOper 254 (ta | tb | tc)\n\nOper 255 (True)\n\n0\n\n0\n\n0\n\n0\n\n0\n\n0\n\n\u2026\n\n0\n\n1\n\n0\n\n0\n\n1\n\n0\n\n0\n\n0\n\n1\n\n1\n\n0\n\n1\n\n0\n\n0\n\n0\n\n0\n\n1\n\n1\n\n0\n\n1\n\n1\n\n0\n\n0\n\n0\n\n1\n\n1\n\n1\n\n0\n\n0\n\n0\n\n0\n\n0\n\n1\n\n1\n\n1\n\n0\n\n1\n\n0\n\n0\n\n0\n\n1\n\n1\n\n1\n\n1\n\n0\n\n0\n\n0\n\n1\n\n1\n\n1\n\n1\n\n1\n\n1\n\n0\n\n1\n\n0\n\n1\n\n1\n\nimmLut\n\n0x0\n\n0x80\n\n0x40\n\n\u2026\n\n0xFE\n\n0xFF\n\nSemantics\n\nF = GetFunctionFromTable(immLut); // returns the function corresponding to immLut value\n\nd = F(a, b, c);\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 4.3.\n\nTarget ISA Notes\n\nRequires sm_50 or higher.\n\nExamples\n\nlop3.b32  d, a, b, c, 0x40;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3"
            };

        case "mad":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad\" target=\"_blank\" rel=\"noopener noreferrer\">mad(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mad\" target=\"_blank\" rel=\"noopener noreferrer\">mad(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-mad-cc\" target=\"_blank\" rel=\"noopener noreferrer\">mad.cc <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: mad</h1><section id=\"floating-point-instructions-mad\">\n\n\n<p>Multiply two values and add a third value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mad{.ftz}{.sat}.f32      d, a, b, c;    // .target sm_1x\nmad.rnd{.ftz}{.sat}.f32  d, a, b, c;    // .target sm_20\nmad.rnd.f64              d, a, b, c;    // .target sm_13 and higher\n\n.rnd = { .rn, .rz, .rm, .rp };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Multiplies two values and adds a third, and then writes the resulting value into a destination\nregister.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a*b + c;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span> <span class=\"pre\">sm_20</span></code> and higher:</p>\n<ul class=\"simple\">\n<li><p><code class=\"docutils literal notranslate\"><span 
class=\"pre\">mad.f32</span></code> computes the product of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to infinite precision and then adds <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> to\nthis product, again in infinite precision. The resulting value is then rounded to single precision\nusing the rounding mode specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.rnd</span></code>.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code> computes the product of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to infinite precision and then adds <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> to\nthis product, again in infinite precision. The resulting value is then rounded to double precision\nusing the rounding mode specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.rnd</span></code>.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.{f32,f64}</span></code> is the same as <code class=\"docutils literal notranslate\"><span class=\"pre\">fma.{f32,f64}</span></code>.</p></li>\n</ul>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span> <span class=\"pre\">sm_1x</span></code>:</p>\n<ul class=\"simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> computes the product of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> at double precision, and then the mantissa is\ntruncated to 23 bits, but the exponent is preserved. 
Note that this is different from computing\nthe product with <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code>, where the mantissa can be rounded and the exponent will be clamped. The\nexception for <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> is when <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span> <span class=\"pre\">=</span> <span class=\"pre\">+/-0.0</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> is identical to the result computed\nusing separate mul and add instructions. When JIT-compiled for SM 2.0 devices, <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> is\nimplemented as a fused multiply-add (i.e., <code class=\"docutils literal notranslate\"><span class=\"pre\">fma.rn.ftz.f32</span></code>). In this case, <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> can\nproduce slightly different numeric results and backward compatibility is not guaranteed in this\ncase.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code> computes the product of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to infinite precision and then adds <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> to\nthis product, again in infinite precision. The resulting value is then rounded to double precision\nusing the rounding mode specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.rnd</span></code>. 
Unlike <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code>, the treatment of subnormal\ninputs and output follows IEEE 754 standard.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code> is the same as <code class=\"docutils literal notranslate\"><span class=\"pre\">fma.f64</span></code>.</p></li>\n</ul>\n<p>Rounding modifiers (no default):</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt><dd><p>mantissa LSB rounds to nearest even</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt><dd><p>mantissa LSB rounds towards zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code></dt><dd><p>mantissa LSB rounds towards negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt><dd><p>mantissa LSB rounds towards positive infinity</p>\n</dd>\n</dl>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>Saturation modifier:</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.sat.f32</span></code> clamps the result to [0.0, 1.0]. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> results are flushed to <code class=\"docutils literal notranslate\"><span class=\"pre\">+0.0f</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p>In PTX ISA versions 1.4 and later, a rounding modifier is required for <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code>.</p>\n<p>Legacy <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code> instructions having no rounding modifier will map to <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.rn.f64</span></code>.</p>\n<p>In PTX ISA versions 2.0 and later, a rounding modifier is required for <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> and higher targets.</p>\n<p><strong>Errata</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> requires a rounding modifier for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> and higher targets. However for PTX ISA\nversion 3.0 and earlier, ptxas does not enforce this requirement and <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> silently defaults\nto <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.rn.f32</span></code>. 
For PTX ISA version 3.1, ptxas generates a warning and defaults to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mad.rn.f32</span></code>, and in subsequent releases ptxas will enforce the requirement for PTX ISA version\n3.2 and later.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>Rounding modifiers have the following target requirements:</p>\n<ul class=\"simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>,<code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code>,<code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code>,<code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code> for <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f64</span></code>, requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>,<code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code>,<code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code>,<code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code> for <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.f32</span></code>, requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p></li>\n</ul>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>@p  mad.f32  
d,a,b,c;\n</pre></div>\n</div>\n</section>\n<h1>Integer Arithmetic Instructions: mad</h1><section id=\"integer-arithmetic-instructions-mad\">\n\n\n<p>Multiply two values, optionally extract the high or low half of the intermediate result, and add a third value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mad.mode.type  d, a, b, c;\nmad.hi.sat.s32 d, a, b, c;\n\n.mode = { .hi, .lo, .wide };\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Multiplies two values, optionally extracts the high or low half of the intermediate result, and adds\na third value. Writes the result into a destination register.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>t = a * b;\nn = bitwidth of type;\nd = t + c;           // for .wide\nd = t&lt;2n-1..n&gt; + c;  // for .hi variant\nd = t&lt;n-1..0&gt; + c;   // for .lo variant\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>The type of the operation represents the types of the <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> operands. If .hi or .lo is\nspecified, then <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> are the same size as <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, and either the upper or lower\nhalf of the result is written to the destination register. 
If <code class=\"docutils literal notranslate\"><span class=\"pre\">.wide</span></code> is specified, then <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> are twice as wide as <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to receive the result of the multiplication.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.wide</span></code> suffix is supported only for 16-bit and 32-bit integer types.</p>\n<p>Saturation modifier:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.sat</span></code></dt><dd><p>limits result to <code class=\"docutils literal notranslate\"><span class=\"pre\">MININT..MAXINT</span></code> (no overflow) for the size of the operation.</p>\n<p>Applies only to <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> type in <code class=\"docutils literal notranslate\"><span class=\"pre\">.hi</span></code> mode.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>@p  mad.lo.s32 d,a,b,c;\n    mad.lo.s32 r,p,q,r;\n</pre></div>\n</div>\n</section>\n<h1>Extended-Precision Arithmetic Instructions: mad.cc</h1><section id=\"extended-precision-arithmetic-instructions-mad-cc\">\n\n\n<p>Multiply two values, extract high or low half of result, and add a third value with carry-out.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mad{.hi,.lo}.cc.type  d, a, b, c;\n\n.type = { .u32, .s32, .u64, .s64 
};\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Multiplies two values, extracts either the high or low part of the result, and adds a third\nvalue. Writes the result to the destination register and the carry-out from the addition into the\ncondition code register.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>t = a * b;\nd = t&lt;63..32&gt; + c;    // for .hi variant\nd = t&lt;31..0&gt; + c;     // for .lo variant\n</pre></div>\n</div>\n<p>carry-out from addition is written to <code class=\"docutils literal notranslate\"><span class=\"pre\">CC.CF</span></code></p>\n<p><strong>Notes</strong></p>\n<p>Generally used in combination with <code class=\"docutils literal notranslate\"><span class=\"pre\">madc</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">addc</span></code> to implement extended-precision multi-word\nmultiplication. See <code class=\"docutils literal notranslate\"><span class=\"pre\">madc</span></code> for an example.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.cc</span></code> introduced in PTX ISA version 3.0.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.cc</span></code> introduced in PTX ISA version 4.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>@p  mad.lo.cc.u32 d,a,b,c;\n    mad.lo.cc.u32 r,p,q,r;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: mad\n\n\n\nMultiply two values and add a third value.\n\nSyntax\n\nmad{.ftz}{.sat}.f32      d, a, b, c;    // .target sm_1x\n\nmad.rnd{.ftz}{.sat}.f32  d, a, b, c;    // .target sm_20\n\nmad.rnd.f64              d, a, b, c;    // .target sm_13 and higher\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nMultiplies two values and adds a third, and then writes the resulting value into a destination\n\nregister.\n\nSemantics\n\nd = a*b + ...\n\n=====Integer Arithmetic Instructions: mad\n\n\n\nMultiply two values, optionally extract the high or low half of the intermediate result, and add a third value.\n\nSyntax\n\nmad.mode.type  d, a, b, c;\n\nmad.hi.sat.s32 d, a, b, c;\n\n.mode = { .hi, .lo, .wide };\n\n.type = { .u16, .u32, .u64,\n\n          .s16, .s32, .s64 };\n\nDescription\n\nMultiplies two values, optionally extracts the high or low half of the intermediate result, and adds\n\na third value. Writes the r...\n\n=====Extended-Precision Arithmetic Instructions: mad.cc\n\n\n\nMultiply two values, extract high or low half of result, and add a third value with carry-out.\n\nSyntax\n\nmad{.hi,.lo}.cc.type  d, a, b, c;\n\n.type = { .u32, .s32, .u64, .s64 };\n\nDescription\n\nMultiplies two values, extracts either the high or low part of the result, and adds a third\n\nvalue. Writes the result to the destination register and the carry-out from the addition into the\n\ncondition code register.\n\nS... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mad"
            };

        case "mad24":
            // Documentation entry for the PTX `mad24` instruction (24-bit integer multiply-add).
            // `html` holds the rendered reference section, `tooltip` the plain-text version of the
            // same section, and `url` the link to that section in NVIDIA's PTX ISA reference.
            // In the tooltip, the definition-list term `.sat` is separated from its description
            // with ": " so the modifier name stays readable.
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mad24\" target=\"_blank\" rel=\"noopener noreferrer\">mad24(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: mad24</h1><section id=\"integer-arithmetic-instructions-mad24\">\n\n\n<p>Multiply two 24-bit integer values and add a third value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mad24.mode.type  d, a, b, c;\nmad24.hi.sat.s32 d, a, b, c;\n\n.mode = { .hi, .lo };\n.type = { .u32, .s32 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute the product of two 24-bit integer values held in 32-bit source registers, and add a third,\n32-bit value to either the high or low 32-bits of the 48-bit result. Return either the high or low\n32-bits of the 48-bit result.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>t = a * b;\nd = t&lt;47..16&gt; + c;   // for .hi variant\nd = t&lt;31..0&gt; + c;    // for .lo variant\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Integer multiplication yields a result that is twice the size of the input operands, i.e., 48-bits.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad24.hi</span></code> performs a 24x24-bit multiply and adds the high 32 bits of the 48-bit result to a third\nvalue.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad24.lo</span></code> performs a 24x24-bit multiply and adds the low 32 bits of the 48-bit result to a third\nvalue.</p>\n<p>All operands are of the same type and size.</p>\n<p>Saturation modifier:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span 
class=\"pre\">.sat</span></code></dt><dd><p>limits result of 32-bit signed addition to <code class=\"docutils literal notranslate\"><span class=\"pre\">MININT..MAXINT</span></code> (no overflow). Applies only to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> type in .hi mode.</p>\n</dd>\n</dl>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mad24.hi</span></code> may be less efficient on machines without hardware support for 24-bit multiply.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mad24.lo.s32 d,a,b,c;   // low 32-bits of 24x24-bit signed multiply.\n</pre></div>\n</div>\n</section>",
                "tooltip": "Multiply two 24-bit integer values and add a third value.\n\nSyntax\n\nmad24.mode.type  d, a, b, c;\n\nmad24.hi.sat.s32 d, a, b, c;\n\n.mode = { .hi, .lo };\n\n.type = { .u32, .s32 };\n\nDescription\n\nCompute the product of two 24-bit integer values held in 32-bit source registers, and add a third,\n\n32-bit value to either the high or low 32-bits of the 48-bit result. Return either the high or low\n\n32-bits of the 48-bit result.\n\nSemantics\n\nt = a * b;\n\nd = t<47..16> + c;   // for .hi variant\n\nd = t<31..0> + c;    // for .lo variant\n\nNotes\n\nInteger multiplication yields a result that is twice the size of the input operands, i.e., 48-bits.\n\nmad24.hi performs a 24x24-bit multiply and adds the high 32 bits of the 48-bit result to a third\n\nvalue.\n\nmad24.lo performs a 24x24-bit multiply and adds the low 32 bits of the 48-bit result to a third\n\nvalue.\n\nAll operands are of the same type and size.\n\nSaturation modifier:\n\n.sat: limits result of 32-bit signed addition to MININT..MAXINT (no overflow). Applies only to\n\n.s32 type in .hi mode.\n\nmad24.hi may be less efficient on machines without hardware support for 24-bit multiply.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmad24.lo.s32 d,a,b,c;   // low 32-bits of 24x24-bit signed multiply.\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mad24"
            };

        case "madc":
            // Documentation entry for the PTX `madc` instruction (extended-precision multiply-add
            // with carry-in and optional carry-out). `html` holds the rendered reference section,
            // `tooltip` the plain-text version of the same section, and `url` the link to that
            // section in NVIDIA's PTX ISA reference.
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc\" target=\"_blank\" rel=\"noopener noreferrer\">madc <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Extended-Precision Arithmetic Instructions: madc</h1><section id=\"extended-precision-arithmetic-instructions-madc\">\n\n\n<p>Multiply two values, extract high or low half of result, and add a third value with carry-in and\noptional carry-out.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>madc{.hi,.lo}{.cc}.type  d, a, b, c;\n\n.type = { .u32, .s32, .u64, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Multiplies two values, extracts either the high or low part of the result, and adds a third value\nalong with carry-in. Writes the result to the destination register and optionally writes the\ncarry-out from the addition into the condition code register.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>t = a * b;\nd = t&lt;63..32&gt; + c + CC.CF;     // for .hi variant\nd = t&lt;31..0&gt; + c + CC.CF;      // for .lo variant\n</pre></div>\n</div>\n<p>if <code class=\"docutils literal notranslate\"><span class=\"pre\">.cc</span></code> specified, carry-out from addition is written to <code class=\"docutils literal notranslate\"><span class=\"pre\">CC.CF</span></code></p>\n<p><strong>Notes</strong></p>\n<p>Generally used in combination with <code class=\"docutils literal notranslate\"><span class=\"pre\">mad.cc</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">addc</span></code> to implement extended-precision\nmulti-word multiplication. 
See example below.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">madc</span></code> introduced in PTX ISA version 3.0.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">madc</span></code> introduced in PTX ISA version 4.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extended-precision multiply:  [r3,r2,r1,r0] = [r5,r4] * [r7,r6]\nmul.lo.u32     r0,r4,r6;      // r0=(r4*r6).[31:0], no carry-out\nmul.hi.u32     r1,r4,r6;      // r1=(r4*r6).[63:32], no carry-out\nmad.lo.cc.u32  r1,r5,r6,r1;   // r1+=(r5*r6).[31:0], may carry-out\nmadc.hi.u32    r2,r5,r6,0;    // r2 =(r5*r6).[63:32]+carry-in,\n                              // no carry-out\nmad.lo.cc.u32   r1,r4,r7,r1;  // r1+=(r4*r7).[31:0], may carry-out\nmadc.hi.cc.u32  r2,r4,r7,r2;  // r2+=(r4*r7).[63:32]+carry-in,\n                              // may carry-out\naddc.u32        r3,0,0;       // r3 = carry-in, no carry-out\nmad.lo.cc.u32   r2,r5,r7,r2;  // r2+=(r5*r7).[31:0], may carry-out\nmadc.hi.u32     r3,r5,r7,r3;  // r3+=(r5*r7).[63:32]+carry-in\n</pre></div>\n</div>\n</section>",
                "tooltip": "Multiply two values, extract high or low half of result, and add a third value with carry-in and\n\noptional carry-out.\n\nSyntax\n\nmadc{.hi,.lo}{.cc}.type  d, a, b, c;\n\n.type = { .u32, .s32, .u64, .s64 };\n\nDescription\n\nMultiplies two values, extracts either the high or low part of the result, and adds a third value\n\nalong with carry-in. Writes the result to the destination register and optionally writes the\n\ncarry-out from the addition into the condition code register.\n\nSemantics\n\nt = a * b;\n\nd = t<63..32> + c + CC.CF;     // for .hi variant\n\nd = t<31..0> + c + CC.CF;      // for .lo variant\n\nif .cc specified, carry-out from addition is written to CC.CF\n\nNotes\n\nGenerally used in combination with mad.cc and addc to implement extended-precision\n\nmulti-word multiplication. See example below.\n\nPTX ISA Notes\n\n32-bit madc introduced in PTX ISA version 3.0.\n\n64-bit madc introduced in PTX ISA version 4.3.\n\nTarget ISA Notes\n\nRequires target sm_20 or higher.\n\nExamples\n\n// extended-precision multiply:  [r3,r2,r1,r0] = [r5,r4] * [r7,r6]\n\nmul.lo.u32     r0,r4,r6;      // r0=(r4*r6).[31:0], no carry-out\n\nmul.hi.u32     r1,r4,r6;      // r1=(r4*r6).[63:32], no carry-out\n\nmad.lo.cc.u32  r1,r5,r6,r1;   // r1+=(r5*r6).[31:0], may carry-out\n\nmadc.hi.u32    r2,r5,r6,0;    // r2 =(r5*r6).[63:32]+carry-in,\n\n                              // no carry-out\n\nmad.lo.cc.u32   r1,r4,r7,r1;  // r1+=(r4*r7).[31:0], may carry-out\n\nmadc.hi.cc.u32  r2,r4,r7,r2;  // r2+=(r4*r7).[63:32]+carry-in,\n\n                              // may carry-out\n\naddc.u32        r3,0,0;       // r3 = carry-in, no carry-out\n\nmad.lo.cc.u32   r2,r5,r7,r2;  // r2+=(r5*r7).[31:0], may carry-out\n\nmadc.hi.u32     r3,r5,r7,r3;  // r3+=(r5*r7).[63:32]+carry-in\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc"
            };

        case "mapa":
            // Documentation entry for the PTX `mapa` instruction (map a shared-memory address into
            // a target CTA). `html` holds the rendered reference section, `tooltip` the plain-text
            // version of the same section, and `url` the link to that section in NVIDIA's PTX ISA
            // reference.
            // The two "shared memory variable" syntax forms are spelled `mapa`, matching the
            // section title, the generic form `mapa{.space}.type`, and the examples.
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa\" target=\"_blank\" rel=\"noopener noreferrer\">mapa <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: mapa</h1><section id=\"data-movement-and-conversion-instructions-mapa\">\n\n\n<p>Map the address of the shared variable in the target CTA.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mapa{.space}.type          d, a, b;\n\n// Maps shared memory address in register a into CTA b.\nmapa.shared::cluster.type  d, a, b;\n\n// Maps shared memory variable into CTA b.\nmapa.shared::cluster.type  d, sh, b;\n\n// Maps shared memory variable into CTA b.\nmapa.shared::cluster.type  d, sh + imm, b;\n\n// Maps generic address in register a into CTA b.\nmapa.type                  d, a, b;\n\n.space = { .shared::cluster }\n.type  = { .u32, .u64 }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Get address in the CTA specified by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> which corresponds to the address specified by\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>Instruction type <code class=\"docutils literal notranslate\"><span class=\"pre\">.type</span></code> indicates the type of the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and the source\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>When space is <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code>, source <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is either a shared 
memory variable or a register\ncontaining a valid shared memory address and register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> contains a shared memory address. When\nthe optional qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.space</span></code> is not specified, both <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> are registers containing\ngeneric addresses pointing to shared memory.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is a 32-bit integer operand representing the rank of the target CTA.</p>\n<p>Destination register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> will hold an address in CTA <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> corresponding to operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mapa.shared::cluster.u64 d1, %reg1, cta;\nmapa.shared::cluster.u32 d2, sh, 3;\nmapa.u64                 d3, %reg2, cta;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Map the address of the shared variable in the target CTA.\n\nSyntax\n\nmapa{.space}.type          d, a, b;\n\n// Maps shared memory address in register a into CTA b.\n\nmapa.shared::cluster.type  d, a, b;\n\n// Maps shared memory variable into CTA b.\n\nmapa.shared::cluster.type  d, sh, b;\n\n// Maps shared memory variable into CTA b.\n\nmapa.shared::cluster.type  d, sh + imm, b;\n\n// Maps generic address in register a into CTA b.\n\nmapa.type                  d, a, b;\n\n.space = { .shared::cluster }\n\n.type  = { .u32, .u64 }\n\nDescription\n\nGet address in the CTA specified by operand b which corresponds to the address specified by\n\noperand a.\n\nInstruction type .type indicates the type of the destination operand d and the source\n\noperand a.\n\nWhen space is .shared::cluster, source a is either a shared memory variable or a register\n\ncontaining a valid shared memory address and register d contains a shared memory address. When\n\nthe optional qualifier .space is not specified, both a and d are registers containing\n\ngeneric addresses pointing to shared memory.\n\nb is a 32-bit integer operand representing the rank of the target CTA.\n\nDestination register d will hold an address in CTA b corresponding to operand a.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\nmapa.shared::cluster.u64 d1, %reg1, cta;\n\nmapa.shared::cluster.u32 d2, sh, 3;\n\nmapa.u64                 d3, %reg2, cta;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa"
            };

        case "match":
            // Documentation entry for the PTX `match.sync` instruction (broadcast and compare a
            // value across threads in a warp). `html` holds the rendered reference section,
            // `tooltip` the plain-text version of the same section, and `url` the link to that
            // section in NVIDIA's PTX ISA reference.
            // In the tooltip, the mode names `.all` and `.any` are separated from their
            // descriptions with ": " so they do not read as `.alld` / `.anyd`.
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync\" target=\"_blank\" rel=\"noopener noreferrer\">match.sync <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: match.sync</h1><section id=\"parallel-synchronization-and-communication-instructions-match-sync\">\n\n\n<p>Broadcast and compare a value across threads in warp.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>match.any.sync.type  d, a, membermask;\nmatch.all.sync.type  d[|p], a, membermask;\n\n.type = { .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">match.sync</span></code> will cause executing thread to wait until all non-exited threads from <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>\nhave executed <code class=\"docutils literal notranslate\"><span class=\"pre\">match.sync</span></code> with the same qualifiers and same <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> value before resuming\nexecution.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> specifies a 32-bit integer which is a mask indicating threads participating\nin this instruction where the bit position corresponds to thread\u2019s laneid.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">match.sync</span></code> performs broadcast and compare of operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> across all non-exited threads in\n<code class=\"docutils literal notranslate\"><span 
class=\"pre\">membermask</span></code> and sets destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and optional predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> based on mode.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> has instruction type and <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p>Destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is a 32-bit mask where bit position in mask corresponds to thread\u2019s laneid.</p>\n<p>The matching operation modes are:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.all</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is set to mask corresponding to non-exited threads in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> if all non-exited\nthreads in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> have same value of operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>; otherwise <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is set\nto 0. Optionally predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is set to true if all non-exited threads in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> have\nsame value of operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>; otherwise <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is set to false. 
The sink symbol \u2018_\u2019 may be used in\nplace of any one of the destination operands.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.any</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is set to mask of non-exited threads in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> that have same value of operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n</dd>\n</dl>\n<p>The behavior of <code class=\"docutils literal notranslate\"><span class=\"pre\">match.sync</span></code> is undefined if the executing thread is not in the <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p><strong>Release Notes</strong></p>\n<p>Note that <code class=\"docutils literal notranslate\"><span class=\"pre\">match.sync</span></code> applies to threads in a single warp, not across an entire CTA.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>match.any.sync.b32    d, a, 0xffffffff;\nmatch.all.sync.b64    d|p, a, mask;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Broadcast and compare a value across threads in warp.\n\nSyntax\n\nmatch.any.sync.type  d, a, membermask;\n\nmatch.all.sync.type  d[|p], a, membermask;\n\n.type = { .b32, .b64 };\n\nDescription\n\nmatch.sync will cause executing thread to wait until all non-exited threads from membermask\n\nhave executed match.sync with the same qualifiers and same membermask value before resuming\n\nexecution.\n\nOperand membermask specifies a 32-bit integer which is a mask indicating threads participating\n\nin this instruction where the bit position corresponds to thread\u2019s laneid.\n\nmatch.sync performs broadcast and compare of operand a across all non-exited threads in\n\nmembermask and sets destination d and optional predicate p based on mode.\n\nOperand a has instruction type and d has .b32 type.\n\nDestination d is a 32-bit mask where bit position in mask corresponds to thread\u2019s laneid.\n\nThe matching operation modes are:\n\n.all: d is set to mask corresponding to non-exited threads in membermask if all non-exited\n\nthreads in membermask have same value of operand a; otherwise d is set\n\nto 0. Optionally predicate p is set to true if all non-exited threads in membermask have\n\nsame value of operand a; otherwise p is set to false. The sink symbol \u2018_\u2019 may be used in\n\nplace of any one of the destination operands.\n\n.any: d is set to mask of non-exited threads in membermask that have same value of operand\n\na.\n\nThe behavior of match.sync is undefined if the executing thread is not in the membermask.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.0.\n\nTarget ISA Notes\n\nRequires sm_70 or higher.\n\nRelease Notes\n\nNote that match.sync applies to threads in a single warp, not across an entire CTA.\n\nExamples\n\nmatch.any.sync.b32    d, a, 0xffffffff;\n\nmatch.all.sync.b64    d|p, a, mask;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync"
            };

        case "max":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max\" target=\"_blank\" rel=\"noopener noreferrer\">max(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-max\" target=\"_blank\" rel=\"noopener noreferrer\">max(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-max\" target=\"_blank\" rel=\"noopener noreferrer\">max(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: max</h1><section id=\"floating-point-instructions-max\">\n\n\n<p>Find the maximum of two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>max{.ftz}{.NaN}{.xorsign.abs}.f32  d, a, b;\nmax.f64                            d, a, b;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Store the maximum of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.NaN</span></code> modifier is specified, the result is canonical <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> if either of the inputs is\n<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n<p>If <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.abs</span></code> modifier is specified, the magnitude of destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is the maximum of\nabsolute values of both the input arguments.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> modifier is specified, the sign bit of destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is equal to the XOR of the\nsign bits of both the inputs.</p>\n<p>Modifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> must be specified together and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> considers the sign\nbit of both inputs before applying <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> operation.</p>\n<p>If the result of <code class=\"docutils literal notranslate\"><span class=\"pre\">max</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> then the <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> modifiers will be ignored.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (.xorsign) {\n    xorsign = getSignBit(a) ^ getSignBit(b);\n    if (.abs) {\n        a = |a|;\n        b = |b|;\n    }\n}\nif (isNaN(a) &amp;&amp; isNaN(b))                 d = NaN;\nelse if (.NaN &amp;&amp; (isNaN(a) || isNaN(b)))  d = NaN;\nelse if (isNaN(a))                        d = b;\nelse if (isNaN(b))                        d = a;\nelse                                      d = (a &gt; b) ? 
a : b;\nif (.xorsign &amp;&amp; !isNaN(d)) {\n    setSignBit(d, xorsign);\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>If values of both inputs are 0.0, then +0.0 &gt; -0.0.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.NaN</span></code>introduced in PTX ISA version 7.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.xorsign.abs</span></code> introduced in PTX ISA version 7.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.NaN</span></code>requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.xorsign.abs</span></code> requires <code class=\"docutils literal 
notranslate\"><span class=\"pre\">sm_86</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>max.ftz.f32  f0,f1,f2;\nmax.f64      a,b,c;\n// fp32 max with .NaN\nmax.NaN.f32  f0,f1,f2;\n// fp32 max with .xorsign.abs\nmax.xorsign.abs.f32 Rd, Ra, Rb;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: max</h1><section id=\"half-precision-floating-point-instructions-max\">\n\n\n<p>Find the maximum of two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>max{.ftz}{.NaN}{.xorsign.abs}.f16      d, a, b;\nmax{.ftz}{.NaN}{.xorsign.abs}.f16x2    d, a, b;\nmax{.NaN}{.xorsign.abs}.bf16           d, a, b;\nmax{.NaN}{.xorsign.abs}.bf16x2         d, a, b;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Store the maximum of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction types, input vectors are formed with half-word values\nfrom source operands. 
Half-word operands are then processed in parallel to store <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> result in destination.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. 
For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction\ntype, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.NaN</span></code> modifier is specified, the result is canonical <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> if either of the inputs is\n<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> modifier is specified, the magnitude of destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is the maximum of\nabsolute values of both the input arguments.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> modifier is specified, the sign bit of destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is equal to the XOR of the\nsign bits of both the inputs.</p>\n<p>Modifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> must be specified together and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> considers the sign\nbit of both inputs before applying <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> operation.</p>\n<p>If the result of <code class=\"docutils literal notranslate\"><span class=\"pre\">max</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> then the <code class=\"docutils 
literal notranslate\"><span class=\"pre\">.xorsign</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> modifiers will be ignored.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (type == f16 || type == bf16) {\n    if (.xorsign) {\n        xorsign = getSignBit(a) ^ getSignBit(b);\n        if (.abs) {\n            a = |a|;\n            b = |b|;\n        }\n    }\n    if (isNaN(a) &amp;&amp; isNaN(b))              d = NaN;\n    if (.NaN &amp;&amp; (isNaN(a) || isNaN(b)))    d = NaN;\n    else if (isNaN(a))                     d = b;\n    else if (isNaN(b))                     d = a;\n    else                                   d = (a &gt; b) ? a : b;\n    if (.xorsign &amp;&amp; !isNaN(d)) {\n         setSignBit(d, xorsign);\n    }\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    for (i = 0; i &lt; 2; i++) {\n        if (.xorsign) {\n            xorsign = getSignBit(fA[i]) ^ getSignBit(fB[i]);\n            if (.abs) {\n                fA[i] = |fA[i]|;\n                fB[i] = |fB[i]|;\n            }\n        }\n        if (isNaN(fA[i]) &amp;&amp; isNaN(fB[i]))              d[i] = NaN;\n        if (.NaN &amp;&amp; (isNaN(fA[i]) || isNaN(fB[i])))    d[i] = NaN;\n        else if (isNaN(fA[i]))                         d[i] = fB[i];\n        else if (isNaN(fB[i]))                         d[i] = fA[i];\n        else                                           d[i] = (fA[i] &gt; fB[i]) ? 
fA[i] : fB[i];\n        if (.xorsign &amp;&amp; !isNaN(fA[i])) {\n            setSignBit(d[i], xorsign);\n        }\n    }\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<dl class=\"simple\">\n<dt>Subnormal numbers:</dt><dd><p>By default, subnormal numbers are supported.\n<code class=\"docutils literal notranslate\"><span class=\"pre\">max.ftz.{f16,</span> <span class=\"pre\">f16x2}</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>If values of both inputs are 0.0, then +0.0 &gt; -0.0.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.xorsign.abs</span></code> introduced in PTX ISA version 7.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.xorsign.abs</span></code> support requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_86</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>max.ftz.f16       h0,h1,h2;\nmax.f16x2         b0,b1,b2;\n// SIMD fp16 max with NaN\nmax.NaN.f16x2     b0,b1,b2;\n// scalar f16 max with xorsign.abs\nmax.xorsign.abs.f16 Rd, Ra, Rb;\nmax.bf16          h0, h1, h2;\n// scalar bf16 max and NaN\nmax.NaN.bf16x2    b0, b1, b2;\n// SIMD bf16 max with xorsign.abs\nmax.xorsign.abs.bf16x2 Rd, Ra, Rb;\n</pre></div>\n</div>\n</section>\n<h1>Integer Arithmetic Instructions: max</h1><section id=\"integer-arithmetic-instructions-max\">\n\n\n<p>Find the maximum of two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>max.atype         d, a, b;\nmax{.relu}.btype  d, a, b;\n\n.atype = { .u16, .u32, .u64,\n          
 .u16x2, .s16, .s64 };\n.btype = { .s16x2, .s32 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Store the maximum of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code> instruction types, forms input vectors by half word values from source\noperands. Half-word operands are then processed in parallel to produce <code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code> result\nin destination.</p>\n<p>Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have the same type as the instruction type. 
For instruction types\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code>, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (type == u16x2 || type == s16x2) {\n    iA[0] = a[0:15];\n    iA[1] = a[16:31];\n    iB[0] = b[0:15];\n    iB[1] = b[16:31];\n    for (i = 0; i &lt; 2; i++) {\n         d[i] = (iA[i] &gt; iB[i]) ? iA[i] : iB[i];\n    }\n} else {\n    d = (a &gt; b) ? a : b; // Integer (signed and unsigned)\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Signed and unsigned differ.</p>\n<dl class=\"simple\">\n<dt>Saturation modifier:</dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.relu.{s16x2,</span> <span class=\"pre\">s32}</span></code> clamps the result to 0 if negative.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">max{.relu}.s16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">max.relu.s32</span></code> introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">max.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">max{.relu}.s16x2</span></code> and <code 
class=\"docutils literal notranslate\"><span class=\"pre\">max.relu.s32</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>max.u32  d,a,b;\nmax.s32  q,q,0;\nmax.relu.s16x2 t,t,u;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: max\n\n\n\nFind the maximum of two values.\n\nSyntax\n\nmax{.ftz}{.NaN}{.xorsign.abs}.f32  d, a, b;\n\nmax.f64                            d, a, b;\n\nDescription\n\nStore the maximum of a and b in d.\n\nIf .NaN modifier is specified, the result is canonical NaN if either of the inputs is\n\nNaN.\n\nIf .abs modifier is specified, the magnitude of destination operand d is the maximum of\n\nabsolute values of both the input arguments.\n\nIf...\n\n=====Half Precision Floating Point Instructions: max\n\n\n\nFind the maximum of two values.\n\nSyntax\n\nmax{.ftz}{.NaN}{.xorsign.abs}.f16      d, a, b;\n\nmax{.ftz}{.NaN}{.xorsign.abs}.f16x2    d, a, b;\n\nmax{.NaN}{.xorsign.abs}.bf16           d, a, b;\n\nmax{.NaN}{.xorsign.abs}.bf16x2         d, a, b;\n\nDescription\n\nStore the maximum of a and b in d.\n\nFor .f16x2 and .bf16x2 instruction types, input vectors are formed with half-word values\n\nfrom source operands. Half-word o...\n\n=====Integer Arithmetic Instructions: max\n\n\n\nFind the maximum of two values.\n\nSyntax\n\nmax.atype         d, a, b;\n\nmax{.relu}.btype  d, a, b;\n\n.atype = { .u16, .u32, .u64,\n\n           .u16x2, .s16, .s64 };\n\n.btype = { .s16x2, .s32 };\n\nDescription\n\nStore the maximum of a and b in d.\n\nFor .u16x2, .s16x2 instruction types, forms input vectors by half word values from source\n\noperands. Half-word operands are then processed in parallel to produce .u16x2, .s... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-max"
            };

        // Documentation entry for the `.maxclusterrank` cluster-dimension directive.
        //   html    - rendered documentation, prefixed with a "For more information" link
        //   tooltip - plain-text rendering of the same section, ending in a " ..." marker
        //   url     - anchor of the section in the online PTX documentation
        // NOTE: the example line uses a single leading dot (`.maxclusterrank 8`) so that it
        // agrees with the Syntax section (`.maxclusterrank n`); the extracted text had `..`.
        case "maxclusterrank":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-maxclusterrank\" target=\"_blank\" rel=\"noopener noreferrer\">maxclusterrank <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Cluster Dimension Directives: .maxclusterrank</h1><section id=\"cluster-dimension-directives-maxclusterrank\">\n\n\n<p>Declare the maximum number of CTAs that can be part of the cluster.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.maxclusterrank n\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declare the maximum number of thread blocks (CTAs) allowed to be part of the cluster.</p>\n<p><strong>Semantics</strong></p>\n<p>Product of the number of CTAs in each cluster dimension specified in any invocation of the kernel is\nrequired to be less or equal to that specified in this directive. Otherwise invocation will result\nin a runtime error or kernel launch failure.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.maxclusterrank</span></code> directive cannot be used in conjunction with the <code class=\"docutils literal notranslate\"><span class=\"pre\">.reqnctapercluster</span></code> directive.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.entry foo .maxclusterrank 8         { . . . }\n</pre></div>\n</div>\n</section>",
                "tooltip": "Declare the maximum number of CTAs that can be part of the cluster.\n\nSyntax\n\n.maxclusterrank n\n\nDescription\n\nDeclare the maximum number of thread blocks (CTAs) allowed to be part of the cluster.\n\nSemantics\n\nProduct of the number of CTAs in each cluster dimension specified in any invocation of the kernel is\n\nrequired to be less or equal to that specified in this directive. Otherwise invocation will result\n\nin a runtime error or kernel launch failure.\n\nThe .maxclusterrank directive cannot be used in conjunction with the .reqnctapercluster directive.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.entry foo .maxclusterrank 8         { . . . }\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-maxclusterrank"
            };

        // Documentation entry for the `.maxnreg` performance-tuning directive.
        case "maxnreg": {
            // Anchor of the directive's section in the online PTX documentation.
            const url = "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-maxnreg";
            // Leading "For more information" link that precedes the rendered section.
            const moreInfoLink = "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-maxnreg\" target=\"_blank\" rel=\"noopener noreferrer\">maxnreg <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.";
            // Rendered HTML of the documentation section itself.
            const sectionHtml = "<h1>Performance-Tuning Directives: .maxnreg</h1><section id=\"performance-tuning-directives-maxnreg\">\n\n\n<p>Maximum number of registers that can be allocated per thread.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.maxnreg n\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declare the maximum number of registers per thread in a CTA.</p>\n<p><strong>Semantics</strong></p>\n<p>The compiler guarantees that this limit will not be exceeded. The actual number of registers used\nmay be less; for example, the backend may be able to compile to fewer registers, or the maximum\nnumber of registers may be further constrained by <code class=\"docutils literal notranslate\"><span class=\"pre\">.maxntid</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.maxctapersm</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.entry foo .maxnreg 16 { ... }  // max regs per thread = 16\n</pre></div>\n</div>\n</section>";
            // Plain-text rendering of the same section; ends in a " ..." marker.
            const tooltip = "Maximum number of registers that can be allocated per thread.\n\nSyntax\n\n.maxnreg n\n\nDescription\n\nDeclare the maximum number of registers per thread in a CTA.\n\nSemantics\n\nThe compiler guarantees that this limit will not be exceeded. The actual number of registers used\n\nmay be less; for example, the backend may be able to compile to fewer registers, or the maximum\n\nnumber of registers may be further constrained by .maxntid and .maxctapersm.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.entry foo .maxnreg 16 { ... }  // max regs per thread = 16\n\n ...";
            return {html: moreInfoLink + sectionHtml, tooltip, url};
        }

        // Documentation entry for the `.maxntid` performance-tuning directive.
        case "maxntid": {
            // Anchor of the directive's section in the online PTX documentation.
            const url = "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-maxntid";
            // Leading "For more information" link that precedes the rendered section.
            const moreInfoLink = "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-maxntid\" target=\"_blank\" rel=\"noopener noreferrer\">maxntid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.";
            // Rendered HTML of the documentation section itself.
            const sectionHtml = "<h1>Performance-Tuning Directives: .maxntid</h1><section id=\"performance-tuning-directives-maxntid\">\n\n\n<p>Maximum number of threads in the thread block (CTA).</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.maxntid nx\n.maxntid nx, ny\n.maxntid nx, ny, nz\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declare the maximum number of threads in the thread block (CTA). This maximum is specified by giving\nthe maximum extent of each dimension of the 1D, 2D, or 3D CTA.\u00a0 The maximum number of threads is the\nproduct of the maximum extent in each dimension.</p>\n<p><strong>Semantics</strong></p>\n<p>The maximum number of threads in the thread block, computed as the product of the maximum extent\nspecified for each dimension, is guaranteed not to be exceeded in any invocation of the kernel in\nwhich this directive appears. Exceeding the maximum number of threads results in a runtime error or\nkernel launch failure.</p>\n<p>Note that this directive guarantees that the <em>total</em> number of threads does not exceed the maximum,\nbut does not guarantee that the limit in any particular dimension is not exceeded.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.entry foo .maxntid 256       { ... }  // max threads = 256\n.entry bar .maxntid 16,16,4   { ... }  // max threads = 1024\n</pre></div>\n</div>\n</section>";
            // Plain-text rendering of the same section; ends in a " ..." marker.
            const tooltip = "Maximum number of threads in the thread block (CTA).\n\nSyntax\n\n.maxntid nx\n\n.maxntid nx, ny\n\n.maxntid nx, ny, nz\n\nDescription\n\nDeclare the maximum number of threads in the thread block (CTA). This maximum is specified by giving\n\nthe maximum extent of each dimension of the 1D, 2D, or 3D CTA.\u00a0 The maximum number of threads is the\n\nproduct of the maximum extent in each dimension.\n\nSemantics\n\nThe maximum number of threads in the thread block, computed as the product of the maximum extent\n\nspecified for each dimension, is guaranteed not to be exceeded in any invocation of the kernel in\n\nwhich this directive appears. Exceeding the maximum number of threads results in a runtime error or\n\nkernel launch failure.\n\nNote that this directive guarantees that the total number of threads does not exceed the maximum,\n\nbut does not guarantee that the limit in any particular dimension is not exceeded.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.entry foo .maxntid 256       { ... }  // max threads = 256\n\n.entry bar .maxntid 16,16,4   { ... }  // max threads = 1024\n\n ...";
            return {html: moreInfoLink + sectionHtml, tooltip, url};
        }

        case "mbarrier":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier.arrive <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier.arrive_drop <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier.complete_tx <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier.expect_tx <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier.init <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new 
window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier.inval <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-pending-count\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier.pending_count <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait\" target=\"_blank\" rel=\"noopener noreferrer\">mbarrier.test_wait/mbarrier.try_wait <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: mbarrier</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier\">\n\n\n<ul class=\"simple\">\n<li><p>Synchronizing any subset of threads within a CTA</p></li>\n<li><p>One-way synchronization of threads across CTAs of a cluster. 
As noted in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-smem\">mbarrier support with\nshared memory</a>, threads can\nperform only <em>arrive</em> operations but not <em>*_wait</em> on an mbarrier located in <code class=\"docutils literal notranslate\"><span class=\"pre\">shared::cluster</span></code>\nspace.</p></li>\n<li><p>Waiting for completion of asynchronous memory operations initiated by a thread and making them\nvisible to other threads.</p></li>\n</ul>\n<p>An <em>mbarrier object</em> is an opaque object in memory which can be initialized and invalidated using :</p>\n<ul class=\"simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.init</span></code></p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.inval</span></code></p></li>\n</ul>\n<p>Operations supported on <em>mbarrier object</em>s are :</p>\n<ul class=\"simple\">\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.expect_tx</span></code></p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.complete_tx</span></code></p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code></p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code></p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code></p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code></p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.pending_count</span></code></p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.mbarrier.arrive</span></code></p></li>\n</ul>\n<p>Performing any <em>mbarrier</em> operation except <code class=\"docutils 
literal notranslate\"><span class=\"pre\">mbarrier.init</span></code> on an uninitialized <em>mbarrier object</em>\nresults in undefined behavior.</p>\n<p>Unlike <code class=\"docutils literal notranslate\"><span class=\"pre\">bar{.cta}</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">barrier{.cta}</span></code> instructions which can access a limited number of barriers\nper CTA, <em>mbarrier objects</em> are used defined and are only limited by the total shared memory size\navailable.</p>\n<p><em>mbarrier</em> operations enable threads to perform useful work after the arrival at the <em>mbarrier</em> and\nbefore waiting for the <em>mbarrier</em> to complete.</p>\n<section id=\"size-and-alignment-of-mbarrier-object\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\"></span><h5><span class=\"section-number\">9.7.12.15.1. </span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size and alignment of mbarrier object</a><a class=\"headerlink\" href=\"#size-and-alignment-of-mbarrier-object\" title=\"Permalink to this headline\">\uf0c1</a></h5>\n<p>An mbarrier object is an opaque object with the following type and alignment requirements :</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 22%\"/>\n<col style=\"width: 46%\"/>\n<col style=\"width: 32%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Type</p></th>\n<th class=\"head\"><p>Alignment (bytes)</p></th>\n<th class=\"head\"><p>Memory space</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code></p></td>\n<td><p>8</p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code></p></td>\n</tr>\n</tbody>\n</table>\n</section>\n<section 
id=\"contents-of-the-mbarrier-object\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-contents\"></span><h5><span class=\"section-number\">9.7.12.15.2. </span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier object</a><a class=\"headerlink\" href=\"#contents-of-the-mbarrier-object\" title=\"Permalink to this headline\">\uf0c1</a></h5>\n<p>An opaque <em>mbarrier object</em> keeps track of the following information :</p>\n<ul class=\"simple\">\n<li><p>Current phase of the <em>mbarrier object</em></p></li>\n<li><p>Count of pending arrivals for the current phase of the <em>mbarrier object</em></p></li>\n<li><p>Count of expected arrivals for the next phase of the <em>mbarrier object</em></p></li>\n<li><p>Count of pending asynchronous memory operations (or transactions) tracked by the current phase of\nthe <em>mbarrier object</em>. This is also referred to as <em>tx-count</em>.</p></li>\n</ul>\n<p>An <em>mbarrier object</em> progresses through a sequence of phases where each phase is defined by threads\nperforming an expected number of <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a>\noperations.</p>\n<p>The valid range of each of the counts is as shown below:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 39%\"/>\n<col style=\"width: 33%\"/>\n<col style=\"width: 28%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Count name</p></th>\n<th class=\"head\"><p>Minimum value</p></th>\n<th class=\"head\"><p>Maximum value</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>Expected arrival count</p></td>\n<td><p>1</p></td>\n<td><p>2<sup>20</sup> - 1</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>Pending arrival count</p></td>\n<td><p>0</p></td>\n<td><p>2<sup>20</sup> - 1</p></td>\n</tr>\n<tr 
class=\"row-even\"><td><p>tx-count</p></td>\n<td><p>-(2<sup>20</sup> - 1)</p></td>\n<td><p>2<sup>20</sup> - 1</p></td>\n</tr>\n</tbody>\n</table>\n</section>\n<section id=\"lifecycle-of-the-mbarrier-object\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-lifecycle\"></span><h5><span class=\"section-number\">9.7.12.15.3. </span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-lifecycle\">Lifecycle of the mbarrier object</a><a class=\"headerlink\" href=\"#lifecycle-of-the-mbarrier-object\" title=\"Permalink to this headline\">\uf0c1</a></h5>\n<p>The <em>mbarrier object</em> must be initialized prior to use.</p>\n<p>An <em>mbarrier object</em> is used to synchronize threads and asynchronous memory operations.</p>\n<p>An <em>mbarrier object</em> may be used to perform a sequence of such synchronizations.</p>\n<p>An <em>mbarrier object</em> must be invalidated to repurpose its memory.</p>\n</section>\n<section id=\"phase-of-the-mbarrier-object\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-phase\"></span><h5><span class=\"section-number\">9.7.12.15.4. </span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-phase\">Phase of the mbarrier object</a><a class=\"headerlink\" href=\"#phase-of-the-mbarrier-object\" title=\"Permalink to this headline\">\uf0c1</a></h5>\n<p>The phase of an <em>mbarrier object</em> is the number of times the <em>mbarrier object</em> has been used to\nsynchronize threads and <a class=\"reference external\" href=\"#data-movement-and-conversion-instructions-cp-async\">cp.async</a>\noperations. 
In each phase {0, 1, 2, \u2026}, threads perform in program order :</p>\n<ul class=\"simple\">\n<li><p><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a>\noperations to complete the current phase and</p></li>\n<li><p><em>test_wait</em> / <em>try_wait</em> operations to check for the completion of the current phase.</p></li>\n</ul>\n<p>An <em>mbarrier object</em> is automatically reinitialized upon completion of the current phase for\nimmediate use in the next phase. The current phase is incomplete and all prior phases are complete.</p>\n<p>For each phase of the mbarrier object, at least one <em>test_wait</em> or <em>try_wait</em> operation must be\nperformed which returns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> for <code class=\"docutils literal notranslate\"><span class=\"pre\">waitComplete</span></code> before an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a> operation\nin the subsequent phase.</p>\n</section>\n<section id=\"tracking-asynchronous-operations-by-the-mbarrier-object\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-tracking-async-operations\"></span><h5><span class=\"section-number\">9.7.12.15.5. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-tracking-async-operations\">Tracking asynchronous operations by the mbarrier object</a><a class=\"headerlink\" href=\"#tracking-asynchronous-operations-by-the-mbarrier-object\" title=\"Permalink to this headline\">\uf0c1</a></h5>\n<p>Starting with the Hopper architecture (<code class=\"docutils literal notranslate\"><span class=\"pre\">sm_9x</span></code>), <em>mbarrier object</em> supports a new count, called\n<em>tx-count</em>, which is used for tracking the completion of asynchronous memory operations or\ntransactions. <em>tx-count</em> tracks the number of asynchronous transactions, in units specified by the\nasynchronous memory operation, that are outstanding and yet to be complete.</p>\n<p>The <em>tx-count</em> of an <em>mbarrier object</em> must be set to the total amount of asynchronous memory\noperations, in units as specified by the asynchronous operations, to be tracked by the current\nphase. Upon completion of each of the asynchronous operations, the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation will be performed on the <em>mbarrier object</em> and thus progress the mbarrier towards the\ncompletion of the current phase.</p>\n<section id=\"expect-tx-operation\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\"></span><h6><span class=\"section-number\">9.7.12.15.5.1. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx operation</a><a class=\"headerlink\" href=\"#expect-tx-operation\" title=\"Permalink to this headline\">\uf0c1</a></h6>\n<p>The <em>expect-tx</em> operation, with an <code class=\"docutils literal notranslate\"><span class=\"pre\">expectCount</span></code> argument, increases the <em>tx-count</em> of an\n<em>mbarrier object</em> by the value specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">expectCount</span></code>. This makes the current phase of the\n<em>mbarrier object</em> to expect and track the completion of additional asynchronous transactions.</p>\n</section>\n<section id=\"complete-tx-operation\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\"></span><h6><span class=\"section-number\">9.7.12.15.5.2. </span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx operation</a><a class=\"headerlink\" href=\"#complete-tx-operation\" title=\"Permalink to this headline\">\uf0c1</a></h6>\n<p>The <em>complete-tx</em> operation, with an <code class=\"docutils literal notranslate\"><span class=\"pre\">completeCount</span></code> argument, on an <em>mbarrier object</em> consists of the following:</p>\n<dl class=\"simple\">\n<dt>mbarrier signaling</dt><dd><p>Signals the completion of asynchronous transactions that were tracked by the current phase. As a\nresult of this, <em>tx-count</em> is decremented by <code class=\"docutils literal notranslate\"><span class=\"pre\">completeCount</span></code>.</p>\n</dd>\n<dt>mbarrier potentially completing the current phase</dt><dd><p>If the current phase has been completed then the mbarrier transitions to the next phase. 
Refer to\n<a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-phase-completion\">Phase Completion of the mbarrier object</a>\nfor details on phase completion requirements and phase transition process.</p>\n</dd>\n</dl>\n</section>\n</section>\n<section id=\"phase-completion-of-the-mbarrier-object\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-phase-completion\"></span><h5><span class=\"section-number\">9.7.12.15.6. </span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-phase-completion\">Phase Completion of the mbarrier object</a><a class=\"headerlink\" href=\"#phase-completion-of-the-mbarrier-object\" title=\"Permalink to this headline\">\uf0c1</a></h5>\n<p>The requirements for completion of the current phase are described below. Upon completion of the\ncurrent phase, the phase transitions to the subsequent phase as described below.</p>\n<dl class=\"simple\">\n<dt>Current phase completion requirements</dt><dd><p>An <em>mbarrier object</em> completes the current phase when all of the following conditions are met:</p>\n<ul class=\"simple\">\n<li><p>The count of the pending arrivals has reached zero.</p></li>\n<li><p>The <em>tx-count</em> has reached zero.</p></li>\n</ul>\n</dd>\n<dt>Phase transition</dt><dd><p>When an <em>mbarrier</em> object completes the current phase, the following actions are performed\natomically:</p>\n<ul class=\"simple\">\n<li><p>The <em>mbarrier object</em> transitions to the next phase.</p></li>\n<li><p>The pending arrival count is reinitialized to the expected arrival count.</p></li>\n</ul>\n</dd>\n</dl>\n</section>\n<section id=\"arrive-on-operation-on-mbarrier-object\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\"></span><h5><span class=\"section-number\">9.7.12.15.7. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">Arrive-on operation on mbarrier object</a><a class=\"headerlink\" href=\"#arrive-on-operation-on-mbarrier-object\" title=\"Permalink to this headline\">\uf0c1</a></h5>\n<p>An <em>arrive-on</em> operation, with an optional <em>count</em> argument, on an <em>mbarrier object</em> consists of the\nfollowing 2 steps :</p>\n<ul>\n<li><p>mbarrier signalling:</p>\n<p>Signals the arrival of the executing thread OR completion of the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async</span></code> instruction which\nsignals the arrive-on operation initiated by the executing thread on the <em>mbarrier object</em>. As a\nresult of this, the pending arrival count is decremented by <em>count</em>. If the <em>count</em> argument is\nnot specified, then it defaults to 1.</p>\n</li>\n<li><p>mbarrier potentially completing the current phase:</p>\n<p>If the current phase has been completed then the mbarrier transitions to the next phase. Refer to\n<a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-phase-completion\">Phase Completion of the mbarrier object</a>\nfor details on phase completion requirements and phase transition process.</p>\n</li>\n</ul>\n</section>\n<section id=\"mbarrier-support-with-shared-memory\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-smem\"></span><h5><span class=\"section-number\">9.7.12.15.8. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-smem\">mbarrier support with shared memory</a><a class=\"headerlink\" href=\"#mbarrier-support-with-shared-memory\" title=\"Permalink to this headline\">\uf0c1</a></h5>\n<p>The following table summarizes the support of various mbarrier operations on <em>mbarrier objects</em>\nlocated at different shared memory locations:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 35%\"/>\n<col style=\"width: 23%\"/>\n<col style=\"width: 42%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>mbarrier operations</p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code></p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code></p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code></p></td>\n<td><p>Supported</p></td>\n<td><p>Supported, cannot return result</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.expect_tx</span></code></p></td>\n<td><p>Supported</p></td>\n<td><p>Supported</p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.complete_tx</span></code></p></td>\n<td><p>Supported</p></td>\n<td><p>Supported</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>Other mbarrier operations</p></td>\n<td><p>Supported</p></td>\n<td><p>Not supported</p></td>\n</tr>\n</tbody>\n</table>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-mbarrier-init\">\n<h5><span class=\"section-number\">9.7.12.15.9. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-init\">Parallel Synchronization and Communication Instructions: mbarrier.init</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-init\" title=\"Permalink to this headline\">\uf0c1</a></h5>\n<p><strong>mbarrier.init</strong></p>\n<p>Initialize the <em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.init{.shared{::cta}}.b64 [addr], count;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.init</span></code> initializes the <em>mbarrier object</em> at the location specified by the address operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> with the unsigned 32-bit integer <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code>. The value of operand count must be in the range\nas specified in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier object</a>.</p>\n<p>Initialization of the <em>mbarrier object</em> involves :</p>\n<ul class=\"simple\">\n<li><p>Initializing the current phase to 0.</p></li>\n<li><p>Initializing the expected arrival count to <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code>.</p></li>\n<li><p>Initializing the pending arrival count to <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code>.</p></li>\n<li><p>Initializing the <em>tx-count</em> to 0.</p></li>\n</ul>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. 
If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.shared .b64 shMem, shMem2;\n.reg    .b64 addr;\n.reg    .b32 %r1;\n\ncvta.shared.u64          addr, shMem2;\nmbarrier.init.b64        [addr],   %r1;\nbar.cta.sync             0;\n// ... other mbarrier operations on addr\n\nmbarrier.init.shared::cta.b64 [shMem], 12;\nbar.sync                 0;\n// ... other mbarrier operations on shMem\n</pre></div>\n</div>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-mbarrier-inval\">\n<h5><span class=\"section-number\">9.7.12.15.10. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-inval\">Parallel Synchronization and Communication Instructions: mbarrier.inval</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-inval\" title=\"Permalink to this headline\">\uf0c1</a></h5>\n<p><strong>mbarrier.inval</strong></p>\n<p>Invalidates the <em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.inval{.shared{::cta}}.b64 [addr];\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.inval</span></code> invalidates the <em>mbarrier object</em> at the location specified by the address\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>.</p>\n<p>An <em>mbarrier object</em> must be invalidated before using its memory location for any other purpose.</p>\n<p>Performing any <em>mbarrier</em> operation except <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.init</span></code> on an invalidated mbarrier object\nresults in undefined behaviour.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.shared .b64 shmem;\n.reg    .b64 addr;\n.reg    .b32 %r1;\n.reg    .pred t0;\n\n// Example 1 :\nbar.sync                      0;\n@t0 mbarrier.init.b64     [addr], %r1;\n// ... other mbarrier operations on addr\nbar.sync                      0;\n@t0 mbarrier.inval.b64    [addr];\n\n\n// Example 2 :\nbar.cta.sync                  0;\nmbarrier.init.shared.b64           [shmem], 12;\n// ... other mbarrier operations on shmem\nbar.cta.sync                  0;\n@t0 mbarrier.inval.shared.b64      [shmem];\n\n// shmem can be reused here for unrelated use :\nbar.cta.sync                  0;\nst.shared.b64                      [shmem], ...;\n\n// shmem can be re-initialized as mbarrier object :\nbar.cta.sync                  0;\n@t0 mbarrier.init.shared.b64       [shmem], 24;\n// ... other mbarrier operations on shmem\nbar.cta.sync                  0;\n@t0 mbarrier.inval.shared::cta.b64 [shmem];\n</pre></div>\n</div>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-mbarrier-expect-tx\">\n<h5><span class=\"section-number\">9.7.12.15.11. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx\">Parallel Synchronization and Communication Instructions: mbarrier.expect_tx</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx\" title=\"Permalink to this headline\">\uf0c1</a></h5>\n<p><strong>mbarrier.expect_tx</strong></p>\n<p>Perfoms <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx</a> operation on the <em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.expect_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem   = { .relaxed }\n.scope = { .cta, .cluster }\n.space = { .shared{::cta}, .shared::cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A thread executing <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.expect_tx</span></code> performs an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx</a>\noperation on the <em>mbarrier object</em> at the location specified by the address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>. The\n32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">txCount</span></code> specifies the <code class=\"docutils literal notranslate\"><span class=\"pre\">expectCount</span></code> argument to the\n<em>expect-tx</em> operation.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. 
If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> are as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>This operation does not provide any memory ordering semantics and thus is a <em>relaxed</em> operation.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.expect_tx.b64                       [addr], 32;\nmbarrier.expect_tx.relaxed.cta.shared.b64    [mbarObj1], 512;\nmbarrier.expect_tx.relaxed.cta.shared.b64    [mbarObj2], 512;\n</pre></div>\n</div>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-mbarrier-complete-tx\">\n<h5><span class=\"section-number\">9.7.12.15.12. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx\">Parallel Synchronization and Communication Instructions: mbarrier.complete_tx</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx\" title=\"Permalink to this headline\">\uf0c1</a></h5>\n<p><strong>mbarrier.complete_tx</strong></p>\n<p>Perfoms <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation on the <em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.complete_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem   = { .relaxed }\n.scope = { .cta, .cluster }\n.space = { .shared{::cta}, .shared::cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A thread executing <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.complete_tx</span></code> performs a <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation on the <em>mbarrier object</em> at the location specified by the address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>. 
The\n32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">txCount</span></code> specifies the <code class=\"docutils literal notranslate\"><span class=\"pre\">completeCount</span></code> argument to the\n<em>complete-tx</em> operation.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.complete_tx</span></code> does not involve any asynchronous memory operations and only simulates the\ncompletion of an asynchronous memory operation and its side effect of signaling to the <em>mbarrier\nobject</em>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> are as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>This operation does not provide any memory ordering semantics and thus is a <em>relaxed</em> operation.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.complete_tx.b64             [addr],     32;\nmbarrier.complete_tx.shared.b64      [mbarObj1], 512;\nmbarrier.complete_tx.relaxed.cta.b64 [addr2],    32;\n</pre></div>\n</div>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-mbarrier-arrive\">\n<h5><span class=\"section-number\">9.7.12.15.13. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive\">Parallel Synchronization and Communication Instructions: mbarrier.arrive</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive\" title=\"Permalink to this headline\">\uf0c1</a></h5>\n<p><strong>mbarrier.arrive</strong></p>\n<p>Performs <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> on the\n<em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.arrive{.sem}{.scope}{.shared{::cta}}.b64           state, [addr]{, count};\nmbarrier.arrive{.sem}{.scope}{.shared::cluster}.b64         _, [addr] {,count}\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], txCount;\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared::cluster}.b64   _, [addr], txCount;\nmbarrier.arrive.noComplete{.sem}{.cta}{.shared{::cta}}.b64  state, [addr], count;\n\n.sem   = { .release }\n.scope = { .cta, .cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A thread executing <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> performs an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a> operation\non the <em>mbarrier object</em> at the location specified by the address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>. 
The 32-bit\nunsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> specifies the <em>count</em> argument to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a>\noperation.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>The optional qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> specifies that an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx</a>\noperation is performed prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a>\noperation. The 32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">txCount</span></code> specifies the <em>expectCount</em> argument to\nthe <em>expect-tx</em> operation. 
When both qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.arrive</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> are specified, then\nthe count argument of the <em>arrive-on</em> operation is assumed to be 1.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> operation with <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> qualifier must not cause the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier</span></code> to\ncomplete its current phase, otherwise the behavior is undefined.</p>\n<p>The value of the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> must be in the range as specified in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier\nobject</a>.</p>\n<p>Note: for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_8x</span></code>, when the argument <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> is specified, the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> is\nrequired.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> operation on an <em>mbarrier object</em> located in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> returns an opaque\n64-bit register capturing the phase of the <em>mbarrier object</em> prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> in the\ndestination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">state.</span></code> Contents of the <code class=\"docutils literal 
notranslate\"><span class=\"pre\">state</span></code> operand are implementation\nspecific. Optionally, sink symbol <code class=\"docutils literal notranslate\"><span class=\"pre\">'_'</span></code> can be used for the <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> argument.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> operation on an <em>mbarrier object</em> located in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> but not in\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> cannot return a value. Sink symbol \u2018_\u2019 is mandatory for the destination operand for\nsuch cases.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier specifies a memory synchronizing effect as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory\nConsistency Model</a>. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier is absent,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> is assumed by default.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier indicates the set of threads that directly observe the memory\nsynchronizing effect of this operation, as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not specified then it\ndefaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code>. 
In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::&lt;scope&gt;</span></code> indicates the state space where the\nmbarrier resides.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sink symbol \u2018_\u2019 as the destination operand is introduced in PTX ISA version 7.1.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> argument without the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> introduced in PTX ISA version\n7.8.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> introduced in PTX ISA version 8.0.</p>\n<p>Support for qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> is introduced in PTX ISA version 8.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifiers introduced in PTX ISA version 8.0</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> argument without the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Qualifier <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.expect_tx</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 cnt, remoteAddr32, remoteCTAId, addr32;\n.reg .b64 %r&lt;3&gt;, addr, remoteAddr64;\n.shared .b64 shMem, shMem2;\n\ncvta.shared.u64            addr, shMem2;\nmov.b32                    addr32, shMem2;\nmapa.shared::cluster.u32   remoteAddr32, addr32, remoteCTAId;\nmapa.u64                   remoteAddr64, addr,   remoteCTAId;\n\ncvta.shared.u64          addr, shMem2;\n\nmbarrier.arrive.shared.b64                       %r0, [shMem];\nmbarrier.arrive.shared::cta.b64                  %r0, [shMem2];\nmbarrier.arrive.release.cta.shared::cluster.b64  _, [remoteAddr32];\nmbarrier.arrive.release.cluster.b64              _, [remoteAddr64], cnt;\nmbarrier.arrive.expect_tx.release.cluster.b64    _, [remoteAddr64], tx_count;\nmbarrier.arrive.noComplete.b64                   %r1, [addr], 2;\nmbarrier.arrive.b64                              %r2, [addr], cnt;\n</pre></div>\n</div>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop\">\n<h5><span class=\"section-number\">9.7.12.15.14. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop\">Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop\" title=\"Permalink to this headline\">\uf0c1</a></h5>\n<p><strong>mbarrier.arrive_drop</strong></p>\n<p>Decrements the expected count of the <em>mbarrier object</em> and performs <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.arrive_drop{.sem}{.scope}{.shared{::cta}}.b64 state,           [addr]{, count};\nmbarrier.arrive_drop{.sem}{.scope}{.shared::cluster}.b64           _,   [addr] {,count};\nmbarrier.arrive_drop.expect_tx{.shared{::cta}}{.sem}{.scope}.b64 state, [addr], tx_count;\nmbarrier.arrive_drop.expect_tx{.shared::cluster}{.sem}{.scope}.b64   _, [addr], tx_count;\nmbarrier.arrive_drop.noComplete{.sem}{.cta}{.shared{::cta}}.b64 state,  [addr], count;\n\n.sem   = { .release }\n.scope = { .cta, .cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A thread executing <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> on the <em>mbarrier object</em> at the location specified by\nthe address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> performs the following steps:</p>\n<ul class=\"simple\">\n<li><p>Decrements the expected arrival count of the <em>mbarrier object</em> by the value specified by the\n32-bit integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code>. 
If <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> operand is not specified, it defaults to 1.</p></li>\n<li><p>Performs an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> on the\n<em>mbarrier object</em>. The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> specifies the <em>count</em> argument to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on\noperation</a>.</p></li>\n</ul>\n<p>The decrement done in the expected arrivals count of the <em>mbarrier object</em> will be for all the\nsubsequent phases of the <em>mbarrier object</em>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>The optional qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> specifies that an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx</a>\noperation is performed prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a>\noperation. The 32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">txCount</span></code> specifies the <em>expectCount</em> argument to\nthe <em>expect-tx</em> operation. When both qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.arrive</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> are specified, then\nthe count argument of the <em>arrive-on</em> operation is assumed to be 1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> operation forms the <em>release</em> pattern as described in the Memory\nConsistency Model and synchronizes with the <em>acquire</em> patterns.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier indicates the set of threads that an <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code>\ninstruction can directly synchronize. 
If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not specified then it defaults\nto <code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code>. In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::&lt;scope&gt;</span></code> indicates the state space where the mbarrier\nresides.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> with <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> qualifier must not complete the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier,</span></code>\notherwise the behavior is undefined.</p>\n<p>The value of the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> must be in the range as specified in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier\nobject</a>.</p>\n<p>Note: for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_8x</span></code>, when the argument <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> is specified, the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> is\nrequired.</p>\n<p>A thread that wants to either exit or opt out of participating in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> can use\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> to drop itself from the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> operation on an <em>mbarrier 
object</em> located in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> returns an\nopaque 64-bit register capturing the phase of the <em>mbarrier object</em> prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on\noperation</a>\nin the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code>. Contents of the returned state are implementation\nspecific. Optionally, sink symbol <code class=\"docutils literal notranslate\"><span class=\"pre\">'_'</span></code> can be used for the <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> argument.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> operation on an <em>mbarrier</em> object located in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> but not\nin <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> cannot return a value. 
Sink symbol \u2018_\u2019 is mandatory for the destination operand\nfor such cases.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> argument without the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> introduced in PTX ISA version\n7.8.</p>\n<p>Support for qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> is introduced in PTX ISA version 8.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> introduced in PTX ISA version 8.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifiers introduced in PTX ISA version 8.0</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> argument without the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span 
class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 cnt;\n.reg .b64 %r1;\n.shared .b64 shMem;\n\n// Example 1\n@p mbarrier.arrive_drop.shared.b64 _, [shMem];\n@p exit;\n@p2 mbarrier.arrive_drop.noComplete.shared.b64 _, [shMem], %a;\n@p2 exit;\n..\n@!p mbarrier.arrive.shared.b64   %r1, [shMem];\n@!p mbarrier.test_wait.shared.b64  q, [shMem], %r1;\n\n// Example 2\nmbarrier.arrive_drop.shared::cluster.b64 _, [addr];\nmbarrier.arrive_drop.shared::cta.release.cluster.b64     _, [addr], cnt;\n\n// Example 3\nmbarrier.arrive_drop.expect_tx.shared::cta.release.cta.b64 state, [addr], tx_count;\n</pre></div>\n</div>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive\">\n<h5><span class=\"section-number\">9.7.12.15.15. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive\">Parallel Synchronization and Communication Instructions: cp.async.mbarrier.arrive</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive\" title=\"Permalink to this headline\">\uf0c1</a></h5>\n<p><strong>cp.async.mbarrier.arrive</strong></p>\n<p>Makes the <em>mbarrier object</em> track all prior <a class=\"reference external\" href=\"#data-movement-and-conversion-instructions-cp-async\">cp.async</a> operations initiated by the\nexecuting thread.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>cp.async.mbarrier.arrive{.noinc}{.shared{::cta}}.b64 [addr];\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Causes an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> to be\ntriggered by the system on the <em>mbarrier object</em> upon the completion of all prior <a class=\"reference external\" href=\"#data-movement-and-conversion-instructions-cp-async\">cp.async</a> operations initiated by the\nexecuting thread. The <em>mbarrier object</em> is at the location specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>. 
The\n<a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> is\nasynchronous to execution of <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.mbarrier.arrive</span></code>.</p>\n<p>When <code class=\"docutils literal notranslate\"><span class=\"pre\">.noinc</span></code> modifier is not specified, the pending count of the mbarrier object is incremented\nby 1 prior to the asynchronous <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a>. This\nresults in a zero-net change for the pending count from the asynchronous <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a> operation\nduring the current phase. The pending count of the <em>mbarrier object</em> after the increment should not\nexceed the limit as mentioned in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier object</a>. Otherwise,\nthe behavior is undefined.</p>\n<p>When the <code class=\"docutils literal notranslate\"><span class=\"pre\">.noinc</span></code> modifier is specified, the increment to the pending count of the <em>mbarrier\nobject</em> is not performed. Hence the decrement of the pending count done by the asynchronous\n<a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> must be\naccounted for in the initialization of the <em>mbarrier object</em>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. 
If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// Example 1: no .noinc\nmbarrier.init.shared.b64 [shMem], threadCount;\n....\ncp.async.ca.shared.global [shard1], [gbl1], 4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n....\n// Absence of .noinc accounts for arrive-on from completion of prior cp.async operations.\n// So mbarrier.init must only account for arrive-on from mbarrier.arrive.\ncp.async.mbarrier.arrive.shared.b64 [shMem];\n....\nmbarrier.arrive.shared.b64 state, [shMem];\n\nwaitLoop:\nmbarrier.test_wait.shared.b64 p, [shMem], state;\n@!p bra waitLoop;\n\n\n\n// Example 2: with .noinc\n\n// 
Tracks arrive-on from mbarrier.arrive and cp.async.mbarrier.arrive.\n\n// All threads participating in the mbarrier perform cp.async\nmov.b32 copyOperationCnt, threadCount;\n\n// 3 arrive-on operations will be triggered per-thread\nmul.lo.u32 copyArrivalCnt, copyOperationCnt, 3;\n\nadd.u32 totalCount, threadCount, copyArrivalCnt;\n\nmbarrier.init.shared.b64 [shMem], totalCount;\n....\ncp.async.ca.shared.global [shard1], [gbl1], 4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n...\n// Presence of .noinc requires mbarrier initialization to have accounted for arrive-on from cp.async\ncp.async.mbarrier.arrive.noinc.shared.b64 [shMem]; // 1st instance\n....\ncp.async.ca.shared.global [shard3], [gbl3], 4;\ncp.async.ca.shared.global [shard4], [gbl4], 16;\ncp.async.mbarrier.arrive.noinc.shared::cta.b64 [shMem]; // 2nd instance\n....\ncp.async.ca.shared.global [shard5], [gbl5], 4;\ncp.async.cg.shared.global [shard6], [gbl6], 16;\ncp.async.mbarrier.arrive.noinc.shared.b64 [shMem]; // 3rd and last instance\n....\nmbarrier.arrive.shared.b64 state, [shMem];\n\nwaitLoop:\nmbarrier.test_wait.shared.b64 p, [shMem], state;\n@!p bra waitLoop;\n</pre></div>\n</div>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait\"></span><h5><span class=\"section-number\">9.7.12.15.16. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait\">Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait\" title=\"Permalink to this headline\">\uf0c1</a></h5>\n<p><strong>mbarrier.test_wait/mbarrier.try_wait</strong></p>\n<p>Checks whether the <em>mbarrier object</em> has completed the phase.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.test_wait{.sem}{.scope}{.shared{::cta}}.b64        waitComplete, [addr], state;\nmbarrier.test_wait.parity{.sem}{.scope}{.shared{::cta}}.b64 waitComplete, [addr], phaseParity;\n\nmbarrier.try_wait{.sem}{.scope}{.shared{::cta}}.b64         waitComplete, [addr], state\n                                                               {, suspendTimeHint};\n\nmbarrier.try_wait{.sem}{.scope}.parity{.shared{::cta}}.b64  waitComplete, [addr], phaseParity\n                                                               {, suspendTimeHint};\n\n.sem   = { .acquire }\n.scope = { .cta, .cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <em>test_wait</em> and <em>try_wait</em> operations test for the completion of the current or the immediately\npreceding phase of an <em>mbarrier object</em> at the location specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> is a non-blocking instruction which tests for the completion of the phase.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> is a potentially blocking instruction which tests for the completion of the\nphase. 
If the phase is not complete, the executing thread may be suspended. Suspended thread resumes\nexecution when the specified phase completes OR before the phase completes following a\nsystem-dependent time limit. The optional 32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">suspendTimeHint</span></code>\nspecifies the time limit, in nanoseconds, that may be used for the time limit instead of the\nsystem-dependent limit.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> test for completion of the phase :</p>\n<ul class=\"simple\">\n<li><p>Specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code>, which was returned by an <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> instruction on\nthe same <em>mbarrier object</em> during the current or the immediately preceding phase. Or</p></li>\n<li><p>Indicated by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">phaseParity</span></code>, which is the integer parity of either the current phase\nor the immediately preceding phase of the <em>mbarrier object</em>.</p></li>\n</ul>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.parity</span></code> variant of the instructions test for the completion of the phase indicated by the\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">phaseParity</span></code>, which is the integer parity of either the current phase or the immediately\npreceding phase of the <em>mbarrier object</em>. An even phase has integer parity 0 and an odd phase has\ninteger parity of 1. 
So the valid values of <code class=\"docutils literal notranslate\"><span class=\"pre\">phaseParity</span></code> operand are 0 and 1.</p>\n<p>Note: the use of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.parity</span></code> variants of the instructions requires tracking the phase of an\n<em>mbarrier object</em> throughout its lifetime.</p>\n<p>The <em>test_wait</em> and <em>try_wait</em> operations are valid only for :</p>\n<ul class=\"simple\">\n<li><p>the current incomplete phase, for which <code class=\"docutils literal notranslate\"><span class=\"pre\">waitComplete</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>.</p></li>\n<li><p>the immediately preceding phase, for which <code class=\"docutils literal notranslate\"><span class=\"pre\">waitComplete</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code>.</p></li>\n</ul>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>When <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> operations return <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code>, they form the\n<em>acquire</em> pattern as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier indicates the set of threads that the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> instructions can directly synchronize. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not\nspecified then it defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code>. 
In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::&lt;scope&gt;</span></code> indicates the state\nspace where the mbarrier resides.</p>\n<p>The following ordering of memory operations hold for the executing thread when\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> :</p>\n<ol class=\"arabic simple\">\n<li><p>All memory accesses (except <a class=\"reference external\" href=\"#data-movement-and-conversion-instructions-cp-async\">async operations</a> ) requested prior, in program\norder, to <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> during the completed phase by the participating threads of the CTA\nare performed and are visible to the executing thread.</p></li>\n<li><p>All <a class=\"reference external\" href=\"#data-movement-and-conversion-instructions-cp-async\">cp.async</a> operations\nrequested prior, in program order, to <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.mbarrier.arrive</span></code> during the completed phase by\nthe participating threads of the CTA are performed and made visible to the executing thread.</p></li>\n<li><p>All <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk</span></code> asynchronous operations using the same <em>mbarrier object</em> requested prior,\nin program order, to <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> during the completed phase by the participating threads\nof the CTA are performed and made visible to the executing thread.</p></li>\n<li><p>All memory accesses requested after the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> or 
<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code>, in\nprogram order, are not performed and not visible to memory accesses performed prior to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code>, in program order, by other threads participating in the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier</span></code>.</p></li>\n<li><p>There is no ordering and visibility guarantee for memory accesses requested by the thread after\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> and prior to <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code>, in program order.</p></li>\n</ol>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> introduced in PTX ISA version 7.0.</p>\n<p>Modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.parity</span></code> is introduced in PTX ISA version 7.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifiers introduced in PTX ISA version 8.0</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code 
class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// Example 1a, thread synchronization with test_wait:\n\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nmbarrier.arrive.shared.b64  %r1, [shMem]; // N threads executing mbarrier.arrive\n\n// computation not requiring mbarrier synchronization...\n\nwaitLoop:\nmbarrier.test_wait.shared.b64    complete, [shMem], %r1;\n@!complete nanosleep.u32 20;\n@!complete bra waitLoop;\n\n// Example 1b, thread synchronization with try_wait :\n\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nmbarrier.arrive.shared.b64  %r1, [shMem]; // N threads executing mbarrier.arrive\n\n// computation not requiring mbarrier synchronization...\n\nwaitLoop:\nmbarrier.try_wait.shared.b64    complete, [shMem], %r1;\n@!complete bra waitLoop;\n\n\n// Example 2, thread synchronization using phase parity :\n\n.reg .b32 i, parArg;\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmov.b32 i, 0;\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nloopStart :                           // One phase per loop iteration\n    ...\n    mbarrier.arrive.shared.b64  %r1, [shMem]; // N threads\n    ...\n    and.b32 parArg, i, 1;\n    waitLoop:\n    mbarrier.test_wait.parity.shared.b64  complete, [shMem], parArg;\n    @!complete nanosleep.u32 20;\n    @!complete bra waitLoop;\n    ...\n    add.u32 i, i, 
1;\n    setp.lt.u32 p, i, IterMax;\n@p bra loopStart;\n\n\n// Example 3, Asynchronous copy completion waiting :\n\n.reg .b64 state;\n.shared .b64 shMem2;\n.shared .b64 shard1, shard2;\n.global .b64 gbl1, gbl2;\n\nmbarrier.init.shared.b64 [shMem2], threadCount;\n...\ncp.async.ca.shared.global [shard1], [gbl1], 4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n\n// Absence of .noinc accounts for arrive-on from prior cp.async operation\ncp.async.mbarrier.arrive.shared.b64 [shMem2];\n...\nmbarrier.arrive.shared.b64 state, [shMem2];\n\nwaitLoop:\nmbarrier.test_wait.shared::cta.b64 p, [shMem2], state;\n@!p bra waitLoop;\n\n// Example 4, Synchronizing the CTA0 threads with cluster threads\n.reg .b64 %r1, addr, remAddr;\n.shared .b64 shMem;\n\ncvta.shared.u64          addr, shMem;\nmapa.u64                 remAddr, addr, 0;     // CTA0\u2019s shMem instance\n\n// One thread from CTA0 executing the below initialization operation\n@p0 mbarrier.init.shared::cta.b64 [shMem], N;  // N = no of cluster threads\n\nbarrier.cluster.arrive;\nbarrier.cluster.wait;\n\n// Entire cluster executing the below arrive operation\nmbarrier.arrive.release.cluster.b64              _, [remAddr];\n\n// computation not requiring mbarrier synchronization ...\n\n// Only CTA0 threads executing the below wait operation\nwaitLoop:\nmbarrier.try_wait.parity.acquire.cluster.shared::cta.b64  complete, [shMem], 0;\n@!complete bra waitLoop;\n</pre></div>\n</div>\n</section>\n<section id=\"parallel-synchronization-and-communication-instructions-mbarrier-pending-count\">\n<h5><span class=\"section-number\">9.7.12.15.17. 
</span><a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-pending-count\">Parallel Synchronization and Communication Instructions: mbarrier.pending_count</a><a class=\"headerlink\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-pending-count\" title=\"Permalink to this headline\">\uf0c1</a></h5>\n<p><strong>mbarrier.pending_count</strong></p>\n<p>Query the pending arrival count from the opaque mbarrier state.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.pending_count.b64 count, state;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The pending count can be queried from the opaque mbarrier state using <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.pending_count</span></code>.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> operand is a 64-bit register that must be the result of a prior\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive.noComplete</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop.noComplete</span></code> instruction. 
Otherwise, the\nbehavior is undefined.</p>\n<p>The destination register <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> is a 32-bit unsigned integer representing the pending count of\nthe <em>mbarrier object</em> prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> from\nwhich the <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> register was obtained.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 %r1;\n.reg .b64 state;\n.shared .b64 shMem;\n\nmbarrier.arrive.noComplete.b64 state, [shMem], 1;\nmbarrier.pending_count.b64 %r1, state;\n</pre></div>\n</div>\n</section>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: mbarrier.arrive</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier-arrive\">\n\n\n<p>Performs <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> on the\n<em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.arrive{.sem}{.scope}{.shared{::cta}}.b64           state, [addr]{, count};\nmbarrier.arrive{.sem}{.scope}{.shared::cluster}.b64         _, [addr] {,count}\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], txCount;\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared::cluster}.b64   _, [addr], txCount;\nmbarrier.arrive.noComplete{.sem}{.cta}{.shared{::cta}}.b64  state, [addr], count;\n\n.sem   = { 
.release }\n.scope = { .cta, .cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A thread executing <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> performs an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a> operation\non the <em>mbarrier object</em> at the location specified by the address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>. The 32-bit\nunsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> specifies the <em>count</em> argument to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a>\noperation.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>The optional qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> specifies that an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx</a>\noperation is performed prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a>\noperation. The 32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">txCount</span></code> specifies the <em>expectCount</em> argument to\nthe <em>expect-tx</em> operation. When both qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.arrive</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> are specified, then\nthe count argument of the <em>arrive-on</em> operation is assumed to be 1.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> operation with <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> qualifier must not cause the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier</span></code> to\ncomplete its current phase, otherwise the behavior is undefined.</p>\n<p>The value of the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> must be in the range as specified in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier\nobject</a>.</p>\n<p>Note: for <code 
class=\"docutils literal notranslate\"><span class=\"pre\">sm_8x</span></code>, when the argument <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> is specified, the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> is\nrequired.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> operation on an <em>mbarrier object</em> located in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> returns an opaque\n64-bit register capturing the phase of the <em>mbarrier object</em> prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> in the\ndestination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">state.</span></code> Contents of the <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> operand are implementation\nspecific. Optionally, sink symbol <code class=\"docutils literal notranslate\"><span class=\"pre\">'_'</span></code> can be used for the <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> argument.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> operation on an <em>mbarrier object</em> located in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> but not in\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> cannot return a value. 
Sink symbol \u2018_\u2019 is mandatory for the destination operand for\nsuch cases.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier specifies a memory synchronizing effect as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory\nConsistency Model</a>. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier is absent,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> is assumed by default.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier indicates the set of threads that directly observe the memory\nsynchronizing effect of this operation, as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not specified then it\ndefaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code>. 
In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::&lt;scope&gt;</span></code> indicates the state space where the\nmbarrier resides.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sink symbol \u2018_\u2019 as the destination operand is introduced in PTX ISA version 7.1.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> argument without the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> introduced in PTX ISA version\n7.8.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> introduced in PTX ISA version 8.0.</p>\n<p>Support for qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> is introduced in PTX ISA version 8.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifiers introduced in PTX ISA version 8.0</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> argument without the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Qualifier <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.expect_tx</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 cnt, remoteAddr32, remoteCTAId, addr32;\n.reg .b64 %r&lt;3&gt;, addr, remoteAddr64;\n.shared .b64 shMem, shMem2;\n\ncvta.shared.u64            addr, shMem2;\nmov.b32                    addr32, shMem2;\nmapa.shared::cluster.u32   remoteAddr32, addr32, remoteCTAId;\nmapa.u64                   remoteAddr64, addr,   remoteCTAId;\n\ncvta.shared.u64          addr, shMem2;\n\nmbarrier.arrive.shared.b64                       %r0, [shMem];\nmbarrier.arrive.shared::cta.b64                  %r0, [shMem2];\nmbarrier.arrive.release.cta.shared::cluster.b64  _, [remoteAddr32];\nmbarrier.arrive.release.cluster.b64              _, [remoteAddr64], cnt;\nmbarrier.arrive.expect_tx.release.cluster.b64    _, [remoteAddr64], tx_count;\nmbarrier.arrive.noComplete.b64                   %r1, [addr], 2;\nmbarrier.arrive.b64                              %r2, [addr], cnt;\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop\">\n\n\n<p>Decrements the expected count of the <em>mbarrier object</em> and performs <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on 
operation</a>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.arrive_drop{.sem}{.scope}{.shared{::cta}}.b64 state,           [addr]{, count};\nmbarrier.arrive_drop{.sem}{.scope}{.shared::cluster}.b64           _,   [addr] {,count};\nmbarrier.arrive_drop.expect_tx{.shared{::cta}}{.sem}{.scope}.b64 state, [addr], tx_count;\nmbarrier.arrive_drop.expect_tx{.shared::cluster}{.sem}{.scope}.b64   _, [addr], tx_count;\nmbarrier.arrive_drop.noComplete{.sem}{.cta}{.shared{::cta}}.b64 state,  [addr], count;\n\n.sem   = { .release }\n.scope = { .cta, .cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A thread executing <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> on the <em>mbarrier object</em> at the location specified by\nthe address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> performs the following steps:</p>\n<ul class=\"simple\">\n<li><p>Decrements the expected arrival count of the <em>mbarrier object</em> by the value specified by the\n32-bit integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code>. If <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> operand is not specified, it defaults to 1.</p></li>\n<li><p>Performs an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> on the\n<em>mbarrier object</em>. 
The operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> specifies the <em>count</em> argument to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on\noperation</a>.</p></li>\n</ul>\n<p>The decrement done in the expected arrivals count of the <em>mbarrier object</em> will be for all the\nsubsequent phases of the <em>mbarrier object</em>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>The optional qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> specifies that an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx</a>\noperation is performed prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on</a>\noperation. 
The 32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">txCount</span></code> specifies the <em>expectCount</em> argument to\nthe <em>expect-tx</em> operation. When both qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.arrive</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> are specified, then\nthe count argument of the <em>arrive-on</em> operation is assumed to be 1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> operation forms the <em>release</em> pattern as described in the Memory\nConsistency Model and synchronizes with the <em>acquire</em> patterns.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier indicates the set of threads that an <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code>\ninstruction can directly synchronize. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not specified then it defaults\nto <code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code>. 
In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::&lt;scope&gt;</span></code> indicates the state space where the mbarrier\nresides.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> with <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> qualifier must not complete the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier,</span></code>\notherwise the behavior is undefined.</p>\n<p>The value of the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> must be in the range as specified in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier\nobject</a>.</p>\n<p>Note: for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_8x</span></code>, when the argument <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> is specified, the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> is\nrequired.</p>\n<p>A thread that wants to either exit or opt out of participating in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> can use\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> to drop itself from the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> operation on an <em>mbarrier object</em> located in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> returns an\nopaque 64-bit register capturing the phase of the <em>mbarrier object</em> prior to the <a 
class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on\noperation</a>\nin the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code>. Contents of the returned state are implementation\nspecific. Optionally, sink symbol <code class=\"docutils literal notranslate\"><span class=\"pre\">'_'</span></code> can be used for the <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> argument.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop</span></code> operation on an <em>mbarrier</em> object located in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> but not\nin <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> cannot return a value. Sink symbol \u2018_\u2019 is mandatory for the destination operand\nfor such cases.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> argument without the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> introduced in PTX ISA version\n7.8.</p>\n<p>Support for qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> is introduced in PTX ISA version 8.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> introduced in PTX ISA version 8.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.scope</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifiers introduced in PTX ISA version 8.0</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> argument without the modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.noComplete</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.expect_tx</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 cnt;\n.reg .b64 %r1;\n.shared .b64 shMem;\n\n// Example 1\n@p mbarrier.arrive_drop.shared.b64 _, [shMem];\n@p exit;\n@p2 mbarrier.arrive_drop.noComplete.shared.b64 _, [shMem], %a;\n@p2 exit;\n..\n@!p mbarrier.arrive.shared.b64   %r1, [shMem];\n@!p mbarrier.test_wait.shared.b64  q, [shMem], %r1;\n\n// Example 2\nmbarrier.arrive_drop.shared::cluster.b64 _, [addr];\nmbarrier.arrive_drop.shared::cta.release.cluster.b64     _, [addr], cnt;\n\n// Example 3\nmbarrier.arrive_drop.expect_tx.shared::cta.release.cta.b64 state, [addr], 
tx_count;\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: mbarrier.complete_tx</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier-complete-tx\">\n\n\n<p>Performs <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation on the <em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.complete_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem   = { .relaxed }\n.scope = { .cta, .cluster }\n.space = { .shared{::cta}, .shared::cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A thread executing <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.complete_tx</span></code> performs a <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation on the <em>mbarrier object</em> at the location specified by the address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>. The\n32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">txCount</span></code> specifies the <code class=\"docutils literal notranslate\"><span class=\"pre\">completeCount</span></code> argument to the\n<em>complete-tx</em> operation.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.complete_tx</span></code> does not involve any asynchronous memory operations and only simulates the\ncompletion of an asynchronous memory operation and its side effect of signaling to the <em>mbarrier\nobject</em>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. 
If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> are as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>This operation does not provide any memory ordering semantics and thus is a <em>relaxed</em> operation.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.complete_tx.b64             [addr],     32;\nmbarrier.complete_tx.shared.b64      [mbarObj1], 512;\nmbarrier.complete_tx.relaxed.cta.b64 [addr2],    32;\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: mbarrier.expect_tx</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier-expect-tx\">\n\n\n<p>Performs <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx</a> operation on the <em>mbarrier 
object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.expect_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem   = { .relaxed }\n.scope = { .cta, .cluster }\n.space = { .shared{::cta}, .shared::cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A thread executing <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.expect_tx</span></code> performs an <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-expect-tx-operation\">expect-tx</a>\noperation on the <em>mbarrier object</em> at the location specified by the address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>. The\n32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">txCount</span></code> specifies the <code class=\"docutils literal notranslate\"><span class=\"pre\">expectCount</span></code> argument to the\n<em>expect-tx</em> operation.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> are as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>This operation does not provide any memory ordering semantics and thus is a <em>relaxed</em> operation.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.expect_tx.b64                       [addr], 32;\nmbarrier.expect_tx.relaxed.cta.shared.b64    [mbarObj1], 512;\nmbarrier.expect_tx.relaxed.cta.shared.b64    [mbarObj2], 512;\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: mbarrier.init</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier-init\">\n\n\n<p>Initialize the <em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.init{.shared{::cta}}.b64 [addr], count;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.init</span></code> initializes the <em>mbarrier object</em> at the location specified by the address operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> with the unsigned 32-bit integer <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code>. 
The value of operand count must be in the range\nas specified in <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-contents\">Contents of the mbarrier object</a>.</p>\n<p>Initialization of the <em>mbarrier object</em> involves :</p>\n<ul class=\"simple\">\n<li><p>Initializing the current phase to 0.</p></li>\n<li><p>Initializing the expected arrival count to <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code>.</p></li>\n<li><p>Initializing the pending arrival count to <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code>.</p></li>\n<li><p>Initializing the <em>tx-count</em> to 0.</p></li>\n</ul>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.shared .b64 shMem, shMem2;\n.reg    .b64 addr;\n.reg    .b32 %r1;\n\ncvta.shared.u64          addr, shMem2;\nmbarrier.init.b64        [addr],   %r1;\nbar.cta.sync             0;\n// ... other mbarrier operations on addr\n\nmbarrier.init.shared::cta.b64 [shMem], 12;\nbar.sync                 0;\n// ... 
other mbarrier operations on shMem\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: mbarrier.inval</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier-inval\">\n\n\n<p>Invalidates the <em>mbarrier object</em>.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.inval{.shared{::cta}}.b64 [addr];\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.inval</span></code> invalidates the <em>mbarrier object</em> at the location specified by the address\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>.</p>\n<p>An <em>mbarrier object</em> must be invalidated before using its memory location for any other purpose.</p>\n<p>Performing any <em>mbarrier</em> operation except <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.init</span></code> on an invalidated mbarrier object\nresults in undefined behaviour.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.shared .b64 shmem;\n.reg    .b64 addr;\n.reg    .b32 %r1;\n.reg    .pred t0;\n\n// Example 1 :\nbar.sync                      0;\n@t0 mbarrier.init.b64     [addr], %r1;\n// ... other mbarrier operations on addr\nbar.sync                      0;\n@t0 mbarrier.inval.b64    [addr];\n\n\n// Example 2 :\nbar.cta.sync                  0;\nmbarrier.init.shared.b64           [shmem], 12;\n// ... other mbarrier operations on shmem\nbar.cta.sync                  0;\n@t0 mbarrier.inval.shared.b64      [shmem];\n\n// shmem can be reused here for unrelated use :\nbar.cta.sync                  0;\nst.shared.b64                      [shmem], ...;\n\n// shmem can be re-initialized as mbarrier object :\nbar.cta.sync                  0;\n@t0 mbarrier.init.shared.b64       [shmem], 24;\n// ... 
other mbarrier operations on shmem\nbar.cta.sync                  0;\n@t0 mbarrier.inval.shared::cta.b64 [shmem];\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: mbarrier.pending_count</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier-pending-count\">\n\n\n<p>Query the pending arrival count from the opaque mbarrier state.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.pending_count.b64 count, state;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The pending count can be queried from the opaque mbarrier state using <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.pending_count</span></code>.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> operand is a 64-bit register that must be the result of a prior\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive.noComplete</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive_drop.noComplete</span></code> instruction. 
Otherwise, the\nbehavior is undefined.</p>\n<p>The destination register <code class=\"docutils literal notranslate\"><span class=\"pre\">count</span></code> is a 32-bit unsigned integer representing the pending count of\nthe <em>mbarrier object</em> prior to the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-arrive-on\">arrive-on operation</a> from\nwhich the <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code> register was obtained.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 %r1;\n.reg .b64 state;\n.shared .b64 shMem;\n\nmbarrier.arrive.noComplete.b64 state, [shMem], 1;\nmbarrier.pending_count.b64 %r1, state;\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait</h1><section id=\"parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait\">\n<span id=\"parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait\"></span>\n\n<p>Checks whether the <em>mbarrier object</em> has completed the phase.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mbarrier.test_wait{.sem}{.scope}{.shared{::cta}}.b64        waitComplete, [addr], state;\nmbarrier.test_wait.parity{.sem}{.scope}{.shared{::cta}}.b64 waitComplete, [addr], phaseParity;\n\nmbarrier.try_wait{.sem}{.scope}{.shared{::cta}}.b64         waitComplete, [addr], state\n                                                               {, 
suspendTimeHint};\n\nmbarrier.try_wait{.sem}{.scope}.parity{.shared{::cta}}.b64  waitComplete, [addr], phaseParity\n                                                               {, suspendTimeHint};\n\n.sem   = { .acquire }\n.scope = { .cta, .cluster }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <em>test_wait</em> and <em>try_wait</em> operations test for the completion of the current or the immediately\npreceding phase of an <em>mbarrier object</em> at the location specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> is a non-blocking instruction which tests for the completion of the phase.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> is a potentially blocking instruction which tests for the completion of the\nphase. If the phase is not complete, the executing thread may be suspended. Suspended thread resumes\nexecution when the specified phase completes OR before the phase completes following a\nsystem-dependent time limit. 
The optional 32-bit unsigned integer operand <code class=\"docutils literal notranslate\"><span class=\"pre\">suspendTimeHint</span></code>\nspecifies the time limit, in nanoseconds, that may be used for the time limit instead of the\nsystem-dependent limit.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> test for completion of the phase :</p>\n<ul class=\"simple\">\n<li><p>Specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">state</span></code>, which was returned by an <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> instruction on\nthe same <em>mbarrier object</em> during the current or the immediately preceding phase. Or</p></li>\n<li><p>Indicated by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">phaseParity</span></code>, which is the integer parity of either the current phase\nor the immediately preceding phase of the <em>mbarrier object</em>.</p></li>\n</ul>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.parity</span></code> variant of the instructions test for the completion of the phase indicated by the\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">phaseParity</span></code>, which is the integer parity of either the current phase or the immediately\npreceding phase of the <em>mbarrier object</em>. An even phase has integer parity 0 and an odd phase has\ninteger parity of 1. 
So the valid values of <code class=\"docutils literal notranslate\"><span class=\"pre\">phaseParity</span></code> operand are 0 and 1.</p>\n<p>Note: the use of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.parity</span></code> variants of the instructions requires tracking the phase of an\n<em>mbarrier object</em> throughout its lifetime.</p>\n<p>The <em>test_wait</em> and <em>try_wait</em> operations are valid only for :</p>\n<ul class=\"simple\">\n<li><p>the current incomplete phase, for which <code class=\"docutils literal notranslate\"><span class=\"pre\">waitComplete</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>.</p></li>\n<li><p>the immediately preceding phase, for which <code class=\"docutils literal notranslate\"><span class=\"pre\">waitComplete</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code>.</p></li>\n</ul>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> does not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space then the behavior is undefined.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>. 
Alignment for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">addr</span></code> is as described in the <a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-size-alignment\">Size\nand alignment of mbarrier object</a>.</p>\n<p>When <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> operations return <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code>, they form the\n<em>acquire</em> pattern as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier indicates the set of threads that the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> instructions can directly synchronize. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not\nspecified then it defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.cta</span></code>. 
In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::&lt;scope&gt;</span></code> indicates the state\nspace where the mbarrier resides.</p>\n<p>The following ordering of memory operations hold for the executing thread when\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> :</p>\n<ol class=\"arabic simple\">\n<li><p>All memory accesses (except <a class=\"reference external\" href=\"#data-movement-and-conversion-instructions-cp-async\">async operations</a> ) requested prior, in program\norder, to <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> during the completed phase by the participating threads of the CTA\nare performed and are visible to the executing thread.</p></li>\n<li><p>All <a class=\"reference external\" href=\"#data-movement-and-conversion-instructions-cp-async\">cp.async</a> operations\nrequested prior, in program order, to <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.mbarrier.arrive</span></code> during the completed phase by\nthe participating threads of the CTA are performed and made visible to the executing thread.</p></li>\n<li><p>All <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk</span></code> asynchronous operations using the same <em>mbarrier object</em> requested prior,\nin program order, to <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> during the completed phase by the participating threads\nof the CTA are performed and made visible to the executing thread.</p></li>\n<li><p>All memory accesses requested after the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> or 
<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code>, in\nprogram order, are not performed and not visible to memory accesses performed prior to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code>, in program order, by other threads participating in the <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier</span></code>.</p></li>\n<li><p>There is no ordering and visibility guarantee for memory accesses requested by the thread after\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.arrive</span></code> and prior to <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code>, in program order.</p></li>\n</ol>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> introduced in PTX ISA version 7.0.</p>\n<p>Modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.parity</span></code> is introduced in PTX ISA version 7.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Support for sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifiers introduced in PTX ISA version 8.0</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.test_wait</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code 
class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.try_wait</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// Example 1a, thread synchronization with test_wait:\n\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nmbarrier.arrive.shared.b64  %r1, [shMem]; // N threads executing mbarrier.arrive\n\n// computation not requiring mbarrier synchronization...\n\nwaitLoop:\nmbarrier.test_wait.shared.b64    complete, [shMem], %r1;\n@!complete nanosleep.u32 20;\n@!complete bra waitLoop;\n\n// Example 1b, thread synchronization with try_wait :\n\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nmbarrier.arrive.shared.b64  %r1, [shMem]; // N threads executing mbarrier.arrive\n\n// computation not requiring mbarrier synchronization...\n\nwaitLoop:\nmbarrier.try_wait.shared.b64    complete, [shMem], %r1;\n@!complete bra waitLoop;\n\n\n// Example 2, thread synchronization using phase parity :\n\n.reg .b32 i, parArg;\n.reg .b64 %r1;\n.shared .b64 shMem;\n\nmov.b32 i, 0;\nmbarrier.init.shared.b64 [shMem], N;  // N threads participating in the mbarrier.\n...\nloopStart :                           // One phase per loop iteration\n    ...\n    mbarrier.arrive.shared.b64  %r1, [shMem]; // N threads\n    ...\n    and.b32 parArg, i, 1;\n    waitLoop:\n    mbarrier.test_wait.parity.shared.b64  complete, [shMem], parArg;\n    @!complete nanosleep.u32 20;\n    @!complete bra waitLoop;\n    ...\n    add.u32 i, i, 
1;\n    setp.lt.u32 p, i, IterMax;\n@p bra loopStart;\n\n\n// Example 3, Asynchronous copy completion waiting :\n\n.reg .b64 state;\n.shared .b64 shMem2;\n.shared .b64 shard1, shard2;\n.global .b64 gbl1, gbl2;\n\nmbarrier.init.shared.b64 [shMem2], threadCount;\n...\ncp.async.ca.shared.global [shard1], [gbl1], 4;\ncp.async.cg.shared.global [shard2], [gbl2], 16;\n\n// Absence of .noinc accounts for arrive-on from prior cp.async operation\ncp.async.mbarrier.arrive.shared.b64 [shMem2];\n...\nmbarrier.arrive.shared.b64 state, [shMem2];\n\nwaitLoop:\nmbarrier.test_wait.shared::cta.b64 p, [shMem2], state;\n@!p bra waitLoop;\n\n// Example 4, Synchronizing the CTA0 threads with cluster threads\n.reg .b64 %r1, addr, remAddr;\n.shared .b64 shMem;\n\ncvta.shared.u64          addr, shMem;\nmapa.u64                 remAddr, addr, 0;     // CTA0\u2019s shMem instance\n\n// One thread from CTA0 executing the below initialization operation\n@p0 mbarrier.init.shared::cta.b64 [shMem], N;  // N = no of cluster threads\n\nbarrier.cluster.arrive;\nbarrier.cluster.wait;\n\n// Entire cluster executing the below arrive operation\nmbarrier.arrive.release.cluster.b64              _, [remAddr];\n\n// computation not requiring mbarrier synchronization ...\n\n// Only CTA0 threads executing the below wait operation\nwaitLoop:\nmbarrier.try_wait.parity.acquire.cluser.shared::cta.b64  complete, [shMem], 0;\n@!complete bra waitLoop;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Parallel Synchronization and Communication Instructions: mbarrier\n\n\n\nSynchronizing any subset of threads within a CTA\n\nOne-way synchronization of threads across CTAs of a cluster. As noted in mbarrier support with\n\nshared memory, threads can\n\nperform only arrive operations but not *_wait on an mbarrier located in shared::cluster\n\nspace.\n\nWaiting for completion of asynchronous memory operations initiated by a thread and making them\n\nvisible to other threads.\n\nAn mbarrier o...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.arrive\n\n\n\nPerforms arrive-on operation on the\n\nmbarrier object.\n\nSyntax\n\nmbarrier.arrive{.sem}{.scope}{.shared{::cta}}.b64           state, [addr]{, count};\n\nmbarrier.arrive{.sem}{.scope}{.shared::cluster}.b64         _, [addr] {,count}\n\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared{::cta}}.b64 state, [addr], txCount;\n\nmbarrier.arrive.expect_tx{.sem}{.scope}{.shared::cluster}.b64   _, [addr], txCount;\n\nmbarrier....\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.arrive_drop\n\n\n\nDecrements the expected count of the mbarrier object and performs arrive-on operation.\n\nSyntax\n\nmbarrier.arrive_drop{.sem}{.scope}{.shared{::cta}}.b64 state,           [addr]{, count};\n\nmbarrier.arrive_drop{.sem}{.scope}{.shared::cluster}.b64           _,   [addr] {,count};\n\nmbarrier.arrive_drop.expect_tx{.shared{::cta}}{.sem}{.scope}.b64 state, [addr], tx_count;\n\nmbarrier.arrive_drop.expect_tx{.shared...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.complete_tx\n\n\n\nPerfoms complete-tx\n\noperation on the mbarrier object.\n\nSyntax\n\nmbarrier.complete_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem   = { .relaxed }\n\n.scope = { .cta, .cluster }\n\n.space = { .shared{::cta}, .shared::cluster }\n\nDescription\n\nA thread executing mbarrier.complete_tx performs a complete-tx\n\noperation on the mbarrier 
object at the location specified by the address operand addr. The\n\n32-bit unsig...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.expect_tx\n\n\n\nPerfoms expect-tx operation on the mbarrier object.\n\nSyntax\n\nmbarrier.expect_tx{.sem}{.scope}{.space}.b64 [addr], txCount;\n\n.sem   = { .relaxed }\n\n.scope = { .cta, .cluster }\n\n.space = { .shared{::cta}, .shared::cluster }\n\nDescription\n\nA thread executing mbarrier.expect_tx performs an expect-tx\n\noperation on the mbarrier object at the location specified by the address operand addr. The\n\n32-bit unsigned int...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.init\n\n\n\nInitialize the mbarrier object.\n\nSyntax\n\nmbarrier.init{.shared{::cta}}.b64 [addr], count;\n\nDescription\n\nmbarrier.init initializes the mbarrier object at the location specified by the address operand\n\naddr with the unsigned 32-bit integer count. The value of operand count must be in the range\n\nas specified in Contents of the mbarrier object.\n\nInitialization of the mbarrier object involves :\n\nInitializing t...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.inval\n\n\n\nInvalidates the mbarrier object.\n\nSyntax\n\nmbarrier.inval{.shared{::cta}}.b64 [addr];\n\nDescription\n\nmbarrier.inval invalidates the mbarrier object at the location specified by the address\n\noperand addr.\n\nAn mbarrier object must be invalidated before using its memory location for any other purpose.\n\nPerforming any mbarrier operation except mbarrier.init on an invalidated mbarrier object\n\nresults in undefine...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.pending_count\n\n\n\nQuery the pending arrival count from the opaque mbarrier state.\n\nSyntax\n\nmbarrier.pending_count.b64 count, state;\n\nDescription\n\nThe pending count can be queried from the opaque mbarrier state using mbarrier.pending_count.\n\nThe state operand is a 64-bit 
register that must be the result of a prior\n\nmbarrier.arrive.noComplete or mbarrier.arrive_drop.noComplete instruction. Otherwise, the\n\nbehavior is undefi...\n\n=====Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait\n\n\n\nChecks whether the mbarrie ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier"
            };

        case "membar/fence":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence\" target=\"_blank\" rel=\"noopener noreferrer\">membar/fence <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: membar/fence</h1><section id=\"parallel-synchronization-and-communication-instructions-membar-fence\">\n<span id=\"parallel-synchronization-and-communication-instructions-membar\"></span>\n\n<p>Enforce an ordering of memory operations.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>fence{.sem}.scope;\nfence.op_restrict.release.cluster;\nfence.proxy.proxykind;\nmembar.level;\nmembar.proxy.proxykind;\n\n.sem       = { .sc, .acq_rel };\n.scope     = { .cta, .cluster, .gpu, .sys };\n.level     = { .cta, .gl, .sys };\n.proxykind = { .alias, .async, async.global, .async.shared::{cta, cluster} };\n.op_restrict = { .mbarrier_init };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">membar</span></code> instruction guarantees that prior memory accesses requested by this thread (<code class=\"docutils literal notranslate\"><span class=\"pre\">ld</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">st</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code> instructions) are performed at the specified <code class=\"docutils literal notranslate\"><span class=\"pre\">level</span></code>, before later\nmemory operations requested by this thread following the <code class=\"docutils literal notranslate\"><span class=\"pre\">membar</span></code> 
instruction. The <code class=\"docutils literal notranslate\"><span class=\"pre\">level</span></code>\nqualifier specifies the set of threads that may observe the ordering effect of this operation.</p>\n<p>A memory read (e.g., by <code class=\"docutils literal notranslate\"><span class=\"pre\">ld</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code>) has been performed when the value read has been\ntransmitted from memory and cannot be modified by another thread at the indicated level. A memory\nwrite (e.g., by <code class=\"docutils literal notranslate\"><span class=\"pre\">st</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code>) has been performed when the value written has become\nvisible to other threads at the specified level, that is, when the previous value can no longer be\nread.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">fence</span></code> instruction establishes an ordering between memory accesses requested by this thread\n(<code class=\"docutils literal notranslate\"><span class=\"pre\">ld</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">st</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code> instructions) as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. The scope qualifier specifies the set of threads that may\nobserve the ordering effect of this operation.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fence.acq_rel</span></code> is a light-weight fence that is sufficient for memory synchronization in most\nprograms. 
Instances of <code class=\"docutils literal notranslate\"><span class=\"pre\">fence.acq_rel</span></code> synchronize when combined with additional memory operations\nas described in <code class=\"docutils literal notranslate\"><span class=\"pre\">acquire</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">release</span></code> patterns in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. If the optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier is absent, <code class=\"docutils literal notranslate\"><span class=\"pre\">.acq_rel</span></code>\nis assumed by default.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fence.sc</span></code> is a slower fence that can restore <em>sequential consistency</em> when used in sufficient\nplaces, at the cost of performance. Instances of <code class=\"docutils literal notranslate\"><span class=\"pre\">fence.sc</span></code> with sufficient scope always\nsynchronize by forming a total order per scope, determined at runtime. This total order can be\nconstrained further by other synchronization in the program.</p>\n<p>Qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.op_restrict</span></code> restricts the class of prior memory operations for which the <code class=\"docutils literal notranslate\"><span class=\"pre\">fence</span></code>\ninstruction provides the memory ordering guarantees. 
When <code class=\"docutils literal notranslate\"><span class=\"pre\">.op_restrict</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">.mbarrier_init</span></code>,\nthe fence only applies to the prior <code class=\"docutils literal notranslate\"><span class=\"pre\">mbarrier.init</span></code> operations executed by the same thread on\n<em>mbarrier objects</em> in <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cta</span></code> state space.</p>\n<p>Qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> indicates memory synchronization as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency\nModel</a>.</p>\n<p>On <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> and higher <code class=\"docutils literal notranslate\"><span class=\"pre\">membar</span></code> is a synonym for <code class=\"docutils literal notranslate\"><span class=\"pre\">fence.sc</span></code><sup>1</sup>, and the <code class=\"docutils literal notranslate\"><span class=\"pre\">membar</span></code>\nlevels <code class=\"docutils literal notranslate\"><span class=\"pre\">cta</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">gl</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">sys</span></code> are synonymous with the <code class=\"docutils literal notranslate\"><span class=\"pre\">fence</span></code> scopes <code class=\"docutils literal notranslate\"><span class=\"pre\">cta</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">gpu</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">sys</span></code> respectively.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">membar.proxy</span></code> and <code class=\"docutils literal notranslate\"><span 
class=\"pre\">fence.proxy</span></code> instructions establish an ordering between memory accesses that\nmay happen through different <em>proxies</em>. The type of <em>proxy</em> is indicated using the <code class=\"docutils literal notranslate\"><span class=\"pre\">.proxykind</span></code>\nqualifier. Value <code class=\"docutils literal notranslate\"><span class=\"pre\">.alias</span></code> of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.proxykind</span></code> qualifier refers to memory accesses performed\nusing virtually aliased addresses to the same memory location. Value <code class=\"docutils literal notranslate\"><span class=\"pre\">.async</span></code> of the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.proxykind</span></code> qualifier specifies that the memory ordering is established between the async proxy\nand the generic proxy. The memory ordering is limited only to the state space specified. If no state\nspace is specified, then the memory ordering applies on all state spaces.</p>\n<p>On <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> and higher, <code class=\"docutils literal notranslate\"><span class=\"pre\">membar.proxy</span></code> is a synonym for <code class=\"docutils literal notranslate\"><span class=\"pre\">fence.proxy</span></code>.</p>\n<p><sup>1</sup> The semantics of <code class=\"docutils literal notranslate\"><span class=\"pre\">fence.sc</span></code> introduced with <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> is a superset of the semantics of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">membar</span></code> and the two are compatible; when executing on <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or later architectures,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">membar</span></code> acquires the full semantics of <code 
class=\"docutils literal notranslate\"><span class=\"pre\">fence.sc</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">membar.{cta,gl}</span></code> introduced in PTX ISA version 1.4.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">membar.sys</span></code> introduced in PTX ISA version 2.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fence</span></code> introduced in PTX ISA version 6.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">membar.proxy</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">fence.proxy</span></code> introduced in PTX ISA version 7.5.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope qualifier introduced in PTX ISA version 7.8.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.op_restrict</span></code> qualifier introduced in PTX ISA version 8.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fence.proxy.async</span></code> is introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">membar.{cta,gl}</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">membar.sys</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fence</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">membar.proxy</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_60</span></code> or higher.</p>\n<p><code 
class=\"docutils literal notranslate\"><span class=\"pre\">fence.proxy</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.op_restrict</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">fence.proxy.async</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>membar.gl;\nmembar.cta;\nmembar.sys;\nfence.sc;\nfence.sc.cluster;\nfence.proxy.alias;\nmembar.proxy.alias;\nfence.mbarrier_init.release.cluster;\nfence.proxy.async;\nfence.proxy.async.shared::cta;\nfence.proxy.async.shared::cluster;\nfence.proxy.async.global;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Enforce an ordering of memory operations.\n\nSyntax\n\nfence{.sem}.scope;\n\nfence.op_restrict.release.cluster;\n\nfence.proxy.proxykind;\n\nmembar.level;\n\nmembar.proxy.proxykind;\n\n.sem       = { .sc, .acq_rel };\n\n.scope     = { .cta, .cluster, .gpu, .sys };\n\n.level     = { .cta, .gl, .sys };\n\n.proxykind = { .alias, .async, async.global, .async.shared::{cta, cluster} };\n\n.op_restrict = { .mbarrier_init };\n\nDescription\n\nThe membar instruction guarantees that prior memory accesses requested by this thread (ld,\n\nst, atom and red instructions) are performed at the specified level, before later\n\nmemory operations requested by this thread following the membar instruction. The level\n\nqualifier specifies the set of threads that may observe the ordering effect of this operation.\n\nA memory read (e.g., by ld or atom) has been performed when the value read has been\n\ntransmitted from memory and cannot be modified by another thread at the indicated level. A memory\n\nwrite (e.g., by st, red or atom) has been performed when the value written has become\n\nvisible to other threads at the specified level, that is, when the previous value can no longer be\n\nread.\n\nThe fence instruction establishes an ordering between memory accesses requested by this thread\n\n(ld, st, atom and red instructions) as described in the Memory Consistency Model. The scope qualifier specifies the set of threads that may\n\nobserve the ordering effect of this operation.\n\nfence.acq_rel is a light-weight fence that is sufficient for memory synchronization in most\n\nprograms. Instances of fence.acq_rel synchronize when combined with additional memory operations\n\nas described in acquire and release patterns in the Memory Consistency Model. If the optional .sem qualifier is absent, .acq_rel\n\nis assumed by default.\n\nfence.sc is a slower fence that can restore sequential consistency when used in sufficient\n\nplaces, at the cost of performance. 
Instances of fence.sc with sufficient scope always\n\nsynchronize by forming a total order per scope, determined at runtime. This total order can be\n\nconstrained further by other synchronization in the program.\n\nQualifier .op_restrict restricts the class of prior memory operations for which the fence\n\ninstruction provides the memory ordering guarantees. When .op_restrict is .mbarrier_init,\n\nthe fence only applies to the prior mbarrier.init operations executed by the same thread on\n\nmbarrier objects in .shared::cta state space.\n\nQualifier .release indicates memory synchronization as described in the Memory Consistency\n\nModel.\n\nOn sm_70 and higher membar is a synonym for fence.sc1, and the membar\n\nlevels cta, gl and sys are synonymous with the fence scopes cta, gpu and\n\nsys respectively.\n\nmembar.proxy and fence.proxy instructions establish an ordering between memory accesses that\n\nmay happen through different proxies. The type of proxy is indicated using the .proxykind\n\nqualifier. Value .alias of the .proxykind qualifier refers to memory accesses performed\n\nusing virtually aliased addresses to the same memory location. Value .async of the\n\n.proxykind qualifier specifies that the memory ordering is established between the async proxy\n\nand the generic proxy. The memory ordering is limited only to the state space specified. 
If no state\n\nspace is specified, then the memory ordering applies on all state spaces.\n\nOn sm_70 and higher, membar.proxy is a synonym for fence.proxy.\n\n1 The semantics of fence.sc introduced with sm_70 is a superset of the semantics of\n\nmembar and the two are compatible; when executing on sm_70 or later architectures,\n\nmembar acquires the full semantics of fence.sc.\n\nPTX ISA Notes\n\nmembar.{cta,gl} introduced in PTX ISA version 1.4.\n\nmembar.sys introduced in PTX ISA version 2.0.\n\nfence introduced in PTX ISA version 6.0.\n\nmembar.proxy and fence.proxy introduced in PTX ISA version 7.5.\n\n.cluster scope qualifier introduced in PTX ISA version 7.8.\n\n.op_restrict qualifier introduced in PTX ISA version 8.0.\n\nfence.proxy.async is introduced in PTX ISA version 8.0.\n\nTarget ISA Notes\n\nmembar.{cta,gl} supported on ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence"
            };

        case "min":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min\" target=\"_blank\" rel=\"noopener noreferrer\">min(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-min\" target=\"_blank\" rel=\"noopener noreferrer\">min(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-min\" target=\"_blank\" rel=\"noopener noreferrer\">min(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: min</h1><section id=\"floating-point-instructions-min\">\n\n\n<p>Find the minimum of two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>min{.ftz}{.NaN}{.xorsign.abs}.f32  d, a, b;\nmin.f64                            d, a, b;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Store the minimum of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.NaN</span></code> modifier is specified, then the result is canonical <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> if either of the inputs is\n<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n<p>If <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.abs</span></code> modifier is specified, the magnitude of destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is the minimum of\nabsolute values of both the input arguments.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> modifier is specified, the sign bit of destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is equal to the XOR of the\nsign bits of both the inputs.</p>\n<p>Modifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> must be specified together and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> considers the sign\nbit of both inputs before applying <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> operation.</p>\n<p>If the result of <code class=\"docutils literal notranslate\"><span class=\"pre\">min</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> then the <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> modifiers will be ignored.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (.xorsign) {\n    xorsign = getSignBit(a) ^ getSignBit(b);\n    if (.abs) {\n        a = |a|;\n        b = |b|;\n   }\n}\nif (isNaN(a) &amp;&amp; isNaN(b))                 d = NaN;\nelse if (.NaN &amp;&amp; (isNaN(a) || isNaN(b)))  d = NaN;\nelse if (isNaN(a))                        d = b;\nelse if (isNaN(b))                        d = a;\nelse                                      d = (a &lt; b) ? 
a : b;\nif (.xorsign &amp;&amp; !isNaN(d)) {\n    setSignBit(d, xorsign);\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>If values of both inputs are 0.0, then +0.0 &gt; -0.0.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.NaN</span></code>introduced in PTX ISA version 7.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.xorsign.abs</span></code> introduced in PTX ISA version 7.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.NaN</span></code>requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.xorsign.abs</span></code> requires <code class=\"docutils literal 
notranslate\"><span class=\"pre\">sm_86</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>@p  min.ftz.f32  z,z,x;\n    min.f64      a,b,c;\n    // fp32 min with .NaN\n    min.NaN.f32  f0,f1,f2;\n    // fp32 min with .xorsign.abs\n    min.xorsign.abs.f32 Rd, Ra, Rb;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: min</h1><section id=\"half-precision-floating-point-instructions-min\">\n\n\n<p>Find the minimum of two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>min{.ftz}{.NaN}{.xorsign.abs}.f16      d, a, b;\nmin{.ftz}{.NaN}{.xorsign.abs}.f16x2    d, a, b;\nmin{.NaN}{.xorsign.abs}.bf16           d, a, b;\nmin{.NaN}{.xorsign.abs}.bf16x2         d, a, b;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Store the minimum of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction types, input vectors are formed with half-word values\nfrom source operands. 
Half-word operands are then processed in parallel to store <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> result in destination.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. 
For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction\ntype, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.NaN</span></code> modifier is specified, then the result is canonical <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> if either of the inputs is\n<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> modifier is specified, the magnitude of destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is the minimum of\nabsolute values of both the input arguments.</p>\n<p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> modifier is specified, the sign bit of destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is equal to the XOR of the\nsign bits of both the inputs.</p>\n<p>Modifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> must be specified together and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> considers the sign\nbit of both inputs before applying <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> operation.</p>\n<p>If the result of <code class=\"docutils literal notranslate\"><span class=\"pre\">min</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> then the <code class=\"docutils 
literal notranslate\"><span class=\"pre\">.xorsign</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.abs</span></code> modifiers will be ignored.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (type == f16 || type == bf16) {\n    if (.xorsign) {\n        xorsign = getSignBit(a) ^ getSignBit(b);\n        if (.abs) {\n            a = |a|;\n            b = |b|;\n        }\n    }\n    if (isNaN(a) &amp;&amp; isNaN(b))              d = NaN;\n    if (.NaN &amp;&amp; (isNaN(a) || isNaN(b)))    d = NaN;\n    else if (isNaN(a))                     d = b;\n    else if (isNaN(b))                     d = a;\n    else                                   d = (a &lt; b) ? a : b;\n    if (.xorsign &amp;&amp; !isNaN(d)) {\n         setSignBit(d, xorsign);\n    }\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    for (i = 0; i &lt; 2; i++) {\n        if (.xorsign) {\n            xorsign = getSignBit(fA[i]) ^ getSignBit(fB[i]);\n            if (.abs) {\n               fA[i] = |fA[i]|;\n               fB[i] = |fB[i]|;\n           }\n        }\n        if (isNaN(fA[i]) &amp;&amp; isNaN(fB[i]))              d[i] = NaN;\n        if (.NaN &amp;&amp; (isNaN(fA[i]) || isNaN(fB[i])))    d[i] = NaN;\n        else if (isNaN(fA[i]))                         d[i] = fB[i];\n        else if (isNaN(fB[i]))                         d[i] = fA[i];\n        else                                           d[i] = (fA[i] &lt; fB[i]) ? 
fA[i] : fB[i];\n        if (.xorsign &amp;&amp; !isNaN(d[i])) {\n            setSignBit(d[i], xorsign);\n        }\n    }\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<dl class=\"simple\">\n<dt>Subnormal numbers:</dt><dd><p>By default, subnormal numbers are supported.\n<code class=\"docutils literal notranslate\"><span class=\"pre\">min.ftz.{f16,</span> <span class=\"pre\">f16x2}</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>If values of both inputs are 0.0, then +0.0 &gt; -0.0.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.xorsign</span></code> introduced in PTX ISA version 7.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.xorsign.abs</span></code> support requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_86</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>min.ftz.f16       h0,h1,h2;\nmin.f16x2         b0,b1,b2;\n// SIMD fp16 min with .NaN\nmin.NaN.f16x2     b0,b1,b2;\nmin.bf16          h0, h1, h2;\n// SIMD bf16 min with NaN\nmin.NaN.bf16x2    b0, b1, b2;\n// scalar bf16 min with xorsign.abs\nmin.xorsign.abs.bf16 Rd, Ra, Rb\n</pre></div>\n</div>\n</section>\n<h1>Integer Arithmetic Instructions: min</h1><section id=\"integer-arithmetic-instructions-min\">\n\n\n<p>Find the minimum of two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>min.atype         d, a, b;\nmin{.relu}.btype  d, a, b;\n\n.atype = { .u16, .u32, .u64,\n           .u16x2, .s16, .s64 };\n.btype = { .s16x2, .s32 
};\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Store the minimum of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code> instruction types, forms input vectors by half word values from source\noperands. Half-word operands are then processed in parallel to produce <code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code> result\nin destination.</p>\n<p>Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have the same type as the instruction type. 
For instruction types\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s16x2</span></code>, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (type == u16x2 || type == s16x2) {\n    iA[0] = a[0:15];\n    iA[1] = a[16:31];\n    iB[0] = b[0:15];\n    iB[1] = b[16:31];\n    for (i = 0; i &lt; 2; i++) {\n         d[i] = (iA[i] &lt; iB[i]) ? iA[i] : iB[i];\n    }\n} else {\n    d = (a &lt; b) ? a : b; // Integer (signed and unsigned)\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Signed and unsigned differ.</p>\n<dl class=\"simple\">\n<dt>Saturation modifier:</dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.relu.{s16x2,</span> <span class=\"pre\">s32}</span></code> clamps the result to 0 if negative.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">min{.relu}.s16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">min.relu.s32</span></code> introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">min.u16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">min{.relu}.s16x2</span></code> and <code 
class=\"docutils literal notranslate\"><span class=\"pre\">min.relu.s32</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>    min.s32  r0,a,b;\n@p  min.u16  h,i,j;\n    min.s16x2.relu u,v,w;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: min\n\n\n\nFind the minimum of two values.\n\nSyntax\n\nmin{.ftz}{.NaN}{.xorsign.abs}.f32  d, a, b;\n\nmin.f64                            d, a, b;\n\nDescription\n\nStore the minimum of a and b in d.\n\nIf .NaN modifier is specified, then the result is canonical NaN if either of the inputs is\n\nNaN.\n\nIf .abs modifier is specified, the magnitude of destination operand d is the minimum of\n\nabsolute values of both the input argument...\n\n=====Half Precision Floating Point Instructions: min\n\n\n\nFind the minimum of two values.\n\nSyntax\n\nmin{.ftz}{.NaN}{.xorsign.abs}.f16      d, a, b;\n\nmin{.ftz}{.NaN}{.xorsign.abs}.f16x2    d, a, b;\n\nmin{.NaN}{.xorsign.abs}.bf16           d, a, b;\n\nmin{.NaN}{.xorsign.abs}.bf16x2         d, a, b;\n\nDescription\n\nStore the minimum of a and b in d.\n\nFor .f16x2 and .bf16x2 instruction types, input vectors are formed with half-word values\n\nfrom source operands. Half-word o...\n\n=====Integer Arithmetic Instructions: min\n\n\n\nFind the minimum of two values.\n\nSyntax\n\nmin.atype         d, a, b;\n\nmin{.relu}.btype  d, a, b;\n\n.atype = { .u16, .u32, .u64,\n\n           .u16x2, .s16, .s64 };\n\n.btype = { .s16x2, .s32 };\n\nDescription\n\nStore the minimum of a and b in d.\n\nFor .u16x2, .s16x2 instruction types, forms input vectors by half word values from source\n\noperands. Half-word operands are then processed in parallel to produce .u16x2, .s... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-min"
            };

        case "minnctapersm":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-minnctapersm\" target=\"_blank\" rel=\"noopener noreferrer\">minnctapersm <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Performance-Tuning Directives: .minnctapersm</h1><section id=\"performance-tuning-directives-minnctapersm\">\n\n\n<p>Minimum number of CTAs per SM.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.minnctapersm ncta\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declare the minimum number of CTAs from the kernel\u2019s grid to be mapped to a single multiprocessor\n(SM).</p>\n<p><strong>Notes</strong></p>\n<p>Optimizations based on <code class=\"docutils literal notranslate\"><span class=\"pre\">.minnctapersm</span></code> need either <code class=\"docutils literal notranslate\"><span class=\"pre\">.maxntid</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.reqntid</span></code> to be specified as\nwell.</p>\n<p>If the total number of threads on a single SM resulting from <code class=\"docutils literal notranslate\"><span class=\"pre\">.minnctapersm</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.maxntid</span></code> /\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.reqntid</span></code> exceed maximum number of threads supported by an SM then directive <code class=\"docutils literal notranslate\"><span class=\"pre\">.minnctapersm</span></code>\nwill be ignored.</p>\n<p>In PTX ISA version 2.1 or higher, a warning is generated if <code class=\"docutils literal notranslate\"><span class=\"pre\">.minnctapersm</span></code> is specified without\nspecifying either <code class=\"docutils literal notranslate\"><span class=\"pre\">.maxntid</span></code> 
or <code class=\"docutils literal notranslate\"><span class=\"pre\">.reqntid</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0 as a replacement for <code class=\"docutils literal notranslate\"><span class=\"pre\">.maxnctapersm</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.entry foo .maxntid 256 .minnctapersm 4 { ... }\n</pre></div>\n</div>\n</section>",
                "tooltip": "Minimum number of CTAs per SM.\n\nSyntax\n\n.minnctapersm ncta\n\nDescription\n\nDeclare the minimum number of CTAs from the kernel\u2019s grid to be mapped to a single multiprocessor\n\n(SM).\n\nNotes\n\nOptimizations based on .minnctapersm need either .maxntid or .reqntid to be specified as\n\nwell.\n\nIf the total number of threads on a single SM resulting from .minnctapersm and .maxntid /\n\n.reqntid exceed maximum number of threads supported by an SM then directive .minnctapersm\n\nwill be ignored.\n\nIn PTX ISA version 2.1 or higher, a warning is generated if .minnctapersm is specified without\n\nspecifying either .maxntid or .reqntid.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0 as a replacement for .maxnctapersm.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.entry foo .maxntid 256 .minnctapersm 4 { ... }\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-minnctapersm"
            };

        case "mov":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov\" target=\"_blank\" rel=\"noopener noreferrer\">mov <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2\" target=\"_blank\" rel=\"noopener noreferrer\">mov <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: mov</h1><section id=\"data-movement-and-conversion-instructions-mov\">\n\n\n<p>Set a register variable with the value of a register variable or an immediate value. Take the\nnon-generic address of a variable in global, local, or shared state space.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.type  d, a;\nmov.type  d, sreg;\nmov.type  d, avar;       // get address of variable\nmov.type  d, avar+imm;   // get address of variable with offset\nmov.u32   d, fname;      // get address of device function\nmov.u64   d, fname;      // get address of device function\nmov.u32   d, kernel;     // get address of entry function\nmov.u64   d, kernel;     // get address of entry function\n\n.type = { .pred,\n          .b16, .b32, .b64,\n          .u16, .u32, .u64,\n          .s16, .s32, .s64,\n                .f32, .f64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Write register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> with the value of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> may be a register, special register, variable with optional 
offset in an addressable\nmemory space, or function name.</p>\n<p>For variables declared in <code class=\"docutils literal notranslate\"><span class=\"pre\">.const</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.local</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state spaces, <code class=\"docutils literal notranslate\"><span class=\"pre\">mov</span></code>\nplaces the non-generic address of the variable (i.e., the address of the variable in its state\nspace) into the destination register. The generic address of a variable in <code class=\"docutils literal notranslate\"><span class=\"pre\">const</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">global</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">local</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">shared</span></code> state space may be generated by first taking the address within the state\nspace with <code class=\"docutils literal notranslate\"><span class=\"pre\">mov</span></code> and then converting it to a generic address using the <code class=\"docutils literal notranslate\"><span class=\"pre\">cvta</span></code> instruction;\nalternately, the generic address of a variable declared in <code class=\"docutils literal notranslate\"><span class=\"pre\">const</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">global</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">local</span></code>, or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">shared</span></code> state space may be taken directly using the <code class=\"docutils literal notranslate\"><span class=\"pre\">cvta</span></code> instruction.</p>\n<p>Note that if the address of a device function parameter is 
moved to a register, the parameter will\nbe copied onto the stack and the address will be in the local state space.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a;\nd = sreg;\nd = &amp;avar;        // address is non-generic; i.e., within the variable's declared state space\nd = &amp;avar+imm;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<ul class=\"simple\">\n<li><p>Although only predicate and bit-size types are required, we include the arithmetic types for the\nprogrammer\u2019s convenience: their use enhances program readability and allows additional type\nchecking.</p></li>\n<li><p>When moving address of a kernel or a device function, only <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> instruction types\nare allowed. However, if a signed type is used, it is not treated as a compilation error. The\ncompiler issues a warning in this case.</p></li>\n</ul>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p>Taking the address of kernel entry functions requires PTX ISA version 3.1 or later. Kernel function\naddresses should only be used in the context of CUDA Dynamic Parallelism system calls. 
See the <em>CUDA\nDynamic Parallelism Programming Guide</em> for details.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mov.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>Taking the address of kernel entry functions requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_35</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.f32  d,a;\nmov.u16  u,v;\nmov.f32  k,0.1;\nmov.u32  ptr, A;        // move address of A into ptr\nmov.u32  ptr, A[5];     // move address of A[5] into ptr\nmov.u32  ptr, A+20;     // move address with offset into ptr\nmov.u32  addr, myFunc;  // get address of device function 'myFunc'\nmov.u64  kptr, main;    // get address of entry function 'main'\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: mov</h1><section id=\"data-movement-and-conversion-instructions-mov-2\">\n<span id=\"id5\"></span>\n\n<p>Move vector-to-scalar (pack) or scalar-to-vector (unpack).</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.type  d, a;\n\n.type = { .b16, .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Write scalar register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> with the packed value of vector register <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, or write vector register\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> with the unpacked values from scalar register <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>When destination operand <code class=\"docutils literal notranslate\"><span 
class=\"pre\">d</span></code> is a vector register, the sink symbol <code class=\"docutils literal notranslate\"><span class=\"pre\">'_'</span></code> may be used for one or\nmore elements provided that at least one element is a scalar register.</p>\n<p>For bit-size types, <code class=\"docutils literal notranslate\"><span class=\"pre\">mov</span></code> may be used to pack vector elements into a scalar register or unpack\nsub-fields of a scalar register into a vector. Both the overall size of the vector and the size of\nthe scalar must match the size of the instruction type.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// pack two 8-bit elements into .b16\nd = a.x | (a.y &lt;&lt; 8)\n// pack four 8-bit elements into .b32\nd = a.x | (a.y &lt;&lt; 8)  | (a.z &lt;&lt; 16) | (a.w &lt;&lt; 24)\n// pack two 16-bit elements into .b32\nd = a.x | (a.y &lt;&lt; 16)\n// pack four 16-bit elements into .b64\nd = a.x | (a.y &lt;&lt; 16)  | (a.z &lt;&lt; 32) | (a.w &lt;&lt; 48)\n// pack two 32-bit elements into .b64\nd = a.x | (a.y &lt;&lt; 32)\n\n// unpack 8-bit elements from .b16\n{ d.x, d.y } = { a[0..7], a[8..15] }\n// unpack 8-bit elements from .b32\n{ d.x, d.y, d.z, d.w }\n        { a[0..7], a[8..15], a[16..23], a[24..31] }\n\n// unpack 16-bit elements from .b32\n{ d.x, d.y }  = { a[0..15], a[16..31] }\n// unpack 16-bit elements from .b64\n{ d.x, d.y, d.z, d.w } =\n        { a[0..15], a[16..31], a[32..47], a[48..63] }\n\n// unpack 32-bit elements from .b64\n{ d.x, d.y } = { a[0..31], a[32..63] }\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.b32 %r1,{a,b};      // a,b have type .u16\nmov.b64 {lo,hi}, %x;    // %x is a double; 
lo,hi are .u32\nmov.b32 %r1,{x,y,z,w};  // x,y,z,w have type .b8\nmov.b32 {r,g,b,a},%r1;  // r,g,b,a have type .u8\nmov.b64 {%r1, _}, %x;   // %x is.b64, %r1 is .b32\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Data Movement and Conversion Instructions: mov\n\n\n\nSet a register variable with the value of a register variable or an immediate value. Take the\n\nnon-generic address of a variable in global, local, or shared state space.\n\nSyntax\n\nmov.type  d, a;\n\nmov.type  d, sreg;\n\nmov.type  d, avar;       // get address of variable\n\nmov.type  d, avar+imm;   // get address of variable with offset\n\nmov.u32   d, fname;      // get address of device function\n\nmov.u64   d, f...\n\n=====Data Movement and Conversion Instructions: mov\n\n\n\nMove vector-to-scalar (pack) or scalar-to-vector (unpack).\n\nSyntax\n\nmov.type  d, a;\n\n.type = { .b16, .b32, .b64 };\n\nDescription\n\nWrite scalar register d with the packed value of vector register a, or write vector register\n\nd with the unpacked values from scalar register a.\n\nWhen destination operand d is a vector register, the sink symbol '_' may be used for one or\n\nmore elements provided that at least one... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov"
            };

        case "mul":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul\" target=\"_blank\" rel=\"noopener noreferrer\">mul(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-mul\" target=\"_blank\" rel=\"noopener noreferrer\">mul(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mul\" target=\"_blank\" rel=\"noopener noreferrer\">mul(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: mul</h1><section id=\"floating-point-instructions-mul\">\n\n\n<p>Multiply two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mul{.rnd}{.ftz}{.sat}.f32  d, a, b;\nmul{.rnd}.f64              d, a, b;\n\n.rnd = { .rn, .rz, .rm, .rp };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute the product of two values.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a * b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>For floating-point multiplication, all operands must be the same size.</p>\n<p>Rounding modifiers:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt><dd><p>mantissa LSB rounds to nearest even</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt><dd><p>mantissa LSB rounds towards zero</p>\n</dd>\n<dt><code class=\"docutils 
literal notranslate\"><span class=\"pre\">.rm</span></code></dt><dd><p>mantissa LSB rounds towards negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt><dd><p>mantissa LSB rounds towards positive infinity</p>\n</dd>\n</dl>\n<p>The default value of rounding modifier is <code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>. Note that a <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code> instruction with an explicit\nrounding modifier is treated conservatively by the code optimizer. A <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code> instruction with no\nrounding modifier defaults to round-to-nearest-even and may be optimized aggressively by the code\noptimizer. In particular, <code class=\"docutils literal notranslate\"><span class=\"pre\">mul/add</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mul/sub</span></code> sequences with no rounding modifiers may be\noptimized to use fused-multiply-add instructions on the target device.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>Saturation modifier:</p>\n<p><code class=\"docutils literal notranslate\"><span 
class=\"pre\">mul.sat.f32</span></code> clamps the result to [0.0, 1.0]. <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> results are flushed to <code class=\"docutils literal notranslate\"><span class=\"pre\">+0.0f</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>Rounding modifiers have the following target requirements:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt><dd><p>available for all targets</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt><dd><p>for <code class=\"docutils literal notranslate\"><span class=\"pre\">mul.f64</span></code>, requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>for <code class=\"docutils literal notranslate\"><span class=\"pre\">mul.f32</span></code>, requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n</dd>\n</dl>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mul.ftz.f32 circumf,radius,pi  // a single-precision multiply\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: mul</h1><section id=\"half-precision-floating-point-instructions-mul\">\n\n\n<p>Multiply two 
values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mul{.rnd}{.ftz}{.sat}.f16   d, a, b;\nmul{.rnd}{.ftz}{.sat}.f16x2 d, a, b;\n\nmul{.rnd}.bf16   d, a, b;\nmul{.rnd}.bf16x2 d, a, b;\n\n.rnd = { .rn };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs multiplication and writes the resulting value into a destination register.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, forms input vectors by half word values from source\noperands. Half-word operands are then multiplied in parallel to produce <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>\nresult in destination.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type. 
For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>\ninstruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type,\noperands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (type == f16 || type == bf16) {\n    d = a * b;\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    for (i = 0; i &lt; 2; i++) {\n         d[i] = fA[i] * fB[i];\n    }\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Rounding modifiers:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt><dd><p>mantissa LSB rounds to nearest even</p>\n</dd>\n</dl>\n<p>The default value of rounding modifier is <code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>. Note that a <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code> instruction with an explicit\nrounding modifier is treated conservatively by the code optimizer. 
A <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code> instruction with no\nrounding modifier defaults to round-to-nearest-even and may be optimized aggressively by the code\noptimizer. In particular, <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">add</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mul/sub</span></code> sequences with no rounding modifiers may\nbe optimized to use fused-multiply-add instructions on the target device.</p>\n<dl class=\"simple\">\n<dt>Subnormal numbers:</dt><dd><p>By default, subnormal numbers are supported.\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mul.ftz.{f16,</span> <span class=\"pre\">f16x2}</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt>Saturation modifier:</dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul.sat.{f16,</span> <span class=\"pre\">f16x2}</span></code> clamps the result to [0.0, 1.0]. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> results are flushed to <code class=\"docutils literal notranslate\"><span class=\"pre\">+0.0f</span></code>.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 4.2.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul{.rnd}.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mul{.rnd}.bf16x2</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_53</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul{.rnd}.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mul{.rnd}.bf16x2</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// scalar f16 multiplications\nmul.f16        d0, a0, b0;\nmul.rn.f16     d1, a1, b1;\nmul.bf16       bd0, ba0, bb0;\nmul.rn.bf16    bd1, ba1, bb1;\n\n// SIMD f16 multiplication\ncvt.rn.f16.f32 h0, f0;\ncvt.rn.f16.f32 h1, f1;\ncvt.rn.f16.f32 h2, f2;\ncvt.rn.f16.f32 h3, f3;\nmov.b32  p1, {h0, h1};   // pack two f16 to 32bit f16x2\nmov.b32  p2, {h2, h3};   // pack two f16 to 32bit f16x2\nmul.f16x2  p3, p1, p2;   // SIMD f16x2 multiplication\n\n// SIMD bf16 multiplication\ncvt.rn.bf16x2.f32 p4, f4, f5; // Convert two f32 into packed bf16x2\ncvt.rn.bf16x2.f32 p5, f6, f7; // Convert two f32 into packed bf16x2\nmul.bf16x2  p6, p4, p5;       // SIMD bf16x2 multiplication\n\n// SIMD fp16 multiplication\nld.global.b32   f0, [addr];     // load 32 bit which hold packed f16x2\nld.global.b32   f1, [addr + 4]; // load 32 bit which hold packed f16x2\nmul.f16x2       f2, f0, f1;     // SIMD 
f16x2 multiplication\n\n// SIMD bf16 multiplication\nld.global.b32   f3, [addr + 8];  // load 32 bit which hold packed bf16x2\nld.global.b32   f4, [addr + 12]; // load 32 bit which hold packed bf16x2\nmul.bf16x2      f5, f3, f4;      // SIMD bf16x2 multiplication\n</pre></div>\n</div>\n</section>\n<h1>Integer Arithmetic Instructions: mul</h1><section id=\"integer-arithmetic-instructions-mul\">\n\n\n<p>Multiply two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mul.mode.type  d, a, b;\n\n.mode = { .hi, .lo, .wide };\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute the product of two values.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>t = a * b;\nn = bitwidth of type;\nd = t;            // for .wide\nd = t&lt;2n-1..n&gt;;   // for .hi variant\nd = t&lt;n-1..0&gt;;    // for .lo variant\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>The type of the operation represents the types of the <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> operands. If <code class=\"docutils literal notranslate\"><span class=\"pre\">.hi</span></code> or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.lo</span></code> is specified, then <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is the same size as <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, and either the upper or lower\nhalf of the result is written to the destination register. 
If <code class=\"docutils literal notranslate\"><span class=\"pre\">.wide</span></code> is specified, then <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is\ntwice as wide as <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to receive the full result of the multiplication.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.wide</span></code> suffix is supported only for 16- and 32-bit integer types.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mul.wide.s16 fa,fxs,fys;   // 16*16 bits yields 32 bits\nmul.lo.s16 fa,fxs,fys;     // 16*16 bits, save only the low 16 bits\nmul.wide.s32 z,x,y;        // 32*32 bits, creates 64 bit result\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: mul\n\n\n\nMultiply two values.\n\nSyntax\n\nmul{.rnd}{.ftz}{.sat}.f32  d, a, b;\n\nmul{.rnd}.f64              d, a, b;\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nCompute the product of two values.\n\nSemantics\n\nd = a * b;\n\nNotes\n\nFor floating-point multiplication, all operands must be the same size.\n\nRounding modifiers:\n\n.rnmantissa LSB rounds to nearest even\n\n.rzmantissa LSB rounds towards zero\n\n.rmmantissa LSB rounds toward...\n\n=====Half Precision Floating Point Instructions: mul\n\n\n\nMultiply two values.\n\nSyntax\n\nmul{.rnd}{.ftz}{.sat}.f16   d, a, b;\n\nmul{.rnd}{.ftz}{.sat}.f16x2 d, a, b;\n\nmul{.rnd}.bf16   d, a, b;\n\nmul{.rnd}.bf16x2 d, a, b;\n\n.rnd = { .rn };\n\nDescription\n\nPerforms multiplication and writes the resulting value into a destination register.\n\nFor .f16x2 and .bf16x2 instruction type, forms input vectors by half word values from source\n\noperands. Half-word operands are then mul...\n\n=====Integer Arithmetic Instructions: mul\n\n\n\nMultiply two values.\n\nSyntax\n\nmul.mode.type  d, a, b;\n\n.mode = { .hi, .lo, .wide };\n\n.type = { .u16, .u32, .u64,\n\n          .s16, .s32, .s64 };\n\nDescription\n\nCompute the product of two values.\n\nSemantics\n\nt = a * b;\n\nn = bitwidth of type;\n\nd = t;            // for .wide\n\nd = t<2n-1..n>;   // for .hi variant\n\nd = t<n-1..0>;    // for .lo variant\n\nNotes\n\nThe type of the operation represents the types of the a and ... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-mul"
            };

        case "mul24":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mul24\" target=\"_blank\" rel=\"noopener noreferrer\">mul24(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: mul24</h1><section id=\"integer-arithmetic-instructions-mul24\">\n\n\n<p>Multiply two 24-bit integer values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mul24.mode.type  d, a, b;\n\n.mode = { .hi, .lo };\n.type = { .u32, .s32 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute the product of two 24-bit integer values held in 32-bit source registers, and return either\nthe high or low 32-bits of the 48-bit result.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>t = a * b;\nd = t&lt;47..16&gt;;    // for .hi variant\nd = t&lt;31..0&gt;;     // for .lo variant\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Integer multiplication yields a result that is twice the size of the input operands, i.e., 48-bits.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul24.hi</span></code> performs a 24x24-bit multiply and returns the high 32 bits of the 48-bit result.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul24.lo</span></code> performs a 24x24-bit multiply and returns the low 32 bits of the 48-bit result.</p>\n<p>All operands are of the same type and size.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">mul24.hi</span></code> may be less efficient on machines without hardware support for 24-bit multiply.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported 
on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mul24.lo.s32 d,a,b;   // low 32-bits of 24x24-bit signed multiply.\n</pre></div>\n</div>\n</section>",
                "tooltip": "Multiply two 24-bit integer values.\n\nSyntax\n\nmul24.mode.type  d, a, b;\n\n.mode = { .hi, .lo };\n\n.type = { .u32, .s32 };\n\nDescription\n\nCompute the product of two 24-bit integer values held in 32-bit source registers, and return either\n\nthe high or low 32-bits of the 48-bit result.\n\nSemantics\n\nt = a * b;\n\nd = t<47..16>;    // for .hi variant\n\nd = t<31..0>;     // for .lo variant\n\nNotes\n\nInteger multiplication yields a result that is twice the size of the input operands, i.e., 48-bits.\n\nmul24.hi performs a 24x24-bit multiply and returns the high 32 bits of the 48-bit result.\n\nmul24.lo performs a 24x24-bit multiply and returns the low 32 bits of the 48-bit result.\n\nAll operands are of the same type and size.\n\nmul24.hi may be less efficient on machines without hardware support for 24-bit multiply.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmul24.lo.s32 d,a,b;   // low 32-bits of 24x24-bit signed multiply.\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-mul24"
            };

        case "multimem":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red\" target=\"_blank\" rel=\"noopener noreferrer\">multimem.ld_reduce <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red\" target=\"_blank\" rel=\"noopener noreferrer\">multimem.red <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red\" target=\"_blank\" rel=\"noopener noreferrer\">multimem.st <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red</h1><section id=\"data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red\">\n<span id=\"data-movement-and-conversion-instructions-multimem\"></span>\n\n<p>Multimem addresses can only be accessed only by multimem.* operations. 
Accessing a multimem address\nwith <code class=\"docutils literal notranslate\"><span class=\"pre\">ld</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">st</span></code> or any other memory operations results in undefined behavior.</p>\n<p>Refer to <em>CUDA programming guide</em> for creation and management of the multimem addresses.</p>\n<p><strong>multimem.ld_reduce, multimem.st, multimem.red</strong></p>\n<p>Perform memory operations on the multimem address.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// Integer type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op.type      d, [a];\nmultimem.st{.stsem}{.scope}{.ss}.type                [a], b;\nmultimem.red{.redsem}{.scope}{.ss}.op.type           [a], b;\n\n.ss =       { .global }\n.ldsem =    { .weak, .relaxed, .acquire }\n.stsem =    { .weak, .relaxed, .release }\n.redsem =   { .relaxed, .release }\n.scope =    { .cta, .cluster, .gpu, .sys }\n.op  =      { .min, .max, .add, .and, .or, .xor }\n.type =     { .b32, .b64,  .u32, .u64, .s32, .s64 }\n\n// Floating point type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op{.vec}.type    d, [a];\nmultimem.st{.stsem}{.scope}{.ss}{.vec}.type              [a], b;\nmultimem.red{.redsem}{.scope}{.ss}.redop{.vec}.type      [a], b;\n\n.ss =       { .global }\n.ldsem =    { .weak, .relaxed, .acquire }\n.stsem =    { .weak, .relaxed, .release }\n.redsem =   { .relaxed, .release }\n.scope =    { .cta, .cluster, .gpu, .sys }\n.op  =      { .min, .max, .add }\n.redop  =   { .add }\n.vec =      { .v2, .v4, .v8 }\n.type=      { .f16, .f16x2, .bf16, .bf16x2, .f32, .f64 }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> performs the following operations:</p>\n<ul class=\"simple\">\n<li><p>load operation on the multimem address <code class=\"docutils literal 
notranslate\"><span class=\"pre\">a</span></code>, which involves loading of data from all of the\nmultiple memory locations pointed to by the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>,</p></li>\n<li><p>reduction operation specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.op</span></code> on the multiple data loaded from the multimem address\n<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p></li>\n</ul>\n<p>The result of the reduction operation in returned in register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> performs a store operation of the input operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to all the memory\nlocations pointed to by the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> performs a reduction operation on all the memory locations pointed to\nby the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, with operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> performs reduction on the values loaded from all the memory\nlocations that the multimem address points to. In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> perform reduction\non all the memory locations that the multimem address points to.</p>\n<p>Address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> must be a multimem address. 
Otherwise, the behavior is undefined.  Supported\naddressing modes for operand a and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> does not fall within the address window of <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state\nspace then the behavior is undefined.</p>\n<p>For floating-point type multi- operations, the size of the specified type along with <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> must\nequal either 32-bits or 64-bits or 128-bits. No other combinations of <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> and type are\nallowed. Type <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> cannot be used with <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> qualifier.</p>\n<p>The following table describes the valid combinations of <code class=\"docutils literal notranslate\"><span class=\"pre\">.op</span></code> and base type:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 38%\"/>\n<col style=\"width: 62%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>op</p></th>\n<th class=\"head\"><p>Base type</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n<td><div class=\"line-block\">\n<div class=\"line\"><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code 
class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code></div>\n<div class=\"line\"><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code></div>\n<div class=\"line\"><code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code></div>\n</div>\n</td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code></p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><div class=\"line-block\">\n<div class=\"line\"><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s644</span></code></div>\n<div class=\"line\"><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal 
notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code></div>\n</div>\n</td>\n</tr>\n</tbody>\n</table>\n<p>Optional qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.ldsem</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.stsem</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.redsem</span></code> specify the memory synchronizing effect\nof the <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> respectively, as described in\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. If explicit semantics qualifiers\nare not specified, then <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> default to <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code>.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier specifies the set of threads that can directly observe the memory\nsynchronizing effect of this operation, as described in <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. 
If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not specified for\n<code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> then <code class=\"docutils literal notranslate\"><span class=\"pre\">.sys</span></code> scope is assumed by default.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>multimem.ld_reduce.and.b32                    val1_b32, [addr1];\nmultimem.ld_reduce.acquire.gpu.global.add.u32 val2_u32, [addr2];\n\nmultimem.st.relaxed.gpu.b32                [addr3], val3_b32;\nmultimem.st.release.cta.global.u32         [addr4], val4_u32;\n\nmultimem.red.relaxed.gpu.max.f64           [addr5], val5_f64;\nmultimem.red.release.cta.global.add.v4.f32 [addr6], {val6, val7, val8, val9};\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red</h1><section id=\"data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red\">\n<span id=\"data-movement-and-conversion-instructions-multimem\"></span>\n\n<p>Multimem addresses can only be accessed only by multimem.* operations. 
Accessing a multimem address\nwith <code class=\"docutils literal notranslate\"><span class=\"pre\">ld</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">st</span></code> or any other memory operations results in undefined behavior.</p>\n<p>Refer to <em>CUDA programming guide</em> for creation and management of the multimem addresses.</p>\n<p><strong>multimem.ld_reduce, multimem.st, multimem.red</strong></p>\n<p>Perform memory operations on the multimem address.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// Integer type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op.type      d, [a];\nmultimem.st{.stsem}{.scope}{.ss}.type                [a], b;\nmultimem.red{.redsem}{.scope}{.ss}.op.type           [a], b;\n\n.ss =       { .global }\n.ldsem =    { .weak, .relaxed, .acquire }\n.stsem =    { .weak, .relaxed, .release }\n.redsem =   { .relaxed, .release }\n.scope =    { .cta, .cluster, .gpu, .sys }\n.op  =      { .min, .max, .add, .and, .or, .xor }\n.type =     { .b32, .b64,  .u32, .u64, .s32, .s64 }\n\n// Floating point type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op{.vec}.type    d, [a];\nmultimem.st{.stsem}{.scope}{.ss}{.vec}.type              [a], b;\nmultimem.red{.redsem}{.scope}{.ss}.redop{.vec}.type      [a], b;\n\n.ss =       { .global }\n.ldsem =    { .weak, .relaxed, .acquire }\n.stsem =    { .weak, .relaxed, .release }\n.redsem =   { .relaxed, .release }\n.scope =    { .cta, .cluster, .gpu, .sys }\n.op  =      { .min, .max, .add }\n.redop  =   { .add }\n.vec =      { .v2, .v4, .v8 }\n.type=      { .f16, .f16x2, .bf16, .bf16x2, .f32, .f64 }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> performs the following operations:</p>\n<ul class=\"simple\">\n<li><p>load operation on the multimem address <code class=\"docutils literal 
notranslate\"><span class=\"pre\">a</span></code>, which involves loading of data from all of the\nmultiple memory locations pointed to by the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>,</p></li>\n<li><p>reduction operation specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.op</span></code> on the multiple data loaded from the multimem address\n<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p></li>\n</ul>\n<p>The result of the reduction operation in returned in register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> performs a store operation of the input operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to all the memory\nlocations pointed to by the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> performs a reduction operation on all the memory locations pointed to\nby the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, with operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> performs reduction on the values loaded from all the memory\nlocations that the multimem address points to. In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> perform reduction\non all the memory locations that the multimem address points to.</p>\n<p>Address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> must be a multimem address. 
Otherwise, the behavior is undefined.  Supported\naddressing modes for operand a and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> does not fall within the address window of <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state\nspace then the behavior is undefined.</p>\n<p>For floating-point type multi- operations, the size of the specified type along with <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> must\nequal either 32-bits or 64-bits or 128-bits. No other combinations of <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> and type are\nallowed. Type <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> cannot be used with <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> qualifier.</p>\n<p>The following table describes the valid combinations of <code class=\"docutils literal notranslate\"><span class=\"pre\">.op</span></code> and base type:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 38%\"/>\n<col style=\"width: 62%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>op</p></th>\n<th class=\"head\"><p>Base type</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n<td><div class=\"line-block\">\n<div class=\"line\"><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code 
class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code></div>\n<div class=\"line\"><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code></div>\n<div class=\"line\"><code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code></div>\n</div>\n</td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code></p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><div class=\"line-block\">\n<div class=\"line\"><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s644</span></code></div>\n<div class=\"line\"><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal 
notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code></div>\n</div>\n</td>\n</tr>\n</tbody>\n</table>\n<p>Optional qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.ldsem</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.stsem</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.redsem</span></code> specify the memory synchronizing effect\nof the <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> respectively, as described in\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. If explicit semantics qualifiers\nare not specified, then <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> default to <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code>.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier specifies the set of threads that can directly observe the memory\nsynchronizing effect of this operation, as described in <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. 
If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not specified for\n<code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> then <code class=\"docutils literal notranslate\"><span class=\"pre\">.sys</span></code> scope is assumed by default.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>multimem.ld_reduce.and.b32                    val1_b32, [addr1];\nmultimem.ld_reduce.acquire.gpu.global.add.u32 val2_u32, [addr2];\n\nmultimem.st.relaxed.gpu.b32                [addr3], val3_b32;\nmultimem.st.release.cta.global.u32         [addr4], val4_u32;\n\nmultimem.red.relaxed.gpu.max.f64           [addr5], val5_f64;\nmultimem.red.release.cta.global.add.v4.f32 [addr6], {val6, val7, val8, val9};\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: multimem.ld_reduce, multimem.st, multimem.red</h1><section id=\"data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red\">\n<span id=\"data-movement-and-conversion-instructions-multimem\"></span>\n\n<p>Multimem addresses can only be accessed only by multimem.* operations. 
Accessing a multimem address\nwith <code class=\"docutils literal notranslate\"><span class=\"pre\">ld</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">st</span></code> or any other memory operations results in undefined behavior.</p>\n<p>Refer to <em>CUDA programming guide</em> for creation and management of the multimem addresses.</p>\n<p><strong>multimem.ld_reduce, multimem.st, multimem.red</strong></p>\n<p>Perform memory operations on the multimem address.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// Integer type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op.type      d, [a];\nmultimem.st{.stsem}{.scope}{.ss}.type                [a], b;\nmultimem.red{.redsem}{.scope}{.ss}.op.type           [a], b;\n\n.ss =       { .global }\n.ldsem =    { .weak, .relaxed, .acquire }\n.stsem =    { .weak, .relaxed, .release }\n.redsem =   { .relaxed, .release }\n.scope =    { .cta, .cluster, .gpu, .sys }\n.op  =      { .min, .max, .add, .and, .or, .xor }\n.type =     { .b32, .b64,  .u32, .u64, .s32, .s64 }\n\n// Floating point type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op{.vec}.type    d, [a];\nmultimem.st{.stsem}{.scope}{.ss}{.vec}.type              [a], b;\nmultimem.red{.redsem}{.scope}{.ss}.redop{.vec}.type      [a], b;\n\n.ss =       { .global }\n.ldsem =    { .weak, .relaxed, .acquire }\n.stsem =    { .weak, .relaxed, .release }\n.redsem =   { .relaxed, .release }\n.scope =    { .cta, .cluster, .gpu, .sys }\n.op  =      { .min, .max, .add }\n.redop  =   { .add }\n.vec =      { .v2, .v4, .v8 }\n.type=      { .f16, .f16x2, .bf16, .bf16x2, .f32, .f64 }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> performs the following operations:</p>\n<ul class=\"simple\">\n<li><p>load operation on the multimem address <code class=\"docutils literal 
notranslate\"><span class=\"pre\">a</span></code>, which involves loading of data from all of the\nmultiple memory locations pointed to by the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>,</p></li>\n<li><p>reduction operation specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.op</span></code> on the multiple data loaded from the multimem address\n<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p></li>\n</ul>\n<p>The result of the reduction operation in returned in register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> performs a store operation of the input operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to all the memory\nlocations pointed to by the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> performs a reduction operation on all the memory locations pointed to\nby the multimem address <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, with operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>.</p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> performs reduction on the values loaded from all the memory\nlocations that the multimem address points to. In contrast, the <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> perform reduction\non all the memory locations that the multimem address points to.</p>\n<p>Address operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> must be a multimem address. 
Otherwise, the behavior is undefined.  Supported\naddressing modes for operand a and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>.</p>\n<p>If no state space is specified then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is\nused. If the address specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> does not fall within the address window of <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state\nspace then the behavior is undefined.</p>\n<p>For floating-point type multi- operations, the size of the specified type along with <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> must\nequal either 32-bits or 64-bits or 128-bits. No other combinations of <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> and type are\nallowed. Type <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> cannot be used with <code class=\"docutils literal notranslate\"><span class=\"pre\">.vec</span></code> qualifier.</p>\n<p>The following table describes the valid combinations of <code class=\"docutils literal notranslate\"><span class=\"pre\">.op</span></code> and base type:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 38%\"/>\n<col style=\"width: 62%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>op</p></th>\n<th class=\"head\"><p>Base type</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n<td><div class=\"line-block\">\n<div class=\"line\"><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code 
class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code></div>\n<div class=\"line\"><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code></div>\n<div class=\"line\"><code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code></div>\n</div>\n</td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code></p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><div class=\"line-block\">\n<div class=\"line\"><code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s644</span></code></div>\n<div class=\"line\"><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, <code class=\"docutils literal 
notranslate\"><span class=\"pre\">.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code></div>\n</div>\n</td>\n</tr>\n</tbody>\n</table>\n<p>Optional qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.ldsem</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.stsem</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.redsem</span></code> specify the memory synchronizing effect\nof the <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> respectively, as described in\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. If explicit semantics qualifiers\nare not specified, then <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.ld_reduce</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.st</span></code> default to <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code>.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier specifies the set of threads that can directly observe the memory\nsynchronizing effect of this operation, as described in <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. 
If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is not specified for\n<code class=\"docutils literal notranslate\"><span class=\"pre\">multimem.red</span></code> then <code class=\"docutils literal notranslate\"><span class=\"pre\">.sys</span></code> scope is assumed by default.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>multimem.ld_reduce.and.b32                    val1_b32, [addr1];\nmultimem.ld_reduce.acquire.gpu.global.add.u32 val2_u32, [addr2];\n\nmultimem.st.relaxed.gpu.b32                [addr3], val3_b32;\nmultimem.st.release.cta.global.u32         [addr4], val4_u32;\n\nmultimem.red.relaxed.gpu.max.f64           [addr5], val5_f64;\nmultimem.red.release.cta.global.add.v4.f32 [addr6], {val6, val7, val8, val9};\n</pre></div>\n</div>\n</section>",
                "tooltip": "locations which the multimem address points to.\n\nMultimem addresses can only be accessed only by multimem.* operations. Accessing a multimem address\n\nwith ld, st or any other memory operations results in undefined behavior.\n\nRefer to CUDA programming guide for creation and management of the multimem addresses.\n\nmultimem.ld_reduce, multimem.st, multimem.red\n\nPerform memory operations on the multimem address.\n\nSyntax\n\n// Integer type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op.type      d, [a];\n\nmultimem.st{.stsem}{.scope}{.ss}.type                [a], b;\n\nmultimem.red{.redsem}{.scope}{.ss}.op.type           [a], b;\n\n.ss =       { .global }\n\n.ldsem =    { .weak, .relaxed, .acquire }\n\n.stsem =    { .weak, .relaxed, .release }\n\n.redsem =   { .relaxed, .release }\n\n.scope =    { .cta, .cluster, .gpu, .sys }\n\n.op  =      { .min, .max, .add, .and, .or, .xor }\n\n.type =     { .b32, .b64,  .u32, .u64, .s32, .s64 }\n\n// Floating point type:\n\nmultimem.ld_reduce{.ldsem}{.scope}{.ss}.op{.vec}.type    d, [a];\n\nmultimem.st{.stsem}{.scope}{.ss}{.vec}.type              [a], b;\n\nmultimem.red{.redsem}{.scope}{.ss}.redop{.vec}.type      [a], b;\n\n.ss =       { .global }\n\n.ldsem =    { .weak, .relaxed, .acquire }\n\n.stsem =    { .weak, .relaxed, .release }\n\n.redsem =   { .relaxed, .release }\n\n.scope =    { .cta, .cluster, .gpu, .sys }\n\n.op  =      { .min, .max, .add }\n\n.redop  =   { .add }\n\n.vec =      { .v2, .v4, .v8 }\n\n.type=      { .f16, .f16x2, .bf16, .bf16x2, .f32, .f64 }\n\nDescription\n\nInstruction multimem.ld_reduce performs the following operations:\n\nload operation on the multimem address a, which involves loading of data from all of the\n\nmultiple memory locations pointed to by the multimem address a,\n\nreduction operation specified by .op on the multiple data loaded from the multimem address\n\na.\n\nThe result of the reduction operation in returned in register d.\n\nInstruction 
multimem.st performs a store operation of the input operand b to all the memory\n\nlocations pointed to by the multimem address a.\n\nInstruction multimem.red performs a reduction operation on all the memory locations pointed to\n\nby the multimem address a, with operand b.\n\nInstruction multimem.ld_reduce performs reduction on the values loaded from all the memory\n\nlocations that the multimem address points to. In contrast, the multimem.red perform reduction\n\non all the memory locations that the multimem address points to.\n\nAddress operand a must be a multimem address. Otherwise, the behavior is undefined.  Supported\n\naddressing modes for operand a and alignment requirements are described in Addresses as Operands.\n\nIf no state space is specified then Generic Addressing is\n\nused. If the address specified by a does not fall within the address window of .global state\n\nspace then the behavior is undefined.\n\nFor floating-point type multi- operations, the size of the specified type along with .vec must\n\nequal either 32-bits or 64-bits or 128-bits. No other combinations of .vec and type are\n\nallowed. Type .f64 cannot be used with .vec qualifier.\n\nThe following table describes the valid combinations of .op and base type:\n\n\n\nop\n\nBase type\n\n.add\n\n.u32, .u64, .s32\n\n.f16, .f16x2, .bf16, .bf16x2\n\n.f32, .f64\n\n.and, .or, .xor\n\n.b32, .b64\n\n.min, .max\n\n.u32, .s32, .u64, .s644\n\n.f16, .f16x2, .bf16, .bf16x2\n\n\n\nOptional qualifiers .ldsem, .stsem and .redsem specify the memory synchronizing effect\n\nof the multimem.ld_reduce, multimem.st and multimem.red respectively, as described in\n\nMemory Consistency Model. 
If explicit semantics qualifiers\n\nare not specified, then multimem.ld_reduce and multimem.st default to .weak and\n\nmultimem.red defaults to .relaxed.\n\nThe optional .scope qualifier specifies the set of threads that can directly observe the memory\n\nsynchronizing effect of this operation, as described in Memory Consistency Model. If the .scope qualifier is not specified for\n\nmultimem.red then .sys scope is assumed by default.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 8.1.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\nmultimem.ld_reduce.and.b32                    va ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red"
            };

        case "nanosleep":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep\" target=\"_blank\" rel=\"noopener noreferrer\">nanosleep <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Miscellaneous Instructions: nanosleep</h1><section id=\"miscellaneous-instructions-nanosleep\">\n\n\n<p>Suspend the thread for an approximate delay given in nanoseconds.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>nanosleep.u32 t;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Suspends the thread for a sleep duration approximately close to the delay <code class=\"docutils literal notranslate\"><span class=\"pre\">t</span></code>, specified in\nnanoseconds. <code class=\"docutils literal notranslate\"><span class=\"pre\">t</span></code> may be a register or an immediate value.</p>\n<p>The sleep duration is approximated, but guaranteed to be in the interval <code class=\"docutils literal notranslate\"><span class=\"pre\">[0,</span> <span class=\"pre\">2*t]</span></code>. The maximum\nsleep duration is 1 millisecond. 
The implementation may reduce the sleep duration for individual\nthreads within a warp such that all sleeping threads in the warp wake up together.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">nanosleep</span></code> introduced in PTX ISA 6.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">nanosleep</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 r;\n.reg .pred p;\n\nnanosleep.u32 r;\nnanosleep.u32 42;\n@p nanosleep.u32 r;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Suspend the thread for an approximate delay given in nanoseconds.\n\nSyntax\n\nnanosleep.u32 t;\n\nDescription\n\nSuspends the thread for a sleep duration approximately close to the delay t, specified in\n\nnanoseconds. t may be a register or an immediate value.\n\nThe sleep duration is approximated, but guaranteed to be in the interval [0, 2*t]. The maximum\n\nsleep duration is 1 millisecond. The implementation may reduce the sleep duration for individual\n\nthreads within a warp such that all sleeping threads in the warp wake up together.\n\nPTX ISA Notes\n\nnanosleep introduced in PTX ISA 6.3.\n\nTarget ISA Notes\n\nnanosleep requires sm_70 or higher.\n\nExamples\n\n.reg .b32 r;\n\n.reg .pred p;\n\nnanosleep.u32 r;\n\nnanosleep.u32 42;\n\n@p nanosleep.u32 r;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep"
            };

        case "nclusterid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nclusterid\" target=\"_blank\" rel=\"noopener noreferrer\">nclusterid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %nclusterid</h1><section id=\"special-registers-nclusterid\">\n\n\n<p>Number of cluster identifiers per grid.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .v4 .u32 %nclusterid;\n.sreg .u32 %nclusterid.x, %nclusterid.y, %nclusterid.z;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the number of clusters in each grid\ndimension.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">%nclusterid</span></code> special register contains a 3D grid shape vector that holds the grid dimensions\nin terms of clusters. The fourth element is unused and always returns zero.</p>\n<p>Refer to the <em>Cuda Programming Guide</em> for details on the maximum values of <code class=\"docutils literal notranslate\"><span class=\"pre\">%nclusterid.{x,y,z}</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 %r&lt;2&gt;;\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %nclusterid.x;\nmov.u32     %r1, %nclusterid.z;\nmov.v4.u32  %rx, %nclusterid;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Number of cluster identifiers per grid.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %nclusterid;\n\n.sreg .u32 %nclusterid.x, %nclusterid.y, %nclusterid.z;\n\nDescription\n\nA predefined, read-only special register initialized with the number of clusters in each grid\n\ndimension.\n\nThe %nclusterid special register contains a 3D grid shape vector that holds the grid dimensions\n\nin terms of clusters. The fourth element is unused and always returns zero.\n\nRefer to the Cuda Programming Guide for details on the maximum values of %nclusterid.{x,y,z}.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.reg .b32 %r<2>;\n\n.reg .v4 .b32 %rx;\n\nmov.u32     %r0, %nclusterid.x;\n\nmov.u32     %r1, %nclusterid.z;\n\nmov.v4.u32  %rx, %nclusterid;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nclusterid"
            };

        case "nctaid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nctaid\" target=\"_blank\" rel=\"noopener noreferrer\">nctaid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %nctaid</h1><section id=\"special-registers-nctaid\">\n\n\n<p>Number of CTA ids per grid.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .v4 .u32 %nctaid                      // Grid shape vector\n.sreg .u32 %nctaid.x,%nctaid.y,%nctaid.z;   // Grid dimensions\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the number of CTAs in each grid\ndimension. The <code class=\"docutils literal notranslate\"><span class=\"pre\">%nctaid</span></code> special register contains a 3D grid shape vector, with each element\nhaving a value of at least <code class=\"docutils literal notranslate\"><span class=\"pre\">1</span></code>. 
The fourth element is unused and always returns zero.</p>\n<p>Maximum values of %nctaid.{x,y,z} are as follows:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 54%\"/>\n<col style=\"width: 20%\"/>\n<col style=\"width: 13%\"/>\n<col style=\"width: 13%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>.target architecture</p></th>\n<th class=\"head\"><p>%nctaid.x</p></th>\n<th class=\"head\"><p>%nctaid.y</p></th>\n<th class=\"head\"><p>%nctaid.z</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code></p></td>\n<td><p>65535</p></td>\n<td><p>65535</p></td>\n<td><p>65535</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_3x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_5x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_6x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_7x</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">sm_8x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_9x</span></code></p></td>\n<td><p>2<sup>31</sup> -1</p></td>\n<td><p>65535</p></td>\n<td><p>65535</p></td>\n</tr>\n</tbody>\n</table>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0 with type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.u16</span></code>.</p>\n<p>Redefined as type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.u32</span></code> in PTX ISA version 2.0. 
For compatibility with legacy PTX code, 16-bit\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mov</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> instructions may be used to read the lower 16-bits of each component of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">%nctaid</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32  %r0,%nctaid.x;\nmov.u16  %rh,%nctaid.x;     // legacy code\n</pre></div>\n</div>\n</section>",
                "tooltip": "Number of CTA ids per grid.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %nctaid                      // Grid shape vector\n\n.sreg .u32 %nctaid.x,%nctaid.y,%nctaid.z;   // Grid dimensions\n\nDescription\n\nA predefined, read-only special register initialized with the number of CTAs in each grid\n\ndimension. The %nctaid special register contains a 3D grid shape vector, with each element\n\nhaving a value of at least 1. The fourth element is unused and always returns zero.\n\nMaximum values of %nctaid.{x,y,z} are as follows:\n\n\n\n\n\n.target architecture\n\n%nctaid.x\n\n%nctaid.y\n\n%nctaid.z\n\nsm_1x, sm_20\n\n65535\n\n65535\n\n65535\n\nsm_3x, sm_5x, sm_6x, sm_7x,\n\nsm_8x, sm_9x\n\n231 -1\n\n65535\n\n65535\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0 with type .v4.u16.\n\nRedefined as type .v4.u32 in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\n\nmov and cvt instructions may be used to read the lower 16-bits of each component of\n\n%nctaid.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u32  %r0,%nctaid.x;\n\nmov.u16  %rh,%nctaid.x;     // legacy code\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nctaid"
            };

        case "neg":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg\" target=\"_blank\" rel=\"noopener noreferrer\">neg(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-neg\" target=\"_blank\" rel=\"noopener noreferrer\">neg(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-neg\" target=\"_blank\" rel=\"noopener noreferrer\">neg(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: neg</h1><section id=\"floating-point-instructions-neg\">\n\n\n<p>Arithmetic negate.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>neg{.ftz}.f32  d, a;\nneg.f64        d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Negate the sign of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and store the result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = -a;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">neg.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving 
zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">neg.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">neg.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> inputs yield an unspecified <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>. Future implementations may comply with the IEEE 754\nstandard by preserving payload and modifying only the sign bit.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">neg.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">neg.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>neg.ftz.f32  x,f0;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: neg</h1><section id=\"half-precision-floating-point-instructions-neg\">\n\n\n<p>Arithmetic negate.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>neg{.ftz}.f16    d, a;\nneg{.ftz}.f16x2  d, a;\nneg.bf16         d, a;\nneg.bf16x2       d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Negate the sign of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and store the result in <code class=\"docutils literal notranslate\"><span 
class=\"pre\">d</span></code>.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, forms input vector by extracting half word values\nfrom the source operand. Half-word operands are then negated in parallel to produce <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> result in destination.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type. For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code> instruction\ntype, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. 
For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>\nand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (type == f16 || type == bf16) {\n    d = -a;\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    for (i = 0; i &lt; 2; i++) {\n         d[i] = -fA[i];\n    }\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<dl class=\"simple\">\n<dt>Subnormal numbers:</dt><dd><p>By default, subnormal numbers are supported.\n<code class=\"docutils literal notranslate\"><span class=\"pre\">neg.ftz.{f16,</span> <span class=\"pre\">f16x2}</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> inputs yield an unspecified <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>. 
Future implementations may comply with the IEEE 754\nstandard by preserving payload and modifying only the sign bit.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">neg.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">neg.bf16x2</span></code> introduced in PTX ISA 7.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_53</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">neg.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">neg.bf16x2</span></code> requires architecture <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>neg.ftz.f16  x,f0;\nneg.bf16     x,b0;\nneg.bf16x2   x1,b1;\n</pre></div>\n</div>\n</section>\n<h1>Integer Arithmetic Instructions: neg</h1><section id=\"integer-arithmetic-instructions-neg\">\n\n\n<p>Arithmetic negate.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>neg.type  d, a;\n\n.type = { .s16, .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Negate the sign of <strong>a</strong> and store the result in <strong>d</strong>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = -a;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Only for signed integers.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text 
notranslate\"><div class=\"highlight\"><pre><span></span>neg.s32  r0,a;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: neg\n\n\n\nArithmetic negate.\n\nSyntax\n\nneg{.ftz}.f32  d, a;\n\nneg.f64        d, a;\n\nDescription\n\nNegate the sign of a and store the result in d.\n\nSemantics\n\nd = -a;\n\nNotes\n\nSubnormal numbers:\n\nsm_20+By default, subnormal numbers are supported.\n\nneg.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nsm_1xneg.f64 supports subnormal numbers.\n\nneg.f32 flushes subnormal inputs and results to sign-preserving...\n\n=====Half Precision Floating Point Instructions: neg\n\n\n\nArithmetic negate.\n\nSyntax\n\nneg{.ftz}.f16    d, a;\n\nneg{.ftz}.f16x2  d, a;\n\nneg.bf16         d, a;\n\nneg.bf16x2       d, a;\n\nDescription\n\nNegate the sign of a and store the result in d.\n\nFor .f16x2 and .bf16x2 instruction type, forms input vector by extracting half word values\n\nfrom the source operand. Half-word operands are then negated in parallel to produce .f16x2 or\n\n.bf16x2 result in destination.\n\nFor .f...\n\n=====Integer Arithmetic Instructions: neg\n\n\n\nArithmetic negate.\n\nSyntax\n\nneg.type  d, a;\n\n.type = { .s16, .s32, .s64 };\n\nDescription\n\nNegate the sign of a and store the result in d.\n\nSemantics\n\nd = -a;\n\nNotes\n\nOnly for signed integers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nneg.s32  r0,a;\n\n... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg"
            };

        case "noreturn":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-noreturn\" target=\"_blank\" rel=\"noopener noreferrer\">noreturn <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Performance-Tuning Directives: .noreturn</h1><section id=\"performance-tuning-directives-noreturn\">\n\n\n<p>Indicate that the function does not return to its caller function.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.noreturn\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Indicate that the function does not return to its caller function.</p>\n<p><strong>Semantics</strong></p>\n<p>An optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive indicates that the function does not return to caller\nfunction. <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive can only be specified on device functions and must appear between\na <code class=\"docutils literal notranslate\"><span class=\"pre\">.func</span></code> directive and its body.</p>\n<p>The directive cannot be specified on functions which have return parameters.</p>\n<p>If a function with <code class=\"docutils literal notranslate\"><span class=\"pre\">.noreturn</span></code> directive returns to the caller function at runtime, then the\nbehavior is undefined.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.4.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.func foo .noreturn { ... }\n</pre></div>\n</div>\n</section>",
                "tooltip": "Indicate that the function does not return to its caller function.\n\nSyntax\n\n.noreturn\n\nDescription\n\nIndicate that the function does not return to its caller function.\n\nSemantics\n\nAn optional .noreturn directive indicates that the function does not return to caller\n\nfunction. .noreturn directive can only be specified on device functions and must appear between\n\na .func directive and its body.\n\nThe directive cannot be specified on functions which have return parameters.\n\nIf a function with .noreturn directive returns to the caller function at runtime, then the\n\nbehavior is undefined.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.4.\n\nTarget ISA Notes\n\nRequires sm_30 or higher.\n\nExamples\n\n.func foo .noreturn { ... }\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-noreturn"
            };

        case "not":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not\" target=\"_blank\" rel=\"noopener noreferrer\">not <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Logic and Shift Instructions: not</h1><section id=\"logic-and-shift-instructions-not\">\n\n\n<p>Bitwise negation; one\u2019s complement.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>not.type d, a;\n\n.type = { .pred, .b16, .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Invert the bits in <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = ~a;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>The size of the operands must match, but not necessarily the type.</p>\n<p>Allowed types include predicates.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>not.b32  mask,mask;\nnot.pred  p,q;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Bitwise negation; one\u2019s complement.\n\nSyntax\n\nnot.type d, a;\n\n.type = { .pred, .b16, .b32, .b64 };\n\nDescription\n\nInvert the bits in a.\n\nSemantics\n\nd = ~a;\n\nNotes\n\nThe size of the operands must match, but not necessarily the type.\n\nAllowed types include predicates.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nnot.b32  mask,mask;\n\nnot.pred  p,q;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-not"
            };

        case "nsmid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nsmid\" target=\"_blank\" rel=\"noopener noreferrer\">nsmid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %nsmid</h1><section id=\"special-registers-nsmid\">\n\n\n<p>Number of SM identifiers.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %nsmid;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register that returns the maximum number of SM identifiers. The SM\nidentifier numbering is not guaranteed to be contiguous, so <code class=\"docutils literal notranslate\"><span class=\"pre\">%nsmid</span></code> may be larger than the\nphysical number of SMs in the device.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%nsmid</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32  %r, %nsmid;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Number of SM identifiers.\n\nSyntax (predefined)\n\n.sreg .u32 %nsmid;\n\nDescription\n\nA predefined, read-only special register that returns the maximum number of SM identifiers. The SM\n\nidentifier numbering is not guaranteed to be contiguous, so %nsmid may be larger than the\n\nphysical number of SMs in the device.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%nsmid requires sm_20 or higher.\n\nExamples\n\nmov.u32  %r, %nsmid;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nsmid"
            };

        case "ntid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-ntid\" target=\"_blank\" rel=\"noopener noreferrer\">ntid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %ntid</h1><section id=\"special-registers-ntid\">\n\n\n<p>Number of thread IDs per CTA.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .v4 .u32 %ntid;                   // CTA shape vector\n.sreg .u32 %ntid.x, %ntid.y, %ntid.z;   // CTA dimensions\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with the number of thread ids in each CTA\ndimension. The <code class=\"docutils literal notranslate\"><span class=\"pre\">%ntid</span></code> special register contains a 3D CTA shape vector that holds the CTA\ndimensions. CTA dimensions are non-zero; the fourth element is unused and always returns zero. 
The\ntotal number of threads in a CTA is <code class=\"docutils literal notranslate\"><span class=\"pre\">(%ntid.x</span> <span class=\"pre\">*</span> <span class=\"pre\">%ntid.y</span> <span class=\"pre\">*</span> <span class=\"pre\">%ntid.z)</span></code>.</p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>%ntid.y == %ntid.z == 1 in 1D CTAs.\n%ntid.z ==1 in 2D CTAs.\n</pre></div>\n</div>\n<p>Maximum values of %ntid.{x,y,z} are as follows:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 63%\"/>\n<col style=\"width: 12%\"/>\n<col style=\"width: 12%\"/>\n<col style=\"width: 12%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>.target architecture</p></th>\n<th class=\"head\"><p>%ntid.x</p></th>\n<th class=\"head\"><p>%ntid.y</p></th>\n<th class=\"head\"><p>%ntid.z</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></p></td>\n<td><p>512</p></td>\n<td><p>512</p></td>\n<td><p>64</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_3x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_5x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_6x</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">sm_7x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_8x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_9x</span></code></p></td>\n<td><p>1024</p></td>\n<td><p>1024</p></td>\n<td><p>64</p></td>\n</tr>\n</tbody>\n</table>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0 with type <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.v4.u16</span></code>.</p>\n<p>Redefined as type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.u32</span></code> in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mov</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> instructions may be used to read the lower 16-bits of each component of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">%ntid</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// compute unified thread id for 2D CTA\nmov.u32  %r0,%tid.x;\nmov.u32  %h1,%tid.y;\nmov.u32  %h2,%ntid.x;\nmad.u32  %r0,%h1,%h2,%r0;\n\nmov.u16  %rh,%ntid.x;      // legacy code\n</pre></div>\n</div>\n</section>",
                "tooltip": "Number of thread IDs per CTA.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %ntid;                   // CTA shape vector\n\n.sreg .u32 %ntid.x, %ntid.y, %ntid.z;   // CTA dimensions\n\nDescription\n\nA predefined, read-only special register initialized with the number of thread ids in each CTA\n\ndimension. The %ntid special register contains a 3D CTA shape vector that holds the CTA\n\ndimensions. CTA dimensions are non-zero; the fourth element is unused and always returns zero. The\n\ntotal number of threads in a CTA is (%ntid.x * %ntid.y * %ntid.z).\n\n%ntid.y == %ntid.z == 1 in 1D CTAs.\n\n%ntid.z ==1 in 2D CTAs.\n\nMaximum values of %ntid.{x,y,z} are as follows:\n\n\n\n\n\n.target architecture\n\n%ntid.x\n\n%ntid.y\n\n%ntid.z\n\nsm_1x\n\n512\n\n512\n\n64\n\nsm_20, sm_3x, sm_5x, sm_6x,\n\nsm_7x, sm_8x, sm_9x\n\n1024\n\n1024\n\n64\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0 with type .v4.u16.\n\nRedefined as type .v4.u32 in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\n\nmov and cvt instructions may be used to read the lower 16-bits of each component of\n\n%ntid.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n// compute unified thread id for 2D CTA\n\nmov.u32  %r0,%tid.x;\n\nmov.u32  %h1,%tid.y;\n\nmov.u32  %h2,%ntid.x;\n\nmad.u32  %r0,%h1,%h2,%r0;\n\nmov.u16  %rh,%ntid.x;      // legacy code\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-ntid"
            };

        case "nwarpid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nwarpid\" target=\"_blank\" rel=\"noopener noreferrer\">nwarpid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %nwarpid</h1><section id=\"special-registers-nwarpid\">\n\n\n<p>Number of warp identifiers.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %nwarpid;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register that returns the maximum number of warp identifiers.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%nwarpid</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32  %r, %nwarpid;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Number of warp identifiers.\n\nSyntax (predefined)\n\n.sreg .u32 %nwarpid;\n\nDescription\n\nA predefined, read-only special register that returns the maximum number of warp identifiers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\n%nwarpid requires sm_20 or higher.\n\nExamples\n\nmov.u32  %r, %nwarpid;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-nwarpid"
            };

        case "or":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or\" target=\"_blank\" rel=\"noopener noreferrer\">or <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Logic and Shift Instructions: or</h1><section id=\"logic-and-shift-instructions-or\">\n\n\n<p>Bitwise OR.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>or.type d, a, b;\n\n.type = { .pred, .b16, .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute the bit-wise or operation for the bits in <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a | b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>The size of the operands must match, but not necessarily the type.</p>\n<p>Allowed types include predicate registers.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>or.b32  mask mask,0x00010001\nor.pred  p,q,r;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Bitwise OR.\n\nSyntax\n\nor.type d, a, b;\n\n.type = { .pred, .b16, .b32, .b64 };\n\nDescription\n\nCompute the bit-wise or operation for the bits in a and b.\n\nSemantics\n\nd = a | b;\n\nNotes\n\nThe size of the operands must match, but not necessarily the type.\n\nAllowed types include predicate registers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nor.b32  mask mask,0x00010001\n\nor.pred  p,q,r;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-or"
            };

        case "pm0":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-pm0-pm7\" target=\"_blank\" rel=\"noopener noreferrer\">pm0..%pm7 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %pm0..%pm7</h1><section id=\"special-registers-pm0-pm7\">\n\n\n<p>Performance monitoring counters.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %pm&lt;8&gt;;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Special registers <code class=\"docutils literal notranslate\"><span class=\"pre\">%pm0..%pm7</span></code> are unsigned 32-bit read-only performance monitor counters. Their\nbehavior is currently undefined.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%pm0..%pm3</span></code> introduced in PTX ISA version 1.3.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%pm4..%pm7</span></code> introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%pm0..%pm3</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%pm4..%pm7</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32  r1,%pm0;\nmov.u32  r1,%pm7;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Performance monitoring counters.\n\nSyntax (predefined)\n\n.sreg .u32 %pm<8>;\n\nDescription\n\nSpecial registers %pm0..%pm7 are unsigned 32-bit read-only performance monitor counters. Their\n\nbehavior is currently undefined.\n\nPTX ISA Notes\n\n%pm0..%pm3 introduced in PTX ISA version 1.3.\n\n%pm4..%pm7 introduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\n%pm0..%pm3 supported on all target architectures.\n\n%pm4..%pm7 require sm_20 or higher.\n\nExamples\n\nmov.u32  r1,%pm0;\n\nmov.u32  r1,%pm7;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-pm0-pm7"
            };

        case "pm0_64":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-pm0-64-pm7-64\" target=\"_blank\" rel=\"noopener noreferrer\">pm0_64..%pm7_64 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %pm0_64..%pm7_64</h1><section id=\"special-registers-pm0-64-pm7-64\">\n<span id=\"id12\"></span>\n\n<p>64 bit Performance monitoring counters.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u64 %pm0_64;\n.sreg .u64 %pm1_64;\n.sreg .u64 %pm2_64;\n.sreg .u64 %pm3_64;\n.sreg .u64 %pm4_64;\n.sreg .u64 %pm5_64;\n.sreg .u64 %pm6_64;\n.sreg .u64 %pm7_64;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Special registers <code class=\"docutils literal notranslate\"><span class=\"pre\">%pm0_64..%pm7_64</span></code> are unsigned 64-bit read-only performance monitor\ncounters. Their behavior is currently undefined.</p>\n<p><strong>Notes</strong></p>\n<p>The lower 32bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">%pm0_64..%pm7_64</span></code> are identical to <code class=\"docutils literal notranslate\"><span class=\"pre\">%pm0..%pm7</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%pm0_64..%pm7_64</span></code> introduced in PTX ISA version 4.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%pm0_64..%pm7_64</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_50</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32  r1,%pm0_64;\nmov.u32  r1,%pm7_64;\n</pre></div>\n</div>\n</section>",
                "tooltip": "64 bit Performance monitoring counters.\n\nSyntax (predefined)\n\n.sreg .u64 %pm0_64;\n\n.sreg .u64 %pm1_64;\n\n.sreg .u64 %pm2_64;\n\n.sreg .u64 %pm3_64;\n\n.sreg .u64 %pm4_64;\n\n.sreg .u64 %pm5_64;\n\n.sreg .u64 %pm6_64;\n\n.sreg .u64 %pm7_64;\n\nDescription\n\nSpecial registers %pm0_64..%pm7_64 are unsigned 64-bit read-only performance monitor\n\ncounters. Their behavior is currently undefined.\n\nNotes\n\nThe lower 32bits of %pm0_64..%pm7_64 are identical to %pm0..%pm7.\n\nPTX ISA Notes\n\n%pm0_64..%pm7_64 introduced in PTX ISA version 4.0.\n\nTarget ISA Notes\n\n%pm0_64..%pm7_64 require sm_50 or higher.\n\nExamples\n\nmov.u32  r1,%pm0_64;\n\nmov.u32  r1,%pm7_64;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-pm0-64-pm7-64"
            };

        case "pmevent":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent\" target=\"_blank\" rel=\"noopener noreferrer\">pmevent <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Miscellaneous Instructions: pmevent</h1><section id=\"miscellaneous-instructions-pmevent\">\n\n\n<p>Trigger one or more Performance Monitor events.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>pmevent       a;    // trigger a single performance monitor event\npmevent.mask  a;    // trigger one or more performance monitor events\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Triggers one or more of a fixed number of performance monitor events, with event index or mask\nspecified by immediate operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">pmevent</span></code> (without modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.mask</span></code>) triggers a single performance monitor event indexed by\nimmediate operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, in the range <code class=\"docutils literal notranslate\"><span class=\"pre\">0..15</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">pmevent.mask</span></code> triggers one or more of the performance monitor events. Each bit in the 16-bit\nimmediate operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> controls an event.</p>\n<p>Programmatic performance monitor events may be combined with other hardware events using Boolean\nfunctions to increment one of the four performance counters. 
The relationship between events and\ncounters is programmed via API calls from the host.</p>\n<p><strong>Notes</strong></p>\n<p>Currently, there are sixteen performance monitor events, numbered 0 through 15.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">pmevent</span></code> introduced in PTX ISA version 1.4.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">pmevent.mask</span></code> introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>pmevent supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">pmevent.mask</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>    pmevent      1;\n@p  pmevent      7;\n@q  pmevent.mask 0xff;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Trigger one or more Performance Monitor events.\n\nSyntax\n\npmevent       a;    // trigger a single performance monitor event\n\npmevent.mask  a;    // trigger one or more performance monitor events\n\nDescription\n\nTriggers one or more of a fixed number of performance monitor events, with event index or mask\n\nspecified by immediate operand a.\n\npmevent (without modifier .mask) triggers a single performance monitor event indexed by\n\nimmediate operand a, in the range 0..15.\n\npmevent.mask triggers one or more of the performance monitor events. Each bit in the 16-bit\n\nimmediate operand a controls an event.\n\nProgrammatic performance monitor events may be combined with other hardware events using Boolean\n\nfunctions to increment one of the four performance counters. The relationship between events and\n\ncounters is programmed via API calls from the host.\n\nNotes\n\nCurrently, there are sixteen performance monitor events, numbered 0 through 15.\n\nPTX ISA Notes\n\npmevent introduced in PTX ISA version 1.4.\n\npmevent.mask introduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\npmevent supported on all target architectures.\n\npmevent.mask requires sm_20 or higher.\n\nExamples\n\n    pmevent      1;\n\n@p  pmevent      7;\n\n@q  pmevent.mask 0xff;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-pmevent"
            };

        case "popc":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc\" target=\"_blank\" rel=\"noopener noreferrer\">popc(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: popc</h1><section id=\"integer-arithmetic-instructions-popc\">\n\n\n<p>Population count.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>popc.type  d, a;\n\n.type = { .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Count the number of one bits in <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and place the resulting <em>population count</em> in 32-bit\ndestination register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> has the instruction type and destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has type\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.u32  d = 0;\nwhile (a != 0) {\n   if (a &amp; 0x1)  d++;\n   a = a &gt;&gt; 1;\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">popc</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>popc.b32  d, a;\npopc.b64  cnt, X;  // cnt is 
.u32\n</pre></div>\n</div>\n</section>",
                "tooltip": "Population count.\n\nSyntax\n\npopc.type  d, a;\n\n.type = { .b32, .b64 };\n\nDescription\n\nCount the number of one bits in a and place the resulting population count in 32-bit\n\ndestination register d. Operand a has the instruction type and destination d has type\n\n.u32.\n\nSemantics\n\n.u32  d = 0;\n\nwhile (a != 0) {\n\n   if (a & 0x1)  d++;\n\n   a = a >> 1;\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\npopc requires sm_20 or higher.\n\nExamples\n\npopc.b32  d, a;\n\npopc.b64  cnt, X;  // cnt is .u32\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc"
            };

        case "pragma":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-pragma\" target=\"_blank\" rel=\"noopener noreferrer\">pragma <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Performance-Tuning Directives: .pragma</h1><section id=\"performance-tuning-directives-pragma\">\n\n\n<p>Pass directives to PTX backend compiler.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.pragma list-of-strings ;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Pass module-scoped, entry-scoped, or statement-level directives to the PTX backend compiler.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.pragma</span></code> directive may occur at module-scope, at entry-scope, or at statement-level.</p>\n<p><strong>Semantics</strong></p>\n<p>The interpretation of <code class=\"docutils literal notranslate\"><span class=\"pre\">.pragma</span></code> directive strings is implementation-specific and has no impact on\nPTX semantics. See <a class=\"reference external\" href=\"#descriptions-pragma-strings\">Descriptions of .pragma Strings</a> for\ndescriptions of the pragma strings defined in <code class=\"docutils literal notranslate\"><span class=\"pre\">ptxas</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.pragma \"nounroll\";    // disable unrolling in backend\n\n// disable unrolling for current kernel\n.entry foo .pragma \"nounroll\"; { ... }\n</pre></div>\n</div>\n</section>",
                "tooltip": "Pass directives to PTX backend compiler.\n\nSyntax\n\n.pragma list-of-strings ;\n\nDescription\n\nPass module-scoped, entry-scoped, or statement-level directives to the PTX backend compiler.\n\nThe .pragma directive may occur at module-scope, at entry-scope, or at statement-level.\n\nSemantics\n\nThe interpretation of .pragma directive strings is implementation-specific and has no impact on\n\nPTX semantics. See Descriptions of .pragma Strings for\n\ndescriptions of the pragma strings defined in ptxas.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.pragma \"nounroll\";    // disable unrolling in backend\n\n// disable unrolling for current kernel\n\n.entry foo .pragma \"nounroll\"; { ... }\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-pragma"
            };

        case "prefetch":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu\" target=\"_blank\" rel=\"noopener noreferrer\">prefetch <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: prefetch, prefetchu</h1><section id=\"data-movement-and-conversion-instructions-prefetch-prefetchu\">\n\n\n<p>Prefetch line containing a generic address at a specified level of memory hierarchy, in specified\nstate space.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>prefetch{.space}.level                    [a];   // prefetch to data cache\nprefetch.global.level::eviction_priority  [a];   // prefetch to data cache\n\nprefetchu.L1  [a];             // prefetch to uniform cache\n\nprefetch{.tensormap_space}.tensormap [a];  // prefetch the tensormap\n\n.space =                    { .global, .local };\n.level =                    { .L1, .L2 };\n.level::eviction_priority = { .L2::evict_last, .L2::evict_normal };\n.tensormap_space =          { .const, .param };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch</span></code> instruction brings the cache line containing the specified address in global or\nlocal memory state space into the specified cache level.</p>\n<p>If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.tensormap</span></code> qualifier is specified then the <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch</span></code> instruction brings the cache line\ncontaining the specified address in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.const</span></code> or <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.param</span></code> memory state space for subsequent\nuse by the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.tensor</span></code> instruction.</p>\n<p>If no state space is given, the <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch</span></code> uses <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a>.</p>\n<p>Optionally, the eviction priority to be applied on the prefetched cache line can be specified by the\nmodifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code>.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses\nas Operands</a></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetchu</span></code> instruction brings the cache line containing the specified generic address into\nthe specified uniform cache level.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch</span></code> to a shared memory location performs no operation.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch</span></code> into the uniform cache requires a generic address, and no operation occurs if the\naddress maps to a <code class=\"docutils literal notranslate\"><span class=\"pre\">const</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">local</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">shared</span></code> memory location.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> qualifier introduced in PTX ISA version 
7.4.</p>\n<p>Support for the <code class=\"docutils literal notranslate\"><span class=\"pre\">.tensormap</span></code> qualifier is introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetchu</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p>Support for the <code class=\"docutils literal notranslate\"><span class=\"pre\">.tensormap</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>prefetch.global.L1             [ptr];\nprefetch.global.L2::evict_last [ptr];\nprefetchu.L1  [addr];\nprefetch.global.tensormap      [ptr];\n</pre></div>\n</div>\n</section>",
                "tooltip": "Prefetch line containing a generic address at a specified level of memory hierarchy, in specified\n\nstate space.\n\nSyntax\n\nprefetch{.space}.level                    [a];   // prefetch to data cache\n\nprefetch.global.level::eviction_priority  [a];   // prefetch to data cache\n\nprefetchu.L1  [a];             // prefetch to uniform cache\n\nprefetch{.tensormap_space}.tensormap [a];  // prefetch the tensormap\n\n.space =                    { .global, .local };\n\n.level =                    { .L1, .L2 };\n\n.level::eviction_priority = { .L2::evict_last, .L2::evict_normal };\n\n.tensormap_space =          { .const, .param };\n\nDescription\n\nThe prefetch instruction brings the cache line containing the specified address in global or\n\nlocal memory state space into the specified cache level.\n\nIf the .tensormap qualifier is specified then the prefetch instruction brings the cache line\n\ncontaining the specified address in the .const or .param memory state space for subsequent\n\nuse by the cp.async.bulk.tensor instruction.\n\nIf no state space is given, the prefetch uses Generic Addressing.\n\nOptionally, the eviction priority to be applied on the prefetched cache line can be specified by the\n\nmodifier .level::eviction_priority.\n\nSupported addressing modes for operand a and alignment requirements are described in Addresses\n\nas Operands\n\nThe prefetchu instruction brings the cache line containing the specified generic address into\n\nthe specified uniform cache level.\n\nA prefetch to a shared memory location performs no operation.\n\nA prefetch into the uniform cache requires a generic address, and no operation occurs if the\n\naddress maps to a const, local, or shared memory location.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nSupport for .level::eviction_priority qualifier introduced in PTX ISA version 7.4.\n\nSupport for the .tensormap qualifier is introduced in PTX ISA version 8.0.\n\nTarget ISA 
Notes\n\nprefetch and prefetchu require sm_20 or higher.\n\nSupport for .level::eviction_priority qualifier requires sm_80 or higher.\n\nSupport for the .tensormap qualifier requires sm_90 or higher.\n\nExamples\n\nprefetch.global.L1             [ptr];\n\nprefetch.global.L2::evict_last [ptr];\n\nprefetchu.L1  [addr];\n\nprefetch.global.tensormap      [ptr];\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu"
            };

        case "prefetchu":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu\" target=\"_blank\" rel=\"noopener noreferrer\">prefetchu <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: prefetch, prefetchu</h1><section id=\"data-movement-and-conversion-instructions-prefetch-prefetchu\">\n\n\n<p>Prefetch line containing a generic address at a specified level of memory hierarchy, in specified\nstate space.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>prefetch{.space}.level                    [a];   // prefetch to data cache\nprefetch.global.level::eviction_priority  [a];   // prefetch to data cache\n\nprefetchu.L1  [a];             // prefetch to uniform cache\n\nprefetch{.tensormap_space}.tensormap [a];  // prefetch the tensormap\n\n.space =                    { .global, .local };\n.level =                    { .L1, .L2 };\n.level::eviction_priority = { .L2::evict_last, .L2::evict_normal };\n.tensormap_space =          { .const, .param };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch</span></code> instruction brings the cache line containing the specified address in global or\nlocal memory state space into the specified cache level.</p>\n<p>If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.tensormap</span></code> qualifier is specified then the <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch</span></code> instruction brings the cache line\ncontaining the specified address in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.const</span></code> or <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.param</span></code> memory state space for subsequent\nuse by the <code class=\"docutils literal notranslate\"><span class=\"pre\">cp.async.bulk.tensor</span></code> instruction.</p>\n<p>If no state space is given, the <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch</span></code> uses <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a>.</p>\n<p>Optionally, the eviction priority to be applied on the prefetched cache line can be specified by the\nmodifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code>.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses\nas Operands</a></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetchu</span></code> instruction brings the cache line containing the specified generic address into\nthe specified uniform cache level.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch</span></code> to a shared memory location performs no operation.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch</span></code> into the uniform cache requires a generic address, and no operation occurs if the\naddress maps to a <code class=\"docutils literal notranslate\"><span class=\"pre\">const</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">local</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">shared</span></code> memory location.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> qualifier introduced in PTX ISA version 
7.4.</p>\n<p>Support for the <code class=\"docutils literal notranslate\"><span class=\"pre\">.tensormap</span></code> qualifier is introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">prefetch</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">prefetchu</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p>Support for the <code class=\"docutils literal notranslate\"><span class=\"pre\">.tensormap</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>prefetch.global.L1             [ptr];\nprefetch.global.L2::evict_last [ptr];\nprefetchu.L1  [addr];\nprefetch.global.tensormap      [ptr];\n</pre></div>\n</div>\n</section>",
                "tooltip": "Prefetch line containing a generic address at a specified level of memory hierarchy, in specified\n\nstate space.\n\nSyntax\n\nprefetch{.space}.level                    [a];   // prefetch to data cache\n\nprefetch.global.level::eviction_priority  [a];   // prefetch to data cache\n\nprefetchu.L1  [a];             // prefetch to uniform cache\n\nprefetch{.tensormap_space}.tensormap [a];  // prefetch the tensormap\n\n.space =                    { .global, .local };\n\n.level =                    { .L1, .L2 };\n\n.level::eviction_priority = { .L2::evict_last, .L2::evict_normal };\n\n.tensormap_space =          { .const, .param };\n\nDescription\n\nThe prefetch instruction brings the cache line containing the specified address in global or\n\nlocal memory state space into the specified cache level.\n\nIf the .tensormap qualifier is specified then the prefetch instruction brings the cache line\n\ncontaining the specified address in the .const or .param memory state space for subsequent\n\nuse by the cp.async.bulk.tensor instruction.\n\nIf no state space is given, the prefetch uses Generic Addressing.\n\nOptionally, the eviction priority to be applied on the prefetched cache line can be specified by the\n\nmodifier .level::eviction_priority.\n\nSupported addressing modes for operand a and alignment requirements are described in Addresses\n\nas Operands\n\nThe prefetchu instruction brings the cache line containing the specified generic address into\n\nthe specified uniform cache level.\n\nA prefetch to a shared memory location performs no operation.\n\nA prefetch into the uniform cache requires a generic address, and no operation occurs if the\n\naddress maps to a const, local, or shared memory location.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nSupport for .level::eviction_priority qualifier introduced in PTX ISA version 7.4.\n\nSupport for the .tensormap qualifier is introduced in PTX ISA version 8.0.\n\nTarget ISA 
Notes\n\nprefetch and prefetchu require sm_20 or higher.\n\nSupport for .level::eviction_priority qualifier requires sm_80 or higher.\n\nSupport for the .tensormap qualifier requires sm_90 or higher.\n\nExamples\n\nprefetch.global.L1             [ptr];\n\nprefetch.global.L2::evict_last [ptr];\n\nprefetchu.L1  [addr];\n\nprefetch.global.tensormap      [ptr];\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prefetch-prefetchu"
            };

        case "prmt":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt\" target=\"_blank\" rel=\"noopener noreferrer\">prmt <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: prmt</h1><section id=\"data-movement-and-conversion-instructions-prmt\">\n\n\n<p>Permute bytes from register pair.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>prmt.b32{.mode}  d, a, b, c;\n\n.mode = { .f4e, .b4e, .rc8, .ecl, .ecr, .rc16 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination\nregister.</p>\n<p>In the generic form (no mode specified), the permute control consists of four 4-bit selection\nvalues. The bytes in the two source registers are numbered from 0 to 7: <code class=\"docutils literal notranslate\"><span class=\"pre\">{b,</span> <span class=\"pre\">a}</span> <span class=\"pre\">=</span> <span class=\"pre\">{{b7,</span> <span class=\"pre\">b6,</span> <span class=\"pre\">b5,</span>\n<span class=\"pre\">b4},</span> <span class=\"pre\">{b3,</span> <span class=\"pre\">b2,</span> <span class=\"pre\">b1,</span> <span class=\"pre\">b0}}</span></code>. For each byte in the target register, a 4-bit selection value is defined.</p>\n<p>The 3 lsbs of the selection value specify which of the 8 source bytes should be moved into the\ntarget position. 
The msb defines if the byte value should be copied, or if the sign (msb of the\nbyte) should be replicated over all 8 bits of the target position (sign extend of the byte value);\n<code class=\"docutils literal notranslate\"><span class=\"pre\">msb=0</span></code> means copy the literal value; <code class=\"docutils literal notranslate\"><span class=\"pre\">msb=1</span></code> means replicate the sign. Note that the sign\nextension is only performed as part of generic form.</p>\n<p>Thus, the four 4-bit values fully specify an arbitrary byte permute, as a <code class=\"docutils literal notranslate\"><span class=\"pre\">16b</span></code> permute code.</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 19%\"/>\n<col style=\"width: 20%\"/>\n<col style=\"width: 20%\"/>\n<col style=\"width: 20%\"/>\n<col style=\"width: 20%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>default mode</p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">d.b3</span></code></p>\n<p>source select</p>\n</th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">d.b2</span></code></p>\n<p>source select</p>\n</th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">d.b1</span></code></p>\n<p>source select</p>\n</th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">d.b0</span></code></p>\n<p>source select</p>\n</th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>index</p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">c[15:12]</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">c[11:8]</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">c[7:4]</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span 
class=\"pre\">c[3:0]</span></code></p></td>\n</tr>\n</tbody>\n</table>\n<p>The more specialized form of the permute control uses the two lsb\u2019s of operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> (which is\ntypically an address pointer) to control the byte extraction.</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 35%\"/>\n<col style=\"width: 14%\"/>\n<col style=\"width: 13%\"/>\n<col style=\"width: 13%\"/>\n<col style=\"width: 13%\"/>\n<col style=\"width: 13%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>mode</p></th>\n<th class=\"head\"><p>selector</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">c[1:0]</span></code></p>\n</th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">d.b3</span></code></p>\n<p>source</p>\n</th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">d.b2</span></code></p>\n<p>source</p>\n</th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">d.b1</span></code></p>\n<p>source</p>\n</th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">d.b0</span></code></p>\n<p>source</p>\n</th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">f4e</span></code> (forward 4 extract)</p></td>\n<td><p>0</p></td>\n<td><p>3</p></td>\n<td><p>2</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n</tr>\n<tr class=\"row-odd\"><td></td>\n<td><p>1</p></td>\n<td><p>4</p></td>\n<td><p>3</p></td>\n<td><p>2</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-even\"><td></td>\n<td><p>2</p></td>\n<td><p>5</p></td>\n<td><p>4</p></td>\n<td><p>3</p></td>\n<td><p>2</p></td>\n</tr>\n<tr class=\"row-odd\"><td></td>\n<td><p>3</p></td>\n<td><p>6</p></td>\n<td><p>5</p></td>\n<td><p>4</p></td>\n<td><p>3</p></td>\n</tr>\n<tr 
class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">b4e</span></code> (backward 4 extract)</p></td>\n<td><p>0</p></td>\n<td><p>5</p></td>\n<td><p>6</p></td>\n<td><p>7</p></td>\n<td><p>0</p></td>\n</tr>\n<tr class=\"row-odd\"><td></td>\n<td><p>1</p></td>\n<td><p>6</p></td>\n<td><p>7</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-even\"><td></td>\n<td><p>2</p></td>\n<td><p>7</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>2</p></td>\n</tr>\n<tr class=\"row-odd\"><td></td>\n<td><p>3</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>2</p></td>\n<td><p>3</p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">rc8</span></code> (replicate 8)</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n</tr>\n<tr class=\"row-odd\"><td></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-even\"><td></td>\n<td><p>2</p></td>\n<td><p>2</p></td>\n<td><p>2</p></td>\n<td><p>2</p></td>\n<td><p>2</p></td>\n</tr>\n<tr class=\"row-odd\"><td></td>\n<td><p>3</p></td>\n<td><p>3</p></td>\n<td><p>3</p></td>\n<td><p>3</p></td>\n<td><p>3</p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">ecl</span></code> (edge clamp left)</p></td>\n<td><p>0</p></td>\n<td><p>3</p></td>\n<td><p>2</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n</tr>\n<tr class=\"row-odd\"><td></td>\n<td><p>1</p></td>\n<td><p>3</p></td>\n<td><p>2</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n</tr>\n<tr class=\"row-even\"><td></td>\n<td><p>2</p></td>\n<td><p>3</p></td>\n<td><p>2</p></td>\n<td><p>2</p></td>\n<td><p>2</p></td>\n</tr>\n<tr class=\"row-odd\"><td></td>\n<td><p>3</p></td>\n<td><p>3</p></td>\n<td><p>3</p></td>\n<td><p>3</p></td>\n<td><p>3</p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils 
literal notranslate\"><span class=\"pre\">ecr</span></code> (edge clamp right)</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n<td><p>0</p></td>\n</tr>\n<tr class=\"row-odd\"><td></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n</tr>\n<tr class=\"row-even\"><td></td>\n<td><p>2</p></td>\n<td><p>2</p></td>\n<td><p>2</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n</tr>\n<tr class=\"row-odd\"><td></td>\n<td><p>3</p></td>\n<td><p>3</p></td>\n<td><p>2</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">rc16</span></code> (replicate 16)</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n</tr>\n<tr class=\"row-odd\"><td></td>\n<td><p>1</p></td>\n<td><p>3</p></td>\n<td><p>2</p></td>\n<td><p>3</p></td>\n<td><p>2</p></td>\n</tr>\n<tr class=\"row-even\"><td></td>\n<td><p>2</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n<td><p>1</p></td>\n<td><p>0</p></td>\n</tr>\n<tr class=\"row-odd\"><td></td>\n<td><p>3</p></td>\n<td><p>3</p></td>\n<td><p>2</p></td>\n<td><p>3</p></td>\n<td><p>2</p></td>\n</tr>\n</tbody>\n</table>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>tmp64 = (b&lt;&lt;32) | a;  // create 8 byte source\n\nif ( ! 
mode ) {\n   ctl[0] = (c &gt;&gt;  0) &amp; 0xf;\n   ctl[1] = (c &gt;&gt;  4) &amp; 0xf;\n   ctl[2] = (c &gt;&gt;  8) &amp; 0xf;\n   ctl[3] = (c &gt;&gt; 12) &amp; 0xf;\n} else {\n   ctl[0] = ctl[1] = ctl[2] = ctl[3] = (c &gt;&gt;  0) &amp; 0x3;\n}\n\ntmp[07:00] = ReadByte( mode, ctl[0], tmp64 );\ntmp[15:08] = ReadByte( mode, ctl[1], tmp64 );\ntmp[23:16] = ReadByte( mode, ctl[2], tmp64 );\ntmp[31:24] = ReadByte( mode, ctl[3], tmp64 );\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">prmt</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>prmt.b32      r1, r2, r3, r4;\nprmt.b32.f4e  r1, r2, r3, r4;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Permute bytes from register pair.\n\nSyntax\n\nprmt.b32{.mode}  d, a, b, c;\n\n.mode = { .f4e, .b4e, .rc8, .ecl, .ecr, .rc16 };\n\nDescription\n\nPick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination\n\nregister.\n\nIn the generic form (no mode specified), the permute control consists of four 4-bit selection\n\nvalues. The bytes in the two source registers are numbered from 0 to 7: {b, a} = {{b7, b6, b5,\n\nb4}, {b3, b2, b1, b0}}. For each byte in the target register, a 4-bit selection value is defined.\n\nThe 3 lsbs of the selection value specify which of the 8 source bytes should be moved into the\n\ntarget position. The msb defines if the byte value should be copied, or if the sign (msb of the\n\nbyte) should be replicated over all 8 bits of the target position (sign extend of the byte value);\n\nmsb=0 means copy the literal value; msb=1 means replicate the sign. Note that the sign\n\nextension is only performed as part of generic form.\n\nThus, the four 4-bit values fully specify an arbitrary byte permute, as a 16b permute code.\n\n\n\n\n\ndefault mode\n\nd.b3\n\nsource select\n\nd.b2\n\nsource select\n\nd.b1\n\nsource select\n\nd.b0\n\nsource select\n\n\n\nindex\n\nc[15:12]\n\nc[11:8]\n\nc[7:4]\n\nc[3:0]\n\nThe more specialized form of the permute control uses the two lsb\u2019s of operand c (which is\n\ntypically an address pointer) to control the byte extraction.\n\n\n\n\n\nmode\n\nselector\n\nc[1:0]\n\nd.b3\n\nsource\n\nd.b2\n\nsource\n\nd.b1\n\nsource\n\nd.b0\n\nsource\n\n\n\nf4e (forward 4 extract)\n\n0\n\n3\n\n2\n\n1\n\n0\n\n1\n\n4\n\n3\n\n2\n\n1\n\n2\n\n5\n\n4\n\n3\n\n2\n\n3\n\n6\n\n5\n\n4\n\n3\n\nb4e (backward 4 extract)\n\n0\n\n5\n\n6\n\n7\n\n0\n\n1\n\n6\n\n7\n\n0\n\n1\n\n2\n\n7\n\n0\n\n1\n\n2\n\n3\n\n0\n\n1\n\n2\n\n3\n\nrc8 (replicate 8)\n\n0\n\n0\n\n0\n\n0\n\n0\n\n1\n\n1\n\n1\n\n1\n\n1\n\n2\n\n2\n\n2\n\n2\n\n2\n\n3\n\n3\n\n3\n\n3\n\n3\n\necl (edge clamp 
left)\n\n0\n\n3\n\n2\n\n1\n\n0\n\n1\n\n3\n\n2\n\n1\n\n1\n\n2\n\n3\n\n2\n\n2\n\n2\n\n3\n\n3\n\n3\n\n3\n\n3\n\necr (edge clamp right)\n\n0\n\n0\n\n0\n\n0\n\n0\n\n1\n\n1\n\n1\n\n1\n\n0\n\n2\n\n2\n\n2\n\n1\n\n0\n\n3\n\n3\n\n2\n\n1\n\n0\n\nrc16 (replicate 16)\n\n0\n\n1\n\n0\n\n1\n\n0\n\n1\n\n3\n\n2\n\n3\n\n2\n\n2\n\n1\n\n0\n\n1\n\n0\n\n3\n\n3\n\n2\n\n3\n\n2\n\nSemantics\n\ntmp64 = (b<<32) | a;  // create 8 byte source\n\nif ( ! mode ) {\n\n   ctl[0] = (c >>  0) & 0xf;\n\n   ctl[1] = (c >>  4) & 0xf;\n\n   ctl[2] = (c >>  8) & 0xf;\n\n   ctl[3] = (c >> 12) & 0xf;\n\n} else {\n\n   ctl[0] = ctl[1] = ctl[2] = ctl[3] = (c >>  0) & 0x3;\n\n}\n\ntmp[07:00] = ReadByte( mode, ctl[0], tmp64 );\n\ntmp[15:08] = ReadByte( mode, ctl[1], tmp64 );\n\ntmp[23:16] = ReadByte( mode, ctl[2], tmp64 );\n\ntmp[31:24] = ReadByte( mode, ctl[3], tmp64 );\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nprmt requires sm_20 or higher.\n\nExamples\n\nprmt.b32      r1, r2, r3, r4;\n\nprmt.b32.f4e  r1, r2, r3, r4;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt"
            };

        case "rcp":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp\" target=\"_blank\" rel=\"noopener noreferrer\">rcp(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp-approx-ftz-f64\" target=\"_blank\" rel=\"noopener noreferrer\">rcp.approx.ftz.f64(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: rcp</h1><section id=\"floating-point-instructions-rcp\">\n\n\n<p>Take the reciprocal of a value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>rcp.approx{.ftz}.f32  d, a;  // fast, approximate reciprocal\nrcp.rnd{.ftz}.f32     d, a;  // IEEE 754 compliant rounding\nrcp.rnd.f64           d, a;  // IEEE 754 compliant rounding\n\n.rnd = { .rn, .rz, .rm, .rp };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute <code class=\"docutils literal notranslate\"><span class=\"pre\">1/a</span></code>, store result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = 1 / a;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><strong>Fast, approximate single-precision reciprocal:</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rcp.approx.f32</span></code> implements a fast approximation to reciprocal. 
The maximum absolute error is 2<sup>-23.0</sup> over the range 1.0-2.0.</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 63%\"/>\n<col style=\"width: 38%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Input</p></th>\n<th class=\"head\"><p>Result</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>-Inf</p></td>\n<td><p>-0.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>-subnormal</p></td>\n<td><p>-Inf</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>-0.0</p></td>\n<td><p>-Inf</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+0.0</p></td>\n<td><p>+Inf</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>+subnormal</p></td>\n<td><p>+Inf</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+Inf</p></td>\n<td><p>+0.0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p><strong>Reciprocal with IEEE 754 compliant rounding:</strong></p>\n<p>Rounding modifiers (no default):</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt><dd><p>mantissa LSB rounds to nearest even</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt><dd><p>mantissa LSB rounds towards zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code></dt><dd><p>mantissa LSB rounds towards negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt><dd><p>mantissa LSB rounds towards positive infinity</p>\n</dd>\n</dl>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rcp.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving 
zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">rcp.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rcp.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rcp.f32</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">rcp.f64</span></code> introduced in PTX ISA version 1.0. <code class=\"docutils literal notranslate\"><span class=\"pre\">rcp.rn.f64</span></code> and explicit modifiers\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code> were introduced in PTX ISA version 1.4. General rounding modifiers were\nadded in PTX ISA version 2.0.</p>\n<p>For PTX ISA version 1.4 and later, one of <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.rnd</span></code> is required.</p>\n<p>For PTX ISA versions 1.0 through 1.3, <code class=\"docutils literal notranslate\"><span class=\"pre\">rcp.f32</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">rcp.approx.ftz.f32</span></code>, and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">rcp.f64</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">rcp.rn.f64</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rcp.approx.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span 
class=\"pre\">rcp.rnd.f32</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rcp.rn.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher, or <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span> <span class=\"pre\">map_f64_to_f32.</span></code></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rcp.{rz,rm,rp}.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>rcp.approx.ftz.f32  ri,r;\nrcp.rn.ftz.f32      xi,x;\nrcp.rn.f64          xi,x;\n</pre></div>\n</div>\n</section>\n<h1>Floating Point Instructions: rcp.approx.ftz.f64</h1><section id=\"floating-point-instructions-rcp-approx-ftz-f64\">\n\n\n<p>Compute a fast, gross approximation to the reciprocal of a value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>rcp.approx.ftz.f64  d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute a fast, gross approximation to the reciprocal as follows:</p>\n<ol class=\"arabic simple\">\n<li><p>extract the most-significant 32 bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> in 1.11.20 IEEE floating-point\nformat (i.e., ignore the least-significant 32 bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>),</p></li>\n<li><p>compute an approximate <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> reciprocal of this value using the 
most-significant 20 bits of\nthe mantissa of operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>,</p></li>\n<li><p>place the resulting 32-bits in 1.11.20 IEEE floating-point format in the most-significant 32-bits\nof destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>,and</p></li>\n<li><p>zero the least significant 32 mantissa bits of <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ol>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>tmp = a[63:32]; // upper word of a, 1.11.20 format\nd[63:32] = 1.0 / tmp;\nd[31:0] = 0x00000000;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rcp.approx.ftz.f64</span></code> implements a fast, gross approximation to reciprocal.</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 48%\"/>\n<col style=\"width: 52%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Input a[63:32]</p></th>\n<th class=\"head\"><p>Result d[63:32]</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>-Inf</p></td>\n<td><p>-0.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>-subnormal</p></td>\n<td><p>-Inf</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>-0.0</p></td>\n<td><p>-Inf</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+0.0</p></td>\n<td><p>+Inf</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>+subnormal</p></td>\n<td><p>+Inf</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+Inf</p></td>\n<td><p>+0.0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p>Input <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>s map to a canonical <code 
class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> with encoding <code class=\"docutils literal notranslate\"><span class=\"pre\">0x7fffffff00000000</span></code>.</p>\n<p>Subnormal inputs and results are flushed to sign-preserving zero.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rcp.approx.ftz.f64</span></code> introduced in PTX ISA version 2.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rcp.approx.ftz.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>rcp.ftz.f64  xi,x;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: rcp\n\n\n\nTake the reciprocal of a value.\n\nSyntax\n\nrcp.approx{.ftz}.f32  d, a;  // fast, approximate reciprocal\n\nrcp.rnd{.ftz}.f32     d, a;  // IEEE 754 compliant rounding\n\nrcp.rnd.f64           d, a;  // IEEE 754 compliant rounding\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nCompute 1/a, store result in d.\n\nSemantics\n\nd = 1 / a;\n\nNotes\n\nFast, approximate single-precision reciprocal:\n\nrcp.approx.f32 implements a fas...\n\n=====Floating Point Instructions: rcp.approx.ftz.f64\n\n\n\nCompute a fast, gross approximation to the reciprocal of a value.\n\nSyntax\n\nrcp.approx.ftz.f64  d, a;\n\nDescription\n\nCompute a fast, gross approximation to the reciprocal as follows:\n\nextract the most-significant 32 bits of .f64 operand a in 1.11.20 IEEE floating-point\n\nformat (i.e., ignore the least-significant 32 bits of a),\n\ncompute an approximate .f64 reciprocal of this value using the most-significant... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rcp"
            };

        case "red":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red\" target=\"_blank\" rel=\"noopener noreferrer\">red <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async\" target=\"_blank\" rel=\"noopener noreferrer\">red.async <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: red</h1><section id=\"parallel-synchronization-and-communication-instructions-red\">\n\n\n<p>Reduction operations on global and shared memory.</p>\n<p><strong>Syntax</strong></p>\n<p>Reduction operation with scalar type:</p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>red{.sem}{.scope}{.space}.op{.level::cache_hint}.type          [a], b{, cache-policy};\n\nred{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16    [a], b{, cache-policy};\n\nred{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16x2  [a], b{, cache-policy};\n\nred{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.bf16\n                                                      [a], b {, cache-policy};\n\nred{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.bf16x2\n                                                      [a], b {, cache-policy};\n\n.space =              { .global, .shared{::cta, ::cluster} };\n.sem =                {.relaxed, .release};\n.scope =              {.cta, .cluster, .gpu, .sys};\n\n.op =                 { .and, .or, .xor,\n                        .add, .inc, .dec,\n                        .min, .max };\n.level::cache_hint =  { .L2::cache_hint };\n.type =               { .b32, .b64, .u32, .u64, .s32, .s64, 
.f32, .f64 };\n</pre></div>\n</div>\n<p>Reduction operation with vector type:</p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>red{.sem}{.scope}{.global}.add{.level::cache_hint}.vec_32_bit.f32 [a], b{, cache-policy};\nred{.sem}{.scope}{.global}.op.noftz{.level::cache_hint}. vec_16_bit.half_word_type [a], b{, cache-policy};\nred{.sem}{.scope}{.global}.op.noftz{.level::cache_hint}.vec_32_bit.packed_type [a], b {, cache-policy};\n\n.sem =                { .relaxed, .release };\n.scope =              { .cta, .gpu, .sys };\n.op =                 { .add, .min, .max };\n.half_word_type =     { .f16, .bf16 };\n.packed_type =        { .f16x2,.bf16x2 };\n.vec_16_bit =         { .v2, .v4, .v8 }\n.vec_32_bit =         { .v2, .v4 };\n.level::cache_hint =  { .L2::cache_hint }\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs a reduction operation with operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> and the value in location <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, and stores the\nresult of the specified operation at location <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, overwriting the original value. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>\nspecifies a location in the specified state space. If no state space is given, perform the memory\naccesses using <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a>. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code> with scalar type may\nbe used only with <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> spaces and with generic addressing, where the address\npoints to <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> space. <code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code> with vector type may be used only with\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> space and with generic addressing where the address points to <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> space.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code> with vector type, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is brace-enclosed vector expressions, size of which is\nequal to the size of vector qualifier.</p>\n<p>If no sub-qualifier is specified with <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state space, then <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> is assumed by default.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier specifies a memory synchronizing effect as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory\nConsistency Model</a>. 
If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier is absent,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code> is assumed by default.</p>\n<p>The optional <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier specifies the set of threads that can directly observe the memory\nsynchronizing effect of this operation, as described in the <a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. If the <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier is absent, <code class=\"docutils literal notranslate\"><span class=\"pre\">.gpu</span></code> scope is\nassumed by default.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code> with vector type, the supported combinations of vector qualifier, types and reduction\noperations supported on these combinations are depicted in following table:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 19%\"/>\n<col style=\"width: 32%\"/>\n<col style=\"width: 32%\"/>\n<col style=\"width: 16%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\" rowspan=\"2\"><p>Vector qualifier</p></th>\n<th class=\"head\" colspan=\"3\"><p>Types</p></th>\n</tr>\n<tr class=\"row-even\"><th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>/ <code class=\"docutils literal notranslate\"><span class=\"pre\">bf16</span></code></p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>/ <code class=\"docutils literal notranslate\"><span class=\"pre\">bf16x2</span></code></p></th>\n<th class=\"head\"><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code></p></th>\n</tr>\n</thead>\n<tbody>\n<tr 
class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.v2</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.v4</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code></p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.v8</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code></p></td>\n<td><p>Not supported</p></td>\n<td><p>Not 
Supported</p></td>\n</tr>\n</tbody>\n</table>\n<p>Two atomic operations {<code class=\"docutils literal notranslate\"><span class=\"pre\">atom</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code>} are performed atomically with respect to each other only\nif each operation specifies a scope that includes the other. When this condition is not met, each\noperation observes the other operation being performed as if it were split into a read followed by a\ndependent write.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code> instruction on packed type or vector type, accesses adjacent scalar elements in memory. In\nsuch case, the atomicity is guaranteed separately for each of the individual scalar elements; the\nentire <code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code> is not guaranteed to be atomic as a single access.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_6x</span></code> and earlier architectures, <code class=\"docutils literal notranslate\"><span class=\"pre\">red</span></code> operations on <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state space do not\nguarantee atomicity with respect to normal store instructions to the same address. 
It is the\nprogrammer\u2019s responsibility to guarantee correctness of programs that use shared memory reduction\ninstructions, e.g., by inserting barriers between normal stores and reduction operations to a common\naddress, or by using <code class=\"docutils literal notranslate\"><span class=\"pre\">atom.exch</span></code> to store to locations accessed by other reduction operations.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses\nas Operands</a></p>\n<p>The bit-size operations are <code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code>.</p>\n<p>The integer operations are <code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code>. The <code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code> operations return a result in the range <code class=\"docutils literal notranslate\"><span class=\"pre\">[0..b]</span></code>.</p>\n<p>The floating-point operation <code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code> operation rounds to nearest even. 
Current implementation of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">red.add.f32</span></code> on global memory flushes subnormal inputs and results to sign-preserving zero;\nwhereas <code class=\"docutils literal notranslate\"><span class=\"pre\">red.add.f32</span></code> on shared memory supports subnormal inputs and results and doesn\u2019t flush\nthem to zero.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">red.add.f16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">red.add.f16x2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">red.add.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">red.add.bf16x2</span></code> operation requires the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.noftz</span></code> qualifier; it preserves subnormal inputs and results, and does not flush them to zero.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is only supported for <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space and for generic\naddressing where the address points to the <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. 
It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>*a = operation(*a, b);\n\nwhere\n    inc(r, s) = (r &gt;= s) ? 0 : r+1;\n    dec(r, s) = (r==0 || r &gt; s)  ? s : r-1;\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.2.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">red.add.f32</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">red.shared.add.u64</span></code> introduced in PTX ISA 2.0.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">red.{and,or,xor,min,max}</span></code> introduced in PTX ISA 3.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">red.add.f64</span></code> introduced in PTX ISA 5.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier introduced in PTX ISA 5.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier introduced in PTX ISA version 6.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">red.add.noftz.f16x2</span></code> introduced in PTX ISA 6.2.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">red.add.noftz.f16</span></code> introduced in PTX ISA 6.3.</p>\n<p>Per-element atomicity of <code class=\"docutils literal notranslate\"><span class=\"pre\">red.f16x2</span></code> clarified in PTX ISA version 6.3, with retrospective effect\nfrom PTX ISA version 6.2</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> qualifier introduced in PTX ISA version 7.4.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">red.add.noftz.bf16</span></code> and <code class=\"docutils 
literal notranslate\"><span class=\"pre\">red.add.noftz.bf16x2</span></code> introduced in PTX ISA 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope qualifier introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> sub-qualifiers introduced in PTX ISA version 7.8.</p>\n<p>Support for vector types introduced in PTX ISA version 8.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">red.global</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_11</span></code> or higher</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">red.shared</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_12</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">red.global.add.u64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_12</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">red.shared.add.u64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">red.{and,or,xor,min,max}</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_32</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">red.add.f32</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">red.add.f64</span></code> requires <code 
class=\"docutils literal notranslate\"><span class=\"pre\">sm_60</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_60</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.sem</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p>Use of generic addressing requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">red.add.noftz.f16x2</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_60</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">red.add.ftz.f16</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">red.add.noftz.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">red.add.noftz.bf16x2</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> requires <code class=\"docutils literal 
notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Support for vector types requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>red.global.add.s32  [a],1;\nred.shared::cluster.max.u32  [x+4],0;\n@p  red.global.and.b32  [p],my_val;\nred.global.sys.add.u32 [a], 1;\nred.global.acquire.sys.add.u32 [gbl], 1;\nred.add.noftz.f16x2 [a], b;\nred.add.noftz.bf16   [a], hb;\nred.add.noftz.bf16x2 [b], bb;\nred.global.cluster.relaxed.add.u32 [a], 1;\nred.shared::cta.min.u32  [x+4],0;\n\ncreatepolicy.fractional.L2::evict_last.b64 cache-policy, 0.25;\nred.global.and.L2::cache_hint.b32 [a], 1, cache-policy;\n\nred.global.v8.f16.add.noftz  [gbl], {%h0, %h1, %h2, %h3, %h4, %h5, %h6, %h7};\nred.global.v8.bf16.min.noftz [gbl], {%h0, %h1, %h2, %h3, %h4, %h5, %h6, %h7};\nred.global.v2.f16.add.noftz [gbl], {%h0, %h1};\nred.global.v2.bf16.add.noftz [gbl], {%h0, %h1};\nred.global.v4.f16x2.max.noftz [gbl], {%h0, %h1, %h2, %h3};\nred.global.v4.f32.add  [gbl], {%f0, %f1, %f2, %f3};\nred.global.v2.f16x2.max.noftz {%bd0, %bd1}, [g], {%b0, %b1};\nred.global.v2.bf16x2.add.noftz {%bd0, %bd1}, [g], {%b0, %b1};\nred.global.v2.f32.add  {%f0, %f1}, [g], {%f0, %f1};\n</pre></div>\n</div>\n</section>\n<h1>Parallel Synchronization and Communication Instructions: red.async</h1><section id=\"parallel-synchronization-and-communication-instructions-red-async\">\n\n\n<p>Asynchronous reduction operation on shared memory.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// Increment and Decrement 
reductions\nred.async.relaxed.cluster{.ss}.completion_mechanism.op.type [a], b, [mbar];\n\n.ss   =                 { .shared::cluster };\n.op   =                 { .inc, .dec };\n.type =                 { .u32 };\n.completion_mechanism = { .mbarrier::complete_tx::bytes };\n\n\n// MIN and MAX reductions\nred.async.relaxed.cluster{.ss}.completion_mechanism.op.type [a], b, [mbar];\n\n.ss   = { .shared::cluster };\n.op   = { .min, .max };\n.type = { .u32, .s32 };\n.completion_mechanism = { .mbarrier::complete_tx::bytes };\n\n// Bitwise AND, OR and XOR reductions\nred.async.relaxed.cluster{.ss}.completion_mechanism.op.type [a], b, [mbar];\n\n.ss   = { .shared::cluster };\n.op   = { .and, .or, .xor };\n.type = { .b32 };\n.completion_mechanism = { .mbarrier::complete_tx::bytes };\n\n// ADD reductions\nred.async.relaxed.cluster{.ss}.completion_mechanism.add.type [a], b, [mbar];\n\n.ss   = { .shared::cluster };\n.type = { .u32, .s32, .u64 };\n.completion_mechanism = { .mbarrier::complete_tx::bytes };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">red.async</span></code> is a non-blocking instruction which initiates an asynchronous reduction operation\nspecified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.op</span></code>, with the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> and the value at destination shared memory location\nspecified by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code> operations return a result in the range <code class=\"docutils literal notranslate\"><span class=\"pre\">[0..b]</span></code>.</p>\n<p>The modifier <code class=\"docutils literal notranslate\"><span 
class=\"pre\">.completion_mechanism</span></code> specifies that upon completion of the asynchronous operation,\n<a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation, with <code class=\"docutils literal notranslate\"><span class=\"pre\">completeCount</span></code> argument equal to amount of data stored in bytes, will be\nperformed on the <em>mbarrier object</em> specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code>.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> represents destination address and must be a register or of the form <code class=\"docutils literal notranslate\"><span class=\"pre\">register</span> <span class=\"pre\">+</span>\n<span class=\"pre\">immOff</span></code> as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>.</p>\n<p>The shared memory addresses of destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and the <em>mbarrier object</em> <code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code>, must\nmeet all of the following conditions:</p>\n<ul class=\"simple\">\n<li><p>They Belong to the same CTA.</p></li>\n<li><p>They are different to the CTA of the executing thread but must be within the same cluster.</p></li>\n</ul>\n<p>Otherwise, the behavior is undefined.</p>\n<p>The state space of the address <code class=\"docutils literal notranslate\"><span class=\"pre\">{.ss}</span></code>, if specified, is applicable to both operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code>. 
If not specified, then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is used for\nboth <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code>.</p>\n<p>With <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code>, if the addresses specified do not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> state space, then the behaviour is undefined.</p>\n<p>The reduce operation in <code class=\"docutils literal notranslate\"><span class=\"pre\">red.async</span></code> is treated as a relaxed memory operation and the <em>complete_tx</em>\noperation on the mbarrier has <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> semantics at the .cluster scope as described in the\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [addr], b, [mbar_addr];\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Parallel Synchronization and Communication Instructions: red\n\n\n\nReduction operations on global and shared memory.\n\nSyntax\n\nReduction operation with scalar type:\n\nred{.sem}{.scope}{.space}.op{.level::cache_hint}.type          [a], b{, cache-policy};\n\nred{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16    [a], b{, cache-policy};\n\nred{.sem}{.scope}{.space}.add.noftz{.level::cache_hint}.f16x2  [a], b{, cache-policy};\n\nred{.sem}{.scope}{.space}.add.noftz{.level::...\n\n=====Parallel Synchronization and Communication Instructions: red.async\n\n\n\nAsynchronous reduction operation on shared memory.\n\nSyntax\n\n// Increment and Decrement reductions\n\nred.async.relaxed.cluster{.ss}.completion_mechanism.op.type [a], b, [mbar];\n\n.ss   =                 { .shared::cluster };\n\n.op   =                 { .inc, .dec };\n\n.type =                 { .u32 };\n\n.completion_mechanism = { .mbarrier::complete_tx::bytes };\n\n// MIN and MAX reductions\n\nred.async.relaxed.clust... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red"
            };

        case "redux":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync\" target=\"_blank\" rel=\"noopener noreferrer\">redux.sync <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: redux.sync</h1><section id=\"parallel-synchronization-and-communication-instructions-redux-sync\">\n\n\n<p>Perform reduction operation on the data from each predicated active thread in the thread group.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>redux.sync.op.type dst, src, membermask;\n.op   = {.add, .min, .max}\n.type = {.u32, .s32}\n\nredux.sync.op.b32 dst, src, membermask;\n.op   = {.and, .or, .xor}\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">redux.sync</span></code> will cause the executing thread to wait until all non-exited threads corresponding to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> have executed <code class=\"docutils literal notranslate\"><span class=\"pre\">redux.sync</span></code> with the same qualifiers and same <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> value\nbefore resuming execution.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> specifies a 32-bit integer which is a mask indicating threads participating\nin this instruction where the bit position corresponds to thread\u2019s <code class=\"docutils literal notranslate\"><span class=\"pre\">laneid</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">redux.sync</span></code> performs a reduction operation <code 
class=\"docutils literal notranslate\"><span class=\"pre\">.op</span></code> of the 32 bit source register <code class=\"docutils literal notranslate\"><span class=\"pre\">src</span></code> across\nall non-exited threads in the <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>. The result of the reduction operation is written to\nthe 32 bit destination register <code class=\"docutils literal notranslate\"><span class=\"pre\">dst</span></code>.</p>\n<p>Reduction operation can be one of the bitwise operation in <code class=\"docutils literal notranslate\"><span class=\"pre\">.and</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.or</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.xor</span></code> or arithmetic\noperation in <code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code> , <code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code>.</p>\n<p>For the <code class=\"docutils literal notranslate\"><span class=\"pre\">.add</span></code> operation result is truncated to 32 bits.</p>\n<p>The behavior of <code class=\"docutils literal notranslate\"><span class=\"pre\">redux.sync</span></code> is undefined if the executing thread is not in the <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Release Notes</strong></p>\n<p>Note that <code class=\"docutils literal notranslate\"><span class=\"pre\">redux.sync</span></code> applies to threads in a single warp, not across an entire CTA.</p>\n<p><strong>Examples</strong></p>\n<div 
class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 dst, src, init, mask;\nredux.sync.add.s32 dst, src, 0xff;\nredux.sync.xor.b32 dst, src, mask;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Perform reduction operation on the data from each predicated active thread in the thread group.\n\nSyntax\n\nredux.sync.op.type dst, src, membermask;\n\n.op   = {.add, .min, .max}\n\n.type = {.u32, .s32}\n\nredux.sync.op.b32 dst, src, membermask;\n\n.op   = {.and, .or, .xor}\n\nDescription\n\nredux.sync will cause the executing thread to wait until all non-exited threads corresponding to\n\nmembermask have executed redux.sync with the same qualifiers and same membermask value\n\nbefore resuming execution.\n\nOperand membermask specifies a 32-bit integer which is a mask indicating threads participating\n\nin this instruction where the bit position corresponds to thread\u2019s laneid.\n\nredux.sync performs a reduction operation .op of the 32 bit source register src across\n\nall non-exited threads in the membermask. The result of the reduction operation is written to\n\nthe 32 bit destination register dst.\n\nReduction operation can be one of the bitwise operation in .and, .or, .xor or arithmetic\n\noperation in .add, .min , .max.\n\nFor the .add operation result is truncated to 32 bits.\n\nThe behavior of redux.sync is undefined if the executing thread is not in the membermask.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.0.\n\nTarget ISA Notes\n\nRequires sm_80 or higher.\n\nRelease Notes\n\nNote that redux.sync applies to threads in a single warp, not across an entire CTA.\n\nExamples\n\n.reg .b32 dst, src, init, mask;\n\nredux.sync.add.s32 dst, src, 0xff;\n\nredux.sync.xor.b32 dst, src, mask;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync"
            };

        case "rem":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-rem\" target=\"_blank\" rel=\"noopener noreferrer\">rem(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: rem</h1><section id=\"integer-arithmetic-instructions-rem\">\n\n\n<p>The remainder of integer division.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>rem.type  d, a, b;\n\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Divides <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> by <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>, store the remainder in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a % b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>The behavior for negative numbers is machine-dependent and depends on whether divide rounds towards\nzero or negative infinity.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>rem.s32  x,x,8;    // x = x%8;\n</pre></div>\n</div>\n</section>",
                "tooltip": "The remainder of integer division.\n\nSyntax\n\nrem.type  d, a, b;\n\n.type = { .u16, .u32, .u64,\n\n          .s16, .s32, .s64 };\n\nDescription\n\nDivides a by b, store the remainder in d.\n\nSemantics\n\nd = a % b;\n\nNotes\n\nThe behavior for negative numbers is machine-dependent and depends on whether divide rounds towards\n\nzero or negative infinity.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nrem.s32  x,x,8;    // x = x%8;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-rem"
            };

        case "reqnctapercluster":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-reqnctapercluster\" target=\"_blank\" rel=\"noopener noreferrer\">reqnctapercluster <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Cluster Dimension Directives: .reqnctapercluster</h1><section id=\"cluster-dimension-directives-reqnctapercluster\">\n\n\n<p>Declare the number of CTAs in the cluster.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reqnctapercluster nx\n.reqnctapercluster nx, ny\n.reqnctapercluster nx, ny, nz\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Set the number of thread blocks (CTAs) in the cluster by specifying the extent of each dimension of\nthe 1D, 2D, or 3D cluster. The total number of CTAs is the product of the number of CTAs in each\ndimension. For kernels with <code class=\"docutils literal notranslate\"><span class=\"pre\">.reqnctapercluster</span></code> directive specified, runtime will use the\nspecified values for configuring the launch if the same are not specified at launch time.</p>\n<p><strong>Semantics</strong></p>\n<p>If cluster dimension is explicitly specified at launch time, it should be equal to the values\nspecified in this directive. Specifying a different cluster dimension at launch will result in a\nruntime error or kernel launch failure.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.entry foo .reqnctapercluster 2         { . . . }\n.entry bar .reqnctapercluster 2, 2, 1   { . . . 
}\n.entry ker .reqnctapercluster 3, 2      { . . . }\n</pre></div>\n</div>\n</section>",
                "tooltip": "Declare the number of CTAs in the cluster.\n\nSyntax\n\n.reqnctapercluster nx\n\n.reqnctapercluster nx, ny\n\n.reqnctapercluster nx, ny, nz\n\nDescription\n\nSet the number of thread blocks (CTAs) in the cluster by specifying the extent of each dimension of\n\nthe 1D, 2D, or 3D cluster. The total number of CTAs is the product of the number of CTAs in each\n\ndimension. For kernels with .reqnctapercluster directive specified, runtime will use the\n\nspecified values for configuring the launch if the same are not specified at launch time.\n\nSemantics\n\nIf cluster dimension is explicitly specified at launch time, it should be equal to the values\n\nspecified in this directive. Specifying a different cluster dimension at launch will result in a\n\nruntime error or kernel launch failure.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.8.\n\nTarget ISA Notes\n\nRequires sm_90 or higher.\n\nExamples\n\n.entry foo .reqnctapercluster 2         { . . . }\n\n.entry bar .reqnctapercluster 2, 2, 1   { . . . }\n\n.entry ker .reqnctapercluster 3, 2      { . . . }\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-reqnctapercluster"
            };

        case "reqntid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-reqntid\" target=\"_blank\" rel=\"noopener noreferrer\">reqntid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Performance-Tuning Directives: .reqntid</h1><section id=\"performance-tuning-directives-reqntid\">\n\n\n<p>Number of threads in the thread block (CTA).</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reqntid nx\n.reqntid nx, ny\n.reqntid nx, ny, nz\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declare the number of threads in the thread block (CTA) by specifying the extent of each dimension\nof the 1D, 2D, or 3D CTA. The total number of threads is the product of the number of threads in\neach dimension.</p>\n<p><strong>Semantics</strong></p>\n<p>The size of each CTA dimension specified in any invocation of the kernel is required to be equal to\nthat specified in this directive. Specifying a different CTA dimension at launch will result in a\nruntime error or kernel launch failure.</p>\n<p><strong>Notes</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.reqntid</span></code> directive cannot be used in conjunction with the <code class=\"docutils literal notranslate\"><span class=\"pre\">.maxntid</span></code> directive.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.entry foo .reqntid 256       { ... }  // num threads = 256\n.entry bar .reqntid 16,16,4   { ... }  // num threads = 1024\n</pre></div>\n</div>\n</section>",
                "tooltip": "Number of threads in the thread block (CTA).\n\nSyntax\n\n.reqntid nx\n\n.reqntid nx, ny\n\n.reqntid nx, ny, nz\n\nDescription\n\nDeclare the number of threads in the thread block (CTA) by specifying the extent of each dimension\n\nof the 1D, 2D, or 3D CTA. The total number of threads is the product of the number of threads in\n\neach dimension.\n\nSemantics\n\nThe size of each CTA dimension specified in any invocation of the kernel is required to be equal to\n\nthat specified in this directive. Specifying a different CTA dimension at launch will result in a\n\nruntime error or kernel launch failure.\n\nNotes\n\nThe .reqntid directive cannot be used in conjunction with the .maxntid directive.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.1.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.entry foo .reqntid 256       { ... }  // num threads = 256\n\n.entry bar .reqntid 16,16,4   { ... }  // num threads = 1024\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives-reqntid"
            };

        case "reserved_smem_offset_<2>":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-reserved-smem-offset-begin-reserved-smem-offset-end-reserved-smem-offset-cap-reserved-smem-offset-2\" target=\"_blank\" rel=\"noopener noreferrer\">reserved_smem_offset_<2> <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %reserved_smem_offset_begin, %reserved_smem_offset_end, %reserved_smem_offset_cap, %reserved_smem_offset_<2></h1><section id=\"special-registers-reserved-smem-offset-begin-reserved-smem-offset-end-reserved-smem-offset-cap-reserved-smem-offset-2\">\n<span id=\"special-registers-reserved-smem\"></span>\n\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%reserved_smem_offset_begin</span></code></dt><dd><p>Start of the reserved shared memory region.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%reserved_smem_offset_end</span></code></dt><dd><p>End of the reserved shared memory region.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%reserved_smem_offset_cap</span></code></dt><dd><p>Total size of the reserved shared memory region.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%reserved_smem_offset_&lt;2&gt;</span></code></dt><dd><p>Offsets in the reserved shared memory region.</p>\n</dd>\n</dl>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .b32 %reserved_smem_offset_begin;\n.sreg .b32 %reserved_smem_offset_end;\n.sreg .b32 %reserved_smem_offset_cap;\n.sreg .b32 %reserved_smem_offset_&lt;2&gt;;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>These are predefined, read-only special registers containing information about the shared memory\nregion which is reserved for the 
NVIDIA system software use. This region of shared memory is not\navailable to users, and accessing this region from user code results in undefined behavior. Refer to\n<em>CUDA Programming Guide</em> for details.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.6.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 %reg_begin, %reg_end, %reg_cap, %reg_offset0, %reg_offset1;\n\nmov.b32 %reg_begin,   %reserved_smem_offset_begin;\nmov.b32 %reg_end,     %reserved_smem_offset_end;\nmov.b32 %reg_cap,     %reserved_smem_offset_cap;\nmov.b32 %reg_offset0, %reserved_smem_offset_0;\nmov.b32 %reg_offset1, %reserved_smem_offset_1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "%reserved_smem_offset_beginStart of the reserved shared memory region.\n\n%reserved_smem_offset_endEnd of the reserved shared memory region.\n\n%reserved_smem_offset_capTotal size of the reserved shared memory region.\n\n%reserved_smem_offset_<2>Offsets in the reserved shared memory region.\n\nSyntax (predefined)\n\n.sreg .b32 %reserved_smem_offset_begin;\n\n.sreg .b32 %reserved_smem_offset_end;\n\n.sreg .b32 %reserved_smem_offset_cap;\n\n.sreg .b32 %reserved_smem_offset_<2>;\n\nDescription\n\nThese are predefined, read-only special registers containing information about the shared memory\n\nregion which is reserved for the NVIDIA system software use. This region of shared memory is not\n\navailable to users, and accessing this region from user code results in undefined behavior. Refer to\n\nCUDA Programming Guide for details.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.6.\n\nTarget ISA Notes\n\nRequire sm_80 or higher.\n\nExamples\n\n.reg .b32 %reg_begin, %reg_end, %reg_cap, %reg_offset0, %reg_offset1;\n\nmov.b32 %reg_begin,   %reserved_smem_offset_begin;\n\nmov.b32 %reg_end,     %reserved_smem_offset_end;\n\nmov.b32 %reg_cap,     %reserved_smem_offset_cap;\n\nmov.b32 %reg_offset0, %reserved_smem_offset_0;\n\nmov.b32 %reg_offset1, %reserved_smem_offset_1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-reserved-smem-offset-begin-reserved-smem-offset-end-reserved-smem-offset-cap-reserved-smem-offset-2"
            };

        case "reserved_smem_offset_begin":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-reserved-smem-offset-begin-reserved-smem-offset-end-reserved-smem-offset-cap-reserved-smem-offset-2\" target=\"_blank\" rel=\"noopener noreferrer\">reserved_smem_offset_begin <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %reserved_smem_offset_begin, %reserved_smem_offset_end, %reserved_smem_offset_cap, %reserved_smem_offset_<2></h1><section id=\"special-registers-reserved-smem-offset-begin-reserved-smem-offset-end-reserved-smem-offset-cap-reserved-smem-offset-2\">\n<span id=\"special-registers-reserved-smem\"></span>\n\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%reserved_smem_offset_begin</span></code></dt><dd><p>Start of the reserved shared memory region.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%reserved_smem_offset_end</span></code></dt><dd><p>End of the reserved shared memory region.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%reserved_smem_offset_cap</span></code></dt><dd><p>Total size of the reserved shared memory region.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%reserved_smem_offset_&lt;2&gt;</span></code></dt><dd><p>Offsets in the reserved shared memory region.</p>\n</dd>\n</dl>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .b32 %reserved_smem_offset_begin;\n.sreg .b32 %reserved_smem_offset_end;\n.sreg .b32 %reserved_smem_offset_cap;\n.sreg .b32 %reserved_smem_offset_&lt;2&gt;;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>These are predefined, read-only special registers containing information about the shared memory\nregion which is reserved for the 
NVIDIA system software use. This region of shared memory is not\navailable to users, and accessing this region from user code results in undefined behavior. Refer to\n<em>CUDA Programming Guide</em> for details.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.6.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 %reg_begin, %reg_end, %reg_cap, %reg_offset0, %reg_offset1;\n\nmov.b32 %reg_begin,   %reserved_smem_offset_begin;\nmov.b32 %reg_end,     %reserved_smem_offset_end;\nmov.b32 %reg_cap,     %reserved_smem_offset_cap;\nmov.b32 %reg_offset0, %reserved_smem_offset_0;\nmov.b32 %reg_offset1, %reserved_smem_offset_1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "%reserved_smem_offset_beginStart of the reserved shared memory region.\n\n%reserved_smem_offset_endEnd of the reserved shared memory region.\n\n%reserved_smem_offset_capTotal size of the reserved shared memory region.\n\n%reserved_smem_offset_<2>Offsets in the reserved shared memory region.\n\nSyntax (predefined)\n\n.sreg .b32 %reserved_smem_offset_begin;\n\n.sreg .b32 %reserved_smem_offset_end;\n\n.sreg .b32 %reserved_smem_offset_cap;\n\n.sreg .b32 %reserved_smem_offset_<2>;\n\nDescription\n\nThese are predefined, read-only special registers containing information about the shared memory\n\nregion which is reserved for the NVIDIA system software use. This region of shared memory is not\n\navailable to users, and accessing this region from user code results in undefined behavior. Refer to\n\nCUDA Programming Guide for details.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.6.\n\nTarget ISA Notes\n\nRequire sm_80 or higher.\n\nExamples\n\n.reg .b32 %reg_begin, %reg_end, %reg_cap, %reg_offset0, %reg_offset1;\n\nmov.b32 %reg_begin,   %reserved_smem_offset_begin;\n\nmov.b32 %reg_end,     %reserved_smem_offset_end;\n\nmov.b32 %reg_cap,     %reserved_smem_offset_cap;\n\nmov.b32 %reg_offset0, %reserved_smem_offset_0;\n\nmov.b32 %reg_offset1, %reserved_smem_offset_1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-reserved-smem-offset-begin-reserved-smem-offset-end-reserved-smem-offset-cap-reserved-smem-offset-2"
            };

        case "reserved_smem_offset_cap":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-reserved-smem-offset-begin-reserved-smem-offset-end-reserved-smem-offset-cap-reserved-smem-offset-2\" target=\"_blank\" rel=\"noopener noreferrer\">reserved_smem_offset_cap <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %reserved_smem_offset_begin, %reserved_smem_offset_end, %reserved_smem_offset_cap, %reserved_smem_offset_<2></h1><section id=\"special-registers-reserved-smem-offset-begin-reserved-smem-offset-end-reserved-smem-offset-cap-reserved-smem-offset-2\">\n<span id=\"special-registers-reserved-smem\"></span>\n\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%reserved_smem_offset_begin</span></code></dt><dd><p>Start of the reserved shared memory region.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%reserved_smem_offset_end</span></code></dt><dd><p>End of the reserved shared memory region.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%reserved_smem_offset_cap</span></code></dt><dd><p>Total size of the reserved shared memory region.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%reserved_smem_offset_&lt;2&gt;</span></code></dt><dd><p>Offsets in the reserved shared memory region.</p>\n</dd>\n</dl>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .b32 %reserved_smem_offset_begin;\n.sreg .b32 %reserved_smem_offset_end;\n.sreg .b32 %reserved_smem_offset_cap;\n.sreg .b32 %reserved_smem_offset_&lt;2&gt;;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>These are predefined, read-only special registers containing information about the shared memory\nregion which is reserved for the 
NVIDIA system software use. This region of shared memory is not\navailable to users, and accessing this region from user code results in undefined behavior. Refer to\n<em>CUDA Programming Guide</em> for details.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.6.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 %reg_begin, %reg_end, %reg_cap, %reg_offset0, %reg_offset1;\n\nmov.b32 %reg_begin,   %reserved_smem_offset_begin;\nmov.b32 %reg_end,     %reserved_smem_offset_end;\nmov.b32 %reg_cap,     %reserved_smem_offset_cap;\nmov.b32 %reg_offset0, %reserved_smem_offset_0;\nmov.b32 %reg_offset1, %reserved_smem_offset_1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "%reserved_smem_offset_beginStart of the reserved shared memory region.\n\n%reserved_smem_offset_endEnd of the reserved shared memory region.\n\n%reserved_smem_offset_capTotal size of the reserved shared memory region.\n\n%reserved_smem_offset_<2>Offsets in the reserved shared memory region.\n\nSyntax (predefined)\n\n.sreg .b32 %reserved_smem_offset_begin;\n\n.sreg .b32 %reserved_smem_offset_end;\n\n.sreg .b32 %reserved_smem_offset_cap;\n\n.sreg .b32 %reserved_smem_offset_<2>;\n\nDescription\n\nThese are predefined, read-only special registers containing information about the shared memory\n\nregion which is reserved for the NVIDIA system software use. This region of shared memory is not\n\navailable to users, and accessing this region from user code results in undefined behavior. Refer to\n\nCUDA Programming Guide for details.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.6.\n\nTarget ISA Notes\n\nRequire sm_80 or higher.\n\nExamples\n\n.reg .b32 %reg_begin, %reg_end, %reg_cap, %reg_offset0, %reg_offset1;\n\nmov.b32 %reg_begin,   %reserved_smem_offset_begin;\n\nmov.b32 %reg_end,     %reserved_smem_offset_end;\n\nmov.b32 %reg_cap,     %reserved_smem_offset_cap;\n\nmov.b32 %reg_offset0, %reserved_smem_offset_0;\n\nmov.b32 %reg_offset1, %reserved_smem_offset_1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-reserved-smem-offset-begin-reserved-smem-offset-end-reserved-smem-offset-cap-reserved-smem-offset-2"
            };

        case "reserved_smem_offset_end":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-reserved-smem-offset-begin-reserved-smem-offset-end-reserved-smem-offset-cap-reserved-smem-offset-2\" target=\"_blank\" rel=\"noopener noreferrer\">reserved_smem_offset_end <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %reserved_smem_offset_begin, %reserved_smem_offset_end, %reserved_smem_offset_cap, %reserved_smem_offset_<2></h1><section id=\"special-registers-reserved-smem-offset-begin-reserved-smem-offset-end-reserved-smem-offset-cap-reserved-smem-offset-2\">\n<span id=\"special-registers-reserved-smem\"></span>\n\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%reserved_smem_offset_begin</span></code></dt><dd><p>Start of the reserved shared memory region.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%reserved_smem_offset_end</span></code></dt><dd><p>End of the reserved shared memory region.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%reserved_smem_offset_cap</span></code></dt><dd><p>Total size of the reserved shared memory region.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">%reserved_smem_offset_&lt;2&gt;</span></code></dt><dd><p>Offsets in the reserved shared memory region.</p>\n</dd>\n</dl>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .b32 %reserved_smem_offset_begin;\n.sreg .b32 %reserved_smem_offset_end;\n.sreg .b32 %reserved_smem_offset_cap;\n.sreg .b32 %reserved_smem_offset_&lt;2&gt;;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>These are predefined, read-only special registers containing information about the shared memory\nregion which is reserved for the 
NVIDIA system software use. This region of shared memory is not\navailable to users, and accessing this region from user code results in undefined behavior. Refer to\n<em>CUDA Programming Guide</em> for details.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.6.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .b32 %reg_begin, %reg_end, %reg_cap, %reg_offset0, %reg_offset1;\n\nmov.b32 %reg_begin,   %reserved_smem_offset_begin;\nmov.b32 %reg_end,     %reserved_smem_offset_end;\nmov.b32 %reg_cap,     %reserved_smem_offset_cap;\nmov.b32 %reg_offset0, %reserved_smem_offset_0;\nmov.b32 %reg_offset1, %reserved_smem_offset_1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "%reserved_smem_offset_beginStart of the reserved shared memory region.\n\n%reserved_smem_offset_endEnd of the reserved shared memory region.\n\n%reserved_smem_offset_capTotal size of the reserved shared memory region.\n\n%reserved_smem_offset_<2>Offsets in the reserved shared memory region.\n\nSyntax (predefined)\n\n.sreg .b32 %reserved_smem_offset_begin;\n\n.sreg .b32 %reserved_smem_offset_end;\n\n.sreg .b32 %reserved_smem_offset_cap;\n\n.sreg .b32 %reserved_smem_offset_<2>;\n\nDescription\n\nThese are predefined, read-only special registers containing information about the shared memory\n\nregion which is reserved for the NVIDIA system software use. This region of shared memory is not\n\navailable to users, and accessing this region from user code results in undefined behavior. Refer to\n\nCUDA Programming Guide for details.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.6.\n\nTarget ISA Notes\n\nRequire sm_80 or higher.\n\nExamples\n\n.reg .b32 %reg_begin, %reg_end, %reg_cap, %reg_offset0, %reg_offset1;\n\nmov.b32 %reg_begin,   %reserved_smem_offset_begin;\n\nmov.b32 %reg_end,     %reserved_smem_offset_end;\n\nmov.b32 %reg_cap,     %reserved_smem_offset_cap;\n\nmov.b32 %reg_offset0, %reserved_smem_offset_0;\n\nmov.b32 %reg_offset1, %reserved_smem_offset_1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-reserved-smem-offset-begin-reserved-smem-offset-end-reserved-smem-offset-cap-reserved-smem-offset-2"
            };

        // "ret" (Control Flow Instructions): return from a function to the instruction after the call.
        // The entry carries the HTML documentation body ("html"), a tag-free text rendering of the same
        // section ("tooltip"), and the link to the matching section of the PTX reference ("url").
        case "ret":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret\" target=\"_blank\" rel=\"noopener noreferrer\">ret <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Control Flow Instructions: ret</h1><section id=\"control-flow-instructions-ret\">\n\n\n<p>Return from function to instruction after call.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>ret{.uni};\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Return execution to caller\u2019s environment. A divergent return suspends threads until all threads are\nready to return to the caller. This allows multiple divergent <code class=\"docutils literal notranslate\"><span class=\"pre\">ret</span></code> instructions.</p>\n<p>A <code class=\"docutils literal notranslate\"><span class=\"pre\">ret</span></code> is assumed to be divergent unless the <code class=\"docutils literal notranslate\"><span class=\"pre\">.uni</span></code> suffix is present, indicating that the\nreturn is guaranteed to be non-divergent.</p>\n<p>Any values returned from a function should be moved into the return parameter variables prior to\nexecuting the <code class=\"docutils literal notranslate\"><span class=\"pre\">ret</span></code> instruction.</p>\n<p>A return instruction executed in a top-level entry routine will terminate thread execution.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>    ret;\n@p  ret;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Return from function to instruction after call.\n\nSyntax\n\nret{.uni};\n\nDescription\n\nReturn execution to caller\u2019s environment. A divergent return suspends threads until all threads are\n\nready to return to the caller. This allows multiple divergent ret instructions.\n\nA ret is assumed to be divergent unless the .uni suffix is present, indicating that the\n\nreturn is guaranteed to be non-divergent.\n\nAny values returned from a function should be moved into the return parameter variables prior to\n\nexecuting the ret instruction.\n\nA return instruction executed in a top-level entry routine will terminate thread execution.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n    ret;\n\n@p  ret;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret"
            };

        // "rsqrt" (Floating Point Instructions): reciprocal of the square root of a value.
        // This entry merges two documentation sections, `rsqrt` and `rsqrt.approx.ftz.f64`:
        //  - "html" links to both sections and embeds both in full;
        //  - "tooltip" holds a text excerpt of each section, introduced by a "=====" heading and
        //    cut off with "...";
        //  - "url" points at the first (`rsqrt`) section only.
        case "rsqrt":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt\" target=\"_blank\" rel=\"noopener noreferrer\">rsqrt(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt-approx-ftz-f64\" target=\"_blank\" rel=\"noopener noreferrer\">rsqrt.approx.ftz.f64(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: rsqrt</h1><section id=\"floating-point-instructions-rsqrt\">\n\n\n<p>Take the reciprocal of the square root of a value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>rsqrt.approx{.ftz}.f32  d, a;\nrsqrt.approx.f64        d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute <code class=\"docutils literal notranslate\"><span class=\"pre\">1/sqrt(a)</span></code> and store the result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = 1/sqrt(a);\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rsqrt.approx</span></code> implements an approximation to the reciprocal square root.</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 63%\"/>\n<col style=\"width: 38%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Input</p></th>\n<th class=\"head\"><p>Result</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>-Inf</p></td>\n<td><p>NaN</p></td>\n</tr>\n<tr 
class=\"row-odd\"><td><p>-normal</p></td>\n<td><p>NaN</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>-subnormal</p></td>\n<td><p>-Inf</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>-0.0</p></td>\n<td><p>-Inf</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>+0.0</p></td>\n<td><p>+Inf</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+subnormal</p></td>\n<td><p>+Inf</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>+Inf</p></td>\n<td><p>+0.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p>The maximum absolute error for <code class=\"docutils literal notranslate\"><span class=\"pre\">rsqrt.f32</span></code> is 2<sup>-22.4</sup> over the range 1.0-4.0.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rsqrt.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">rsqrt.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rsqrt.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>Note that <code class=\"docutils literal notranslate\"><span class=\"pre\">rsqrt.approx.f64</span></code> is emulated in software and are relatively slow.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rsqrt.f32</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">rsqrt.f64</span></code> were introduced in PTX ISA version 1.0. 
Explicit modifiers\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code> were introduced in PTX ISA version 1.4.</p>\n<p>For PTX ISA version 1.4 and later, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> modifier is required.</p>\n<p>For PTX ISA versions 1.0 through 1.3, <code class=\"docutils literal notranslate\"><span class=\"pre\">rsqrt.f32</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">rsqrt.approx.ftz.f32</span></code>, and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">rsqrt.f64</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">rsqrt.approx.f64</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rsqrt.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rsqrt.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>rsqrt.approx.ftz.f32  isr, x;\nrsqrt.approx.f64      ISR, X;\n</pre></div>\n</div>\n</section>\n<h1>Floating Point Instructions: rsqrt.approx.ftz.f64</h1><section id=\"floating-point-instructions-rsqrt-approx-ftz-f64\">\n\n\n<p>Compute an approximation of the square root reciprocal of a value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>rsqrt.approx.ftz.f64 d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute a double-precision (<code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code>) approximation of the square root 
reciprocal of a value. The\nleast significant 32 bits of the double-precision (<code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code>) destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> are all zeros.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>tmp = a[63:32]; // upper word of a, 1.11.20 format\nd[63:32] = 1.0 / sqrt(tmp);\nd[31:0] = 0x00000000;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rsqrt.approx.ftz.f64</span></code> implements a fast approximation of the square root reciprocal of a value.</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 63%\"/>\n<col style=\"width: 38%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Input</p></th>\n<th class=\"head\"><p>Result</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>-Inf</p></td>\n<td><p>NaN</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>-subnormal</p></td>\n<td><p>-Inf</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>-0.0</p></td>\n<td><p>-Inf</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+0.0</p></td>\n<td><p>+Inf</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>+subnormal</p></td>\n<td><p>+Inf</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+Inf</p></td>\n<td><p>+0.0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p>Input <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>s map to a canonical <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> with encoding <code class=\"docutils literal notranslate\"><span class=\"pre\">0x7fffffff00000000</span></code>.</p>\n<p>Subnormal inputs and results are flushed to sign-preserving zero.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code 
class=\"docutils literal notranslate\"><span class=\"pre\">rsqrt.approx.ftz.f64</span></code> introduced in PTX ISA version 4.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">rsqrt.approx.ftz.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>rsqrt.approx.ftz.f64 xi,x;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: rsqrt\n\n\n\nTake the reciprocal of the square root of a value.\n\nSyntax\n\nrsqrt.approx{.ftz}.f32  d, a;\n\nrsqrt.approx.f64        d, a;\n\nDescription\n\nCompute 1/sqrt(a) and store the result in d.\n\nSemantics\n\nd = 1/sqrt(a);\n\nNotes\n\nrsqrt.approx implements an approximation to the reciprocal square root.\n\n\n\nInput\n\nResult\n\n-Inf\n\nNaN\n\n-normal\n\nNaN\n\n-subnormal\n\n-Inf\n\n-0.0\n\n-Inf\n\n+0.0\n\n+Inf\n\n+subnormal\n\n+Inf\n\n+Inf\n\n+0.0\n\nNaN\n\nNaN\n\nThe maximum absol...\n\n=====Floating Point Instructions: rsqrt.approx.ftz.f64\n\n\n\nCompute an approximation of the square root reciprocal of a value.\n\nSyntax\n\nrsqrt.approx.ftz.f64 d, a;\n\nDescription\n\nCompute a double-precision (.f64) approximation of the square root reciprocal of a value. The\n\nleast significant 32 bits of the double-precision (.f64) destination d are all zeros.\n\nSemantics\n\ntmp = a[63:32]; // upper word of a, 1.11.20 format\n\nd[63:32] = 1.0 / sqrt(tmp);\n\nd[31:0] = 0x000000... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt"
            };

        // "sad" (Integer Arithmetic Instructions): sum of absolute differences, d = c + |a - b|.
        // The entry carries the HTML documentation body ("html"), a tag-free text rendering of the same
        // section ("tooltip"), and the link to the matching section of the PTX reference ("url").
        case "sad":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sad\" target=\"_blank\" rel=\"noopener noreferrer\">sad(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: sad</h1><section id=\"integer-arithmetic-instructions-sad\">\n\n\n<p>Sum of absolute differences.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>sad.type  d, a, b, c;\n\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Adds the absolute value of <code class=\"docutils literal notranslate\"><span class=\"pre\">a-b</span></code> to <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> and writes the resulting value into <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = c + ((a&lt;b) ? b-a : a-b);\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>sad.s32  d,a,b,c;\nsad.u32  d,a,b,d;  // running sum\n</pre></div>\n</div>\n</section>",
                "tooltip": "Sum of absolute differences.\n\nSyntax\n\nsad.type  d, a, b, c;\n\n.type = { .u16, .u32, .u64,\n\n          .s16, .s32, .s64 };\n\nDescription\n\nAdds the absolute value of a-b to c and writes the resulting value into d.\n\nSemantics\n\nd = c + ((a<b) ? b-a : a-b);\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nsad.s32  d,a,b,c;\n\nsad.u32  d,a,b,d;  // running sum\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sad"
            };

        // "section" (Debugging Directives): the `.section` PTX section-definition directive.
        // Note the lookup key has no leading dot, while the documentation heading spells it `.section`.
        // The entry carries the HTML documentation body ("html"), a tag-free text rendering of the same
        // section ("tooltip"), and the link to the matching section of the PTX reference ("url").
        case "section":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-section\" target=\"_blank\" rel=\"noopener noreferrer\">section <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Debugging Directives: .section</h1><section id=\"debugging-directives-section\">\n\n\n<p>PTX section definition.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.section section_name { dwarf-lines }\n\ndwarf-lines have the following formats:\n  .b8    byte-list       // comma-separated list of integers\n                         // in range [-128..255]\n  .b16   int16-list      // comma-separated list of integers\n                         // in range [-2^15..2^16-1]\n  .b32   int32-list      // comma-separated list of integers\n                         // in range [-2^31..2^32-1]\n  label:                 // Define label inside the debug section\n  .b64   int64-list      // comma-separated list of integers\n                         // in range [-2^63..2^64-1]\n  .b32   label\n  .b64   label\n  .b32   label+imm       // a sum of label address plus a constant integer byte\n                         // offset(signed, 32bit)\n  .b64   label+imm       // a sum of label address plus a constant integer byte\n                         // offset(signed, 64bit)\n  .b32   label1-label2   // a difference in label addresses between labels in\n                         // the same dwarf section (32bit)\n  .b64   label3-label4   // a difference in label addresses between labels in\n                         // the same dwarf section (64bit)\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0, replaces <code class=\"docutils literal notranslate\"><span class=\"pre\">@@DWARF</span></code> syntax.</p>\n<p>label+imm expression introduced in 
PTX ISA version 3.2.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> integers in dwarf-lines introduced in PTX ISA version 6.0.</p>\n<p>Support for defining <code class=\"docutils literal notranslate\"><span class=\"pre\">label</span></code> inside the DWARF section is introduced in PTX ISA version 7.2.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">label1-label2</span></code> expression introduced in PTX ISA version 7.5.</p>\n<p>Negative numbers in dwarf lines introduced in PTX ISA version 7.5.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.section .debug_pubnames\n{\n    .b32    LpubNames_end0-LpubNames_begin0\n  LpubNames_begin0:\n    .b8     0x2b, 0x00, 0x00, 0x00, 0x02, 0x00\n    .b32    .debug_info\n  info_label1:\n    .b32    0x000006b5, 0x00000364, 0x61395a5f, 0x5f736f63\n    .b32    0x6e69616d, 0x63613031, 0x6150736f, 0x736d6172\n    .b8     0x00, 0x00, 0x00, 0x00, 0x00\n  LpubNames_end0:\n}\n\n.section .debug_info\n{\n    .b32 11430\n    .b8 2, 0\n    .b32 .debug_abbrev\n    .b8 8, 1, 108, 103, 101, 110, 102, 101, 58, 32, 69, 68, 71, 32, 52, 46, 49\n    .b8 0\n    .b32 3, 37, 176, -99\n    .b32 info_label1\n    .b32 .debug_loc+0x4\n    .b8 -11, 11, 112, 97\n    .b32 info_label1+12\n    .b64 -1\n    .b16 -5, -65535\n}\n</pre></div>\n</div>\n</section>",
                "tooltip": "PTX section definition.\n\nSyntax\n\n.section section_name { dwarf-lines }\n\ndwarf-lines have the following formats:\n\n  .b8    byte-list       // comma-separated list of integers\n\n                         // in range [-128..255]\n\n  .b16   int16-list      // comma-separated list of integers\n\n                         // in range [-2^15..2^16-1]\n\n  .b32   int32-list      // comma-separated list of integers\n\n                         // in range [-2^31..2^32-1]\n\n  label:                 // Define label inside the debug section\n\n  .b64   int64-list      // comma-separated list of integers\n\n                         // in range [-2^63..2^64-1]\n\n  .b32   label\n\n  .b64   label\n\n  .b32   label+imm       // a sum of label address plus a constant integer byte\n\n                         // offset(signed, 32bit)\n\n  .b64   label+imm       // a sum of label address plus a constant integer byte\n\n                         // offset(signed, 64bit)\n\n  .b32   label1-label2   // a difference in label addresses between labels in\n\n                         // the same dwarf section (32bit)\n\n  .b64   label3-label4   // a difference in label addresses between labels in\n\n                         // the same dwarf section (64bit)\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0, replaces @@DWARF syntax.\n\nlabel+imm expression introduced in PTX ISA version 3.2.\n\nSupport for .b16 integers in dwarf-lines introduced in PTX ISA version 6.0.\n\nSupport for defining label inside the DWARF section is introduced in PTX ISA version 7.2.\n\nlabel1-label2 expression introduced in PTX ISA version 7.5.\n\nNegative numbers in dwarf lines introduced in PTX ISA version 7.5.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.section .debug_pubnames\n\n{\n\n    .b32    LpubNames_end0-LpubNames_begin0\n\n  LpubNames_begin0:\n\n    .b8     0x2b, 0x00, 0x00, 0x00, 0x02, 0x00\n\n    .b32    .debug_info\n\n  
info_label1:\n\n    .b32    0x000006b5, 0x00000364, 0x61395a5f, 0x5f736f63\n\n    .b32    0x6e69616d, 0x63613031, 0x6150736f, 0x736d6172\n\n    .b8     0x00, 0x00, 0x00, 0x00, 0x00\n\n  LpubNames_end0:\n\n}\n\n.section .debug_info\n\n{\n\n    .b32 11430\n\n    .b8 2, 0\n\n    .b32 .debug_abbrev\n\n    .b8 8, 1, 108, 103, 101, 110, 102, 101, 58, 32, 69, 68, 71, 32, 52, 46, 49\n\n    .b8 0\n\n    .b32 3, 37, 176, -99\n\n    .b32 info_label1\n\n    .b32 .debug_loc+0x4\n\n    .b8 -11, 11, 112, 97\n\n    .b32 info_label1+12\n\n    .b64 -1\n\n    .b16 -5, -65535\n\n}\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-section"
            };

        // "selp" (Comparison and Selection Instructions): select between two source operands based on
        // the value of a predicate operand, d = (c == 1) ? a : b.
        // The entry carries the HTML documentation body ("html"), a tag-free text rendering of the same
        // section ("tooltip"), and the link to the matching section of the PTX reference ("url").
        case "selp":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp\" target=\"_blank\" rel=\"noopener noreferrer\">selp <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Comparison and Selection Instructions: selp</h1><section id=\"comparison-and-selection-instructions-selp\">\n\n\n<p>Select between source operands, based on the value of the predicate source operand.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>selp.type d, a, b, c;\n\n.type = { .b16, .b32, .b64,\n          .u16, .u32, .u64,\n          .s16, .s32, .s64,\n                .f32, .f64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Conditional selection. If <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is stored in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> otherwise. Operands\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> must be of the same type. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is a predicate.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = (c == 1) ? 
a : b;\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">selp.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>    selp.s32  r0,r,g,p;\n@q  selp.f32  f0,t,x,xp;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Select between source operands, based on the value of the predicate source operand.\n\nSyntax\n\nselp.type d, a, b, c;\n\n.type = { .b16, .b32, .b64,\n\n          .u16, .u32, .u64,\n\n          .s16, .s32, .s64,\n\n                .f32, .f64 };\n\nDescription\n\nConditional selection. If c is True, a is stored in d, b otherwise. Operands\n\nd, a, and b must be of the same type. Operand c is a predicate.\n\nSemantics\n\nd = (c == 1) ? a : b;\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nselp.f64 requires sm_13 or higher.\n\nExamples\n\n    selp.s32  r0,r,g,p;\n\n@q  selp.f32  f0,t,x,xp;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp"
            };

        case "set":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-set\" target=\"_blank\" rel=\"noopener noreferrer\">set <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-set\" target=\"_blank\" rel=\"noopener noreferrer\">set <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Comparison and Selection Instructions: set</h1><section id=\"comparison-and-selection-instructions-set\">\n\n\n<p>Compare two numeric values with a relational operator, and optionally combine this result with a\npredicate value by applying a Boolean operator.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>set.CmpOp{.ftz}.dtype.stype         d, a, b;\nset.CmpOp.BoolOp{.ftz}.dtype.stype  d, a, b, {!}c;\n\n.CmpOp  = { eq, ne, lt, le, gt, ge, lo, ls, hi, hs,\n            equ, neu, ltu, leu, gtu, geu, num, nan };\n.BoolOp = { and, or, xor };\n.dtype  = { .u32, .s32, .f32 };\n.stype  = { .b16, .b32, .b64,\n            .u16, .u32, .u64,\n            .s16, .s32, .s64,\n                  .f32, .f64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compares two numeric values and optionally combines the result with another predicate value by\napplying a Boolean operator. If this result is <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">1.0f</span></code> is written for floating-point\ndestination types, and <code class=\"docutils literal notranslate\"><span class=\"pre\">0xffffffff</span></code> is written for integer destination types. 
Otherwise,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">0x00000000</span></code> is written.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.dtype</span></code>; operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.stype</span></code>; operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> has\ntype <code class=\"docutils literal notranslate\"><span class=\"pre\">.pred</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>t = (a CmpOp b) ? 1 : 0;\nif (isFloat(dtype))\n    d = BoolOp(t, c) ? 1.0f : 0x00000000;\nelse\n    d = BoolOp(t, c) ? 
0xffffffff : 0x00000000;\n</pre></div>\n</div>\n<p><strong>Integer Notes</strong></p>\n<p>The signed and unsigned comparison operators are <code class=\"docutils literal notranslate\"><span class=\"pre\">eq</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ne</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">lt</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">le</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">gt</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ge</span></code>.</p>\n<p>For unsigned values, the comparison operators <code class=\"docutils literal notranslate\"><span class=\"pre\">lo</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ls</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">hi</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">hs</span></code> for lower,\nlower-or-same, higher, and higher-or-same may be used instead of <code class=\"docutils literal notranslate\"><span class=\"pre\">lt</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">le</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">gt</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ge</span></code>,\nrespectively.</p>\n<p>The untyped, bit-size comparisons are <code class=\"docutils literal notranslate\"><span class=\"pre\">eq</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">ne</span></code>.</p>\n<p><strong>Floating Point Notes</strong></p>\n<p>The ordered comparisons are <code class=\"docutils literal notranslate\"><span class=\"pre\">eq</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ne</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">lt</span></code>, 
<code class=\"docutils literal notranslate\"><span class=\"pre\">le</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">gt</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ge</span></code>. If either operand is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>, the result is <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>.</p>\n<p>To aid comparison operations in the presence of <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> values, unordered versions are included:\n<code class=\"docutils literal notranslate\"><span class=\"pre\">equ</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">neu</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ltu</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">leu</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">gtu</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">geu</span></code>. If both operands are numeric values (not\n<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>), then these comparisons have the same result as their ordered counterparts. 
If either\noperand is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>, then the result of these comparisons is <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">num</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if both operands are numeric values (not <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>), and <code class=\"docutils literal notranslate\"><span class=\"pre\">nan</span></code> returns\n<code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if either operand is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">set.ftz.dtype.f32</span></code> flushes subnormal inputs to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">set.dtype.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">set.dtype.f32</span></code> flushes subnormal inputs to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>Modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code> applies only to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> comparisons.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">set</span></code> with 
<code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> source type requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>@p  set.lt.and.f32.s32  d,a,b,r;\n    set.eq.u32.u32      d,i,n;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Comparison Instructions: set</h1><section id=\"half-precision-comparison-instructions-set\">\n\n\n<p>Compare two numeric values with a relational operator, and optionally combine this result with a\npredicate value by applying a Boolean operator.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>set.CmpOp{.ftz}.f16.stype            d, a, b;\nset.CmpOp.BoolOp{.ftz}.f16.stype     d, a, b, {!}c;\n\nset.CmpOp.bf16.stype                 d, a, b;\nset.CmpOp.BoolOp.bf16.stype          d, a, b, {!}c;\n\nset.CmpOp{.ftz}.dtype.f16            d, a, b;\nset.CmpOp.BoolOp{.ftz}.dtype.f16     d, a, b, {!}c;\n.dtype  = { .u16, .s16, .u32, .s32}\n\nset.CmpOp.dtype.bf16                 d, a, b;\nset.CmpOp.BoolOp.dtype.bf16          d, a, b, {!}c;\n.dtype  = { .u16, .s16, .u32, .s32}\n\nset.CmpOp{.ftz}.dtype.f16x2          d, a, b;\nset.CmpOp.BoolOp{.ftz}.dtype.f16x2   d, a, b, {!}c;\n.dtype  = { .f16x2, .u32, .s32}\n\nset.CmpOp.dtype.bf16x2               d, a, b;\nset.CmpOp.BoolOp.dtype.bf16x2        d, a, b, {!}c;\n.dtype  = { .bf16x2, .u32, .s32}\n\n.CmpOp  = { eq, ne, lt, le, gt, ge,\n            equ, neu, ltu, leu, gtu, geu, num, nan };\n.BoolOp = { and, or, xor };\n.stype  = { .b16, .b32, .b64,\n            .u16, .u32, .u64,\n            .s16, .s32, .s64,\n            .f16, .f32, .f64};\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compares two numeric values and optionally combines the result with another predicate value by\napplying a Boolean 
operator.</p>\n<p>Result of this computation is written in destination register in the following way:</p>\n<ul class=\"simple\">\n<li><p>If result is <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code>,</p>\n<ul>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">0xffffffff</span></code> is written for destination types <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">0xffff</span></code> is written for destination types <code class=\"docutils literal notranslate\"><span class=\"pre\">.u16</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.s16</span></code>.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">1.0</span></code> in target precision floating point format is written for destination type <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>.</p></li>\n</ul>\n</li>\n<li><p>If result is <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>,</p>\n<ul>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">0x0</span></code> is written for all integer destination types.</p></li>\n<li><p><code class=\"docutils literal notranslate\"><span class=\"pre\">0.0</span></code> in target precision floating point format is written for destination type <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>.</p></li>\n</ul>\n</li>\n</ul>\n<p>If the source type is <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal 
notranslate\"><span class=\"pre\">.bf16x2</span></code> then result of individual operations are packed in\nthe 32-bit destination operand.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.pred</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (stype == .f16x2 || stype == .bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    t[0]   = (fA[0] CmpOp fB[0]) ? 1 : 0;\n    t[1]   = (fA[1] CmpOp fB[1]) ? 1 : 0;\n    if (dtype == .f16x2 || stype == .bf16x2) {\n        for (i = 0; i &lt; 2; i++) {\n            d[i] = BoolOp(t[i], c) ? 1.0 : 0.0;\n        }\n    } else {\n        for (i = 0; i &lt; 2; i++) {\n            d[i] = BoolOp(t[i], c) ? 0xffff : 0;\n        }\n    }\n} else if (dtype == .f16 || stype == .bf16) {\n    t = (a CmpOp b) ? 1 : 0;\n    d = BoolOp(t, c) ? 1.0 : 0.0;\n} else  { // Integer destination type\n    trueVal = (isU16(dtype) || isS16(dtype)) ?  0xffff : 0xffffffff;\n    t = (a CmpOp b) ? 1 : 0;\n    d = BoolOp(t, c) ? trueVal : 0;\n}\n</pre></div>\n</div>\n<p><strong>Floating Point Notes</strong></p>\n<p>The ordered comparisons are <code class=\"docutils literal notranslate\"><span class=\"pre\">eq</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ne</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">lt</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">le</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">gt</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ge</span></code>. 
If either operand is\n<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>, the result is <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>.</p>\n<p>To aid comparison operations in the presence of <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> values, unordered versions are included:\n<code class=\"docutils literal notranslate\"><span class=\"pre\">equ</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">neu</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ltu</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">leu</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">gtu</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">geu</span></code>. If both operands are numeric values (not\n<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>), then these comparisons have the same result as their ordered counterparts. 
If either\noperand is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>, then the result of these comparisons is <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">num</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if both operands are numeric values (not <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>), and <code class=\"docutils literal notranslate\"><span class=\"pre\">nan</span></code> returns\n<code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if either operand is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n<dl>\n<dt>Subnormal numbers:</dt><dd><p>By default, subnormal numbers are supported.</p>\n<p>When <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code> modifier is specified then subnormal inputs and results are flushed to sign\npreserving zero.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 4.2.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">set.{u16,</span> <span class=\"pre\">u32,</span> <span class=\"pre\">s16,</span> <span class=\"pre\">s32}.f16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">set.{u32,</span> <span class=\"pre\">s32}.f16x2</span></code> are introduced in PTX ISA version 6.5.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">set.{u16,</span> <span class=\"pre\">u32,</span> <span class=\"pre\">s16,</span> <span class=\"pre\">s32}.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">set.{u32,</span> <span class=\"pre\">s32,</span> <span class=\"pre\">bf16x2}.bf16x2</span></code>,\n<code class=\"docutils literal notranslate\"><span 
class=\"pre\">set.bf16.{s16,u16,f16,b16,s32,u32,f32,b32,s64,u64,f64,b64}</span></code> are introduced in PTX ISA version\n7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_53</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">set.{u16,</span> <span class=\"pre\">u32,</span> <span class=\"pre\">s16,</span> <span class=\"pre\">s32}.bf16</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">set.{u32,</span> <span class=\"pre\">s32,</span> <span class=\"pre\">bf16x2}.bf16x2</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">set.bf16.{s16,u16,f16,b16,s32,u32,f32,b32,s64,u64,f64,b64}</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>set.lt.and.f16.f16  d,a,b,r;\nset.eq.f16x2.f16x2  d,i,n;\nset.eq.u32.f16x2    d,i,n;\nset.lt.and.u16.f16  d,a,b,r;\nset.ltu.or.bf16.f16    d,u,v,s;\nset.equ.bf16x2.bf16x2  d,j,m;\nset.geu.s32.bf16x2     d,j,m;\nset.num.xor.s32.bf16   d,u,v,s;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Compare two numeric values with a relational operator, and optionally combine this result with a\n\npredicate value by applying a Boolean operator.\n\nSyntax\n\nset.CmpOp{.ftz}. ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-set"
            };

        case "setmaxnreg":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg\" target=\"_blank\" rel=\"noopener noreferrer\">setmaxnreg <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Miscellaneous Instructions: setmaxnreg</h1><section id=\"miscellaneous-instructions-setmaxnreg\">\n\n\n<p>Hint to change the number of registers owned by the warp.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>setmaxnreg.action.sync.aligned.u32 imm-reg-count;\n\n.action = { .inc, .dec };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">setmaxnreg</span></code> provides a hint to the system to update the maximum number of per-thread registers\nowned by the executing warp to the value specified by the <code class=\"docutils literal notranslate\"><span class=\"pre\">imm-reg-count</span></code> operand.</p>\n<p>Qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code> is used to release extra registers such that the absolute per-thread maximum\nregister count is reduced from its current value to <code class=\"docutils literal notranslate\"><span class=\"pre\">imm-reg-count</span></code>. Qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code> is used to\nrequest additional registers such that the absolute per-thread maximum register count is increased\nfrom its current value to <code class=\"docutils literal notranslate\"><span class=\"pre\">imm-reg-count</span></code>.</p>\n<p>A pool of available registers is maintained per-CTA. 
Register adjustments requested by the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">setmaxnreg</span></code> instructions are handled by supplying extra registers from this pool to the\nrequesting warp or by releasing extra registers from the requesting warp to this pool, depending\nupon the value of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.action</span></code> qualifier.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">setmaxnreg.inc</span></code> instruction blocks the execution until enough registers are available in the\nCTA\u2019s register pool. After the instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">setmaxnreg.inc</span></code> obtains new registers from the CTA\npool, the initial contents of the new registers are undefined. The new registers must be initialized\nbefore they are used.</p>\n<p>The same <code class=\"docutils literal notranslate\"><span class=\"pre\">setmaxnreg</span></code> instruction must be executed by all warps in a <a class=\"reference external\" href=\"#asynchronous-warpgroup-level-matrix-instructions-warpgroup\">warpgroup</a>. After executing a\n<code class=\"docutils literal notranslate\"><span class=\"pre\">setmaxnreg</span></code> instruction, all warps in the <em>warpgroup</em> must synchronize explicitly before\nexecuting subsequent setmaxnreg instructions. If a <code class=\"docutils literal notranslate\"><span class=\"pre\">setmaxnreg</span></code> instruction is not executed by all\nwarps in the <em>warpgroup</em>, then the behavior is undefined.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">imm-reg-count</span></code> is an integer constant. 
The value of <code class=\"docutils literal notranslate\"><span class=\"pre\">imm-reg-count</span></code> must be in the\nrange 24 to 256 (both inclusive) and must be a multiple of 8.</p>\n<p>Changes to the register file of the warp always happen at the tail-end of the register file.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">setmaxnreg</span></code> instruction requires that the kernel has been launched with a valid value of\nmaximum number of per-thread registers specified via the appropriate compilation via the appropriate\ncompile-time option or the appropriate performance tuning directive. Otherwise, the <code class=\"docutils literal notranslate\"><span class=\"pre\">setmaxnreg</span></code>\ninstruction may have no effect.</p>\n<p>When qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.dec</span></code> is specified, the maximum number of per-thread registers owned by the warp\nprior to the execution of <code class=\"docutils literal notranslate\"><span class=\"pre\">setmaxnreg</span></code> instruction should be greater than or equal to the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">imm-reg-count</span></code>. Otherwise, the behaviour is undefined.</p>\n<p>When qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.inc</span></code> is specified, the maximum number of per-thread registers owned by the warp\nprior to the execution of <code class=\"docutils literal notranslate\"><span class=\"pre\">setmaxnreg</span></code> instruction should be less than or equal to the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">imm-reg-count</span></code>. 
Otherwise, the behaviour is undefined.</p>\n<p>The mandatory <code class=\"docutils literal notranslate\"><span class=\"pre\">.sync</span></code> qualifier indicates that <code class=\"docutils literal notranslate\"><span class=\"pre\">setmaxnreg</span></code> instruction causes the executing\nthread to wait until all threads in the warp execute the same <code class=\"docutils literal notranslate\"><span class=\"pre\">setmaxnreg</span></code> instruction before\nresuming execution.</p>\n<p>The mandatory <code class=\"docutils literal notranslate\"><span class=\"pre\">.aligned</span></code> qualifier indicates that all threads in the warpgroup must execute the\nsame <code class=\"docutils literal notranslate\"><span class=\"pre\">setmaxnreg</span></code> instruction. In conditionally executed code, <code class=\"docutils literal notranslate\"><span class=\"pre\">setmaxnreg</span></code> instruction should\nonly be used if it is known that all threads in warpgroup evaluate the condition identically,\notherwise the behavior is undefined.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90a</span></code>.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>setmaxnreg.dec.sync.aligned.u32 64;\nsetmaxnreg.inc.sync.aligned.u32 192;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Hint to change the number of registers owned by the warp.\n\nSyntax\n\nsetmaxnreg.action.sync.aligned.u32 imm-reg-count;\n\n.action = { .inc, .dec };\n\nDescription\n\nsetmaxnreg provides a hint to the system to update the maximum number of per-thread registers\n\nowned by the executing warp to the value specified by the imm-reg-count operand.\n\nQualifier .dec is used to release extra registers such that the absolute per-thread maximum\n\nregister count is reduced from its current value to imm-reg-count. Qualifier .inc is used to\n\nrequest additional registers such that the absolute per-thread maximum register count is increased\n\nfrom its current value to imm-reg-count.\n\nA pool of available registers is maintained per-CTA. Register adjustments requested by the\n\nsetmaxnreg instructions are handled by supplying extra registers from this pool to the\n\nrequesting warp or by releasing extra registers from the requesting warp to this pool, depending\n\nupon the value of the .action qualifier.\n\nThe setmaxnreg.inc instruction blocks the execution until enough registers are available in the\n\nCTA\u2019s register pool. After the instruction setmaxnreg.inc obtains new registers from the CTA\n\npool, the initial contents of the new registers are undefined. The new registers must be initialized\n\nbefore they are used.\n\nThe same setmaxnreg instruction must be executed by all warps in a warpgroup. After executing a\n\nsetmaxnreg instruction, all warps in the warpgroup must synchronize explicitly before\n\nexecuting subsequent setmaxnreg instructions. If a setmaxnreg instruction is not executed by all\n\nwarps in the warpgroup, then the behavior is undefined.\n\nOperand imm-reg-count is an integer constant. 
The value of imm-reg-count must be in the\n\nrange 24 to 256 (both inclusive) and must be a multiple of 8.\n\nChanges to the register file of the warp always happen at the tail-end of the register file.\n\nThe setmaxnreg instruction requires that the kernel has been launched with a valid value of\n\nmaximum number of per-thread registers specified via the appropriate compilation via the appropriate\n\ncompile-time option or the appropriate performance tuning directive. Otherwise, the setmaxnreg\n\ninstruction may have no effect.\n\nWhen qualifier .dec is specified, the maximum number of per-thread registers owned by the warp\n\nprior to the execution of setmaxnreg instruction should be greater than or equal to the\n\nimm-reg-count. Otherwise, the behaviour is undefined.\n\nWhen qualifier .inc is specified, the maximum number of per-thread registers owned by the warp\n\nprior to the execution of setmaxnreg instruction should be less than or equal to the\n\nimm-reg-count. Otherwise, the behaviour is undefined.\n\nThe mandatory .sync qualifier indicates that setmaxnreg instruction causes the executing\n\nthread to wait until all threads in the warp execute the same setmaxnreg instruction before\n\nresuming execution.\n\nThe mandatory .aligned qualifier indicates that all threads in the warpgroup must execute the\n\nsame setmaxnreg instruction. In conditionally executed code, setmaxnreg instruction should\n\nonly be used if it is known that all threads in warpgroup evaluate the condition identically,\n\notherwise the behavior is undefined.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 8.0.\n\nTarget ISA Notes\n\nRequires sm_90a.\n\nExamples\n\nsetmaxnreg.dec.sync.aligned.u32 64;\n\nsetmaxnreg.inc.sync.aligned.u32 192;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg"
            };

        case "setp":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-setp\" target=\"_blank\" rel=\"noopener noreferrer\">setp <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-comparison-instructions-setp\" target=\"_blank\" rel=\"noopener noreferrer\">setp <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Comparison and Selection Instructions: setp</h1><section id=\"comparison-and-selection-instructions-setp\">\n\n\n<p>Compare\u00a0two numeric values with a relational operator, and (optionally) combine this result with a\npredicate value by applying a Boolean operator.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>setp.CmpOp{.ftz}.type         p[|q], a, b;\nsetp.CmpOp.BoolOp{.ftz}.type  p[|q], a, b, {!}c;\n\n.CmpOp  = { eq, ne, lt, le, gt, ge, lo, ls, hi, hs,\n            equ, neu, ltu, leu, gtu, geu, num, nan };\n.BoolOp = { and, or, xor };\n.type   = { .b16, .b32, .b64,\n            .u16, .u32, .u64,\n            .s16, .s32, .s64,\n                  .f32, .f64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compares two values and combines the result with another predicate value by applying a Boolean\noperator. This result is written to the first destination operand. A related value computed using\nthe complement of the compare result is written to the second destination operand.</p>\n<p>Applies to all numeric types. 
Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.type</span></code>; operands <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">q</span></code>,\nand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.pred</span></code>. The sink symbol \u2018_\u2019 may be used in place of any one of the\ndestination operands.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>t = (a CmpOp b) ? 1 : 0;\np = BoolOp(t, c);\nq = BoolOp(!t, c);\n</pre></div>\n</div>\n<p><strong>Integer Notes</strong></p>\n<p>The signed and unsigned comparison operators are <code class=\"docutils literal notranslate\"><span class=\"pre\">eq</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ne</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">lt</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">le</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">gt</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ge</span></code>.</p>\n<p>For unsigned values, the comparison operators <code class=\"docutils literal notranslate\"><span class=\"pre\">lo</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ls</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">hi</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">hs</span></code> for lower,\nlower-or-same, higher, and higher-or-same may be used instead of <code 
class=\"docutils literal notranslate\"><span class=\"pre\">lt</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">le</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">gt</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ge</span></code>,\nrespectively.</p>\n<p>The untyped, bit-size comparisons are <code class=\"docutils literal notranslate\"><span class=\"pre\">eq</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">ne</span></code>.</p>\n<p><strong>Floating Point Notes</strong></p>\n<p>The ordered comparisons are <code class=\"docutils literal notranslate\"><span class=\"pre\">eq</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ne</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">lt</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">le</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">gt</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ge</span></code>. 
If either operand is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>, the result is <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>.</p>\n<p>To aid comparison operations in the presence of <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> values, unordered versions are included:\n<code class=\"docutils literal notranslate\"><span class=\"pre\">equ</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">neu</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ltu</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">leu</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">gtu</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">geu</span></code>. If both operands are numeric values (not\n<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>), then these comparisons have the same result as their ordered counterparts. 
If either\noperand is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>, then the result of these comparisons is <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">num</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if both operands are numeric values (not <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>), and <code class=\"docutils literal notranslate\"><span class=\"pre\">nan</span></code> returns\n<code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if either operand is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">setp.ftz.dtype.f32</span></code> flushes subnormal inputs to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">setp.dtype.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">setp.dtype.f32</span></code> flushes subnormal inputs to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>Modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code> applies only to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> comparisons.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">setp</span></code> with 
<code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> source type requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>    setp.lt.and.s32  p|q,a,b,r;\n@q  setp.eq.u32      p,i,n;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Comparison Instructions: setp</h1><section id=\"half-precision-comparison-instructions-setp\">\n\n\n<p>Compare two numeric values with a relational operator, and optionally combine this result with a\npredicate value by applying a Boolean operator.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>setp.CmpOp{.ftz}.f16           p, a, b;\nsetp.CmpOp.BoolOp{.ftz}.f16    p, a, b, {!}c;\n\nsetp.CmpOp{.ftz}.f16x2         p|q, a, b;\nsetp.CmpOp.BoolOp{.ftz}.f16x2  p|q, a, b, {!}c;\n\nsetp.CmpOp.bf16                p, a, b;\nsetp.CmpOp.BoolOp.bf16         p, a, b, {!}c;\n\nsetp.CmpOp.bf16x2              p|q, a, b;\nsetp.CmpOp.BoolOp.bf16x2       p|q, a, b, {!}c;\n\n.CmpOp  = { eq, ne, lt, le, gt, ge,\n            equ, neu, ltu, leu, gtu, geu, num, nan };\n.BoolOp = { and, or, xor };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compares two values and combines the result with another predicate value by applying a Boolean\noperator. 
This result is written to the destination operand.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">q</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.pred</span></code>.</p>\n<p>For instruction type <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code>.</p>\n<p>For instruction type <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>.</p>\n<p>For instruction type <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have type <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code>.</p>\n<p>For instruction type <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have type <code class=\"docutils literal 
notranslate\"><span class=\"pre\">.b32</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (type == .f16 || type == .bf16) {\n     t = (a CmpOp b) ? 1 : 0;\n     p = BoolOp(t, c);\n} else if (type == .f16x2 || type == .bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    t[0] = (fA[0] CmpOp fB[0]) ? 1 : 0;\n    t[1] = (fA[1] CmpOp fB[1]) ? 1 : 0;\n    p = BoolOp(t[0], c);\n    q = BoolOp(t[1], c);\n}\n</pre></div>\n</div>\n<p><strong>Floating Point Notes</strong></p>\n<p>The ordered comparisons are <code class=\"docutils literal notranslate\"><span class=\"pre\">eq</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ne</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">lt</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">le</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">gt</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ge</span></code>. 
If either operand is\n<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>, the result is <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>.</p>\n<p>To aid comparison operations in the presence of <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> values, unordered versions are included:\n<code class=\"docutils literal notranslate\"><span class=\"pre\">equ</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">neu</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">ltu</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">leu</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">gtu</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">geu</span></code>. If both operands are numeric values (not\n<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>), then these comparisons have the same result as their ordered counterparts. 
If either\noperand is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>, then the result of these comparisons is <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">num</span></code> returns <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if both operands are numeric values (not <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>), and <code class=\"docutils literal notranslate\"><span class=\"pre\">nan</span></code> returns\n<code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if either operand is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>.</p>\n<dl>\n<dt>Subnormal numbers:</dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">setp.ftz.{f16,f16x2}</span></code> flushes subnormal inputs to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 4.2.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">setp.{bf16/bf16x2}</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_53</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">setp.{bf16/bf16x2}</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>setp.lt.and.f16x2  p|q,a,b,r;\n@q  setp.eq.f16    p,i,n;\n\nsetp.gt.or.bf16x2  u|v,c,d,s;\n@q  setp.eq.bf16   u,j,m;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Comparison and Selection Instructions: setp\n\n\n\nCompare\u00a0two numeric values with a relational operator, and (optionally) combine this result with a\n\npredicate value by applying a Boolean operator.\n\nSyntax\n\nsetp.CmpOp{.ftz}.type         p[|q], a, b;\n\nsetp.CmpOp.BoolOp{.ftz}.type  p[|q], a, b, {!}c;\n\n.CmpOp  = { eq, ne, lt, le, gt, ge, lo, ls, hi, hs,\n\n            equ, neu, ltu, leu, gtu, geu, num, nan };\n\n.BoolOp = { and, or, xor };\n\n.type   = { .b16, .b...\n\n=====Half Precision Comparison Instructions: setp\n\n\n\nCompare two numeric values with a relational operator, and optionally combine this result with a\n\npredicate value by applying a Boolean operator.\n\nSyntax\n\nsetp.CmpOp{.ftz}.f16           p, a, b;\n\nsetp.CmpOp.BoolOp{.ftz}.f16    p, a, b, {!}c;\n\nsetp.CmpOp{.ftz}.f16x2         p|q, a, b;\n\nsetp.CmpOp.BoolOp{.ftz}.f16x2  p|q, a, b, {!}c;\n\nsetp.CmpOp.bf16                p, a, b;\n\nsetp.CmpOp.BoolOp.bf16         p... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-setp"
            };

        case "shf":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shf\" target=\"_blank\" rel=\"noopener noreferrer\">shf <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Logic and Shift Instructions: shf</h1><section id=\"logic-and-shift-instructions-shf\">\n\n\n<p>Funnel shift.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>shf.l.mode.b32  d, a, b, c;  // left shift\nshf.r.mode.b32  d, a, b, c;  // right shift\n\n.mode = { .clamp, .wrap };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Shift the 64-bit value formed by concatenating operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> left or right by the amount\nspecified by the unsigned 32-bit value in <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> holds bits <code class=\"docutils literal notranslate\"><span class=\"pre\">63:32</span></code> and operand a\nholds bits <code class=\"docutils literal notranslate\"><span class=\"pre\">31:0</span></code> of the 64-bit source value. The source is shifted left or right by the clamped\nor wrapped value in <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>. 
For <code class=\"docutils literal notranslate\"><span class=\"pre\">shf.l</span></code>, the most-significant 32-bits of the result are written\ninto <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>; for <code class=\"docutils literal notranslate\"><span class=\"pre\">shf.r</span></code>, the least-significant 32-bits of the result are written into <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>u32  n = (.mode == .clamp) ? min(c, 32) : c &amp; 0x1f;\nswitch (shf.dir) {  // shift concatenation of [b, a]\n    case shf.l:     // extract 32 msbs\n           u32  d = (b &lt;&lt; n)      | (a &gt;&gt; (32-n));\n    case shf.r:     // extract 32 lsbs\n           u32  d = (b &lt;&lt; (32-n)) | (a &gt;&gt; n);\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Use funnel shift for multi-word shift operations and for rotate operations. The shift amount is\nlimited to the range <code class=\"docutils literal notranslate\"><span class=\"pre\">0..32</span></code> in clamp mode and <code class=\"docutils literal notranslate\"><span class=\"pre\">0..31</span></code> in wrap mode, so shifting multi-word\nvalues by distances greater than 32 requires first moving 32-bit words, then using <code class=\"docutils literal notranslate\"><span class=\"pre\">shf</span></code> to shift\nthe remaining <code class=\"docutils literal notranslate\"><span class=\"pre\">0..31</span></code> distance.</p>\n<p>To shift data sizes greater than 64 bits to the right, use repeated <code class=\"docutils literal notranslate\"><span class=\"pre\">shf.r</span></code> instructions applied\nto adjacent words, operating from least-significant word towards most-significant word. At each\nstep, a single word of the shifted result is computed. 
The most-significant word of the result is\ncomputed using a <code class=\"docutils literal notranslate\"><span class=\"pre\">shr.{u32,s32}</span></code> instruction, which zero or sign fills based on the instruction\ntype.</p>\n<p>To shift data sizes greater than 64 bits to the left, use repeated <code class=\"docutils literal notranslate\"><span class=\"pre\">shf.l</span></code> instructions applied to\nadjacent words, operating from most-significant word towards least-significant word. At each step, a\nsingle word of the shifted result is computed. The least-significant word of the result is computed\nusing a <code class=\"docutils literal notranslate\"><span class=\"pre\">shl</span></code> instruction.</p>\n<p>Use funnel shift to perform 32-bit left or right rotate by supplying the same value for source\narguments <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_32</span></code> or higher.</p>\n<p><strong>Example</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>shf.l.clamp.b32  r3,r1,r0,16;\n\n// 128-bit left shift; n &lt; 32\n// [r7,r6,r5,r4] = [r3,r2,r1,r0] &lt;&lt; n\nshf.l.clamp.b32  r7,r2,r3,n;\nshf.l.clamp.b32  r6,r1,r2,n;\nshf.l.clamp.b32  r5,r0,r1,n;\nshl.b32          r4,r0,n;\n\n// 128-bit right shift, arithmetic; n &lt; 32\n// [r7,r6,r5,r4] = [r3,r2,r1,r0] &gt;&gt; n\nshf.r.clamp.b32  r4,r0,r1,n;\nshf.r.clamp.b32  r5,r1,r2,n;\nshf.r.clamp.b32  r6,r2,r3,n;\nshr.s32          r7,r3,n;     // result is sign-extended\n\nshf.r.clamp.b32  r1,r0,r0,n;  // rotate right by n; n &lt; 32\nshf.l.clamp.b32  r1,r0,r0,n;  // rotate left by n; n &lt; 32\n\n// extract 32-bits from [r1,r0] starting 
at position n &lt; 32\nshf.r.clamp.b32  r0,r0,r1,n;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Funnel shift.\n\nSyntax\n\nshf.l.mode.b32  d, a, b, c;  // left shift\n\nshf.r.mode.b32  d, a, b, c;  // right shift\n\n.mode = { .clamp, .wrap };\n\nDescription\n\nShift the 64-bit value formed by concatenating operands a and b left or right by the amount\n\nspecified by the unsigned 32-bit value in c. Operand b holds bits 63:32 and operand a\n\nholds bits 31:0 of the 64-bit source value. The source is shifted left or right by the clamped\n\nor wrapped value in c. For shf.l, the most-significant 32-bits of the result are written\n\ninto d; for shf.r, the least-significant 32-bits of the result are written into d.\n\nSemantics\n\nu32  n = (.mode == .clamp) ? min(c, 32) : c & 0x1f;\n\nswitch (shf.dir) {  // shift concatenation of [b, a]\n\n    case shf.l:     // extract 32 msbs\n\n           u32  d = (b << n)      | (a >> (32-n));\n\n    case shf.r:     // extract 32 lsbs\n\n           u32  d = (b << (32-n)) | (a >> n);\n\n}\n\nNotes\n\nUse funnel shift for multi-word shift operations and for rotate operations. The shift amount is\n\nlimited to the range 0..32 in clamp mode and 0..31 in wrap mode, so shifting multi-word\n\nvalues by distances greater than 32 requires first moving 32-bit words, then using shf to shift\n\nthe remaining 0..31 distance.\n\nTo shift data sizes greater than 64 bits to the right, use repeated shf.r instructions applied\n\nto adjacent words, operating from least-significant word towards most-significant word. At each\n\nstep, a single word of the shifted result is computed. The most-significant word of the result is\n\ncomputed using a shr.{u32,s32} instruction, which zero or sign fills based on the instruction\n\ntype.\n\nTo shift data sizes greater than 64 bits to the left, use repeated shf.l instructions applied to\n\nadjacent words, operating from most-significant word towards least-significant word. At each step, a\n\nsingle word of the shifted result is computed. 
The least-significant word of the result is computed\n\nusing a shl instruction.\n\nUse funnel shift to perform 32-bit left or right rotate by supplying the same value for source\n\narguments a and b.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.1.\n\nTarget ISA Notes\n\nRequires sm_32 or higher.\n\nExample\n\nshf.l.clamp.b32  r3,r1,r0,16;\n\n// 128-bit left shift; n < 32\n\n// [r7,r6,r5,r4] = [r3,r2,r1,r0] << n\n\nshf.l.clamp.b32  r7,r2,r3,n;\n\nshf.l.clamp.b32  r6,r1,r2,n;\n\nshf.l.clamp.b32  r5,r0,r1,n;\n\nshl.b32          r4,r0,n;\n\n// 128-bit right shift, arithmetic; n < 32\n\n// [r7,r6,r5,r4] = [r3,r2,r1,r0] >> n\n\nshf.r.clamp.b32  r4,r0,r1,n;\n\nshf.r.clamp.b32  r5,r1,r2,n;\n\nshf.r.clamp.b32  r6,r2,r3,n;\n\nshr.s32          r7,r3,n;     // result is sign-extended\n\nshf.r.clamp.b32  r1,r0,r0,n;  // rotate right by n; n < 32\n\nshf.l.clamp.b32  r1,r0,r0,n;  // rotate left by n; n < 32\n\n// extract 32-bits from [r1,r0] starting at position n < 32\n\nshf.r.clamp.b32  r0,r0,r1,n;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shf"
            };

        case "shfl":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync\" target=\"_blank\" rel=\"noopener noreferrer\">shfl.sync <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: shfl.sync</h1><section id=\"data-movement-and-conversion-instructions-shfl-sync\">\n\n\n<p>Register data shuffle within threads of a warp.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>shfl.sync.mode.b32  d[|p], a, b, c, membermask;\n\n.mode = { .up, .down, .bfly, .idx };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Exchange register data between threads of a warp.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">shfl.sync</span></code> will cause executing thread to wait until all non-exited threads corresponding to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> have executed <code class=\"docutils literal notranslate\"><span class=\"pre\">shfl.sync</span></code> with the same qualifiers and same <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> value\nbefore resuming execution.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> specifies a 32-bit integer which is a mask indicating threads participating\nin barrier where the bit position corresponds to thread\u2019s <code class=\"docutils literal notranslate\"><span class=\"pre\">laneid</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">shfl.sync</span></code> exchanges register data between threads in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>.</p>\n<p>Each thread in the currently 
executing warp will compute a source lane index <em>j</em> based on input\noperands <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> and the <em>mode</em>. If the computed source lane index <em>j</em> is in range, the\nthread will copy the input operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> from lane <em>j</em> into its own destination register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>;\notherwise, the thread will simply copy its own input <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> to destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. The optional\ndestination predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is set to <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if the computed source lane is in range, and\notherwise set to <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>.</p>\n<p>Note that an out of range value of <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> may still result in a valid computed source lane index\n<em>j</em>. 
In this case, a data transfer occurs and the destination predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is True.</p>\n<p>Note that results are undefined if a thread sources a register from an inactive thread or a thread\nthat is not in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> specifies a source lane or source lane offset, depending on the mode.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> contains two packed values specifying a mask for logically splitting warps into\nsub-segments and an upper bound for clamping the source lane index.</p>\n<p>The behavior of <code class=\"docutils literal notranslate\"><span class=\"pre\">shfl.sync</span></code> is undefined if the executing thread is not in the <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>.</p>\n<div class=\"admonition note\">\n<p class=\"admonition-title\">Note</p>\n<p>For .target <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_6x</span></code> or below, all threads in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> must execute the same <code class=\"docutils literal notranslate\"><span class=\"pre\">shfl.sync</span></code>\ninstruction in convergence, and only threads belonging to some <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> can be active when\nthe <code class=\"docutils literal notranslate\"><span class=\"pre\">shfl.sync</span></code> instruction is executed. 
Otherwise, the behavior is undefined.</p>\n</div>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// wait for all threads in membermask to arrive\nwait_for_specified_threads(membermask);\n\nlane[4:0]  = [Thread].laneid;  // position of thread in warp\nbval[4:0] = b[4:0];            // source lane or lane offset (0..31)\ncval[4:0] = c[4:0];            // clamp value\nsegmask[4:0] = c[12:8];\n\n// get value of source register a if thread is active and\n// guard predicate true, else unpredictable\nif (isActive(Thread) &amp;&amp; isGuardPredicateTrue(Thread)) {\n    SourceA[lane] = a;\n} else {\n    // Value of SourceA[lane] is unpredictable for\n    // inactive/predicated-off threads in warp\n}\nmaxLane = (lane[4:0] &amp; segmask[4:0]) | (cval[4:0] &amp; ~segmask[4:0]);\nminLane = (lane[4:0] &amp; segmask[4:0]);\n\nswitch (.mode) {\n    case .up:    j = lane - bval; pval = (j &gt;= maxLane); break;\n    case .down:  j = lane + bval; pval = (j &lt;= maxLane); break;\n    case .bfly:  j = lane ^ bval; pval = (j &lt;= maxLane); break;\n    case .idx:   j = minLane  | (bval[4:0] &amp; ~segmask[4:0]);\n                                 pval = (j &lt;= maxLane); break;\n}\nif (!pval) j = lane;  // copy from own lane\nd = SourceA[j];       // copy input a from lane j\nif (dest predicate selected)\n    p = pval;\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>shfl.sync.up.b32  Ry|p, Rx, 0x1,  0x0, 0xffffffff;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Register data shuffle within threads of a warp.\n\nSyntax\n\nshfl.sync.mode.b32  d[|p], a, b, c, membermask;\n\n.mode = { .up, .down, .bfly, .idx };\n\nDescription\n\nExchange register data between threads of a warp.\n\nshfl.sync will cause executing thread to wait until all non-exited threads corresponding to\n\nmembermask have executed shfl.sync with the same qualifiers and same membermask value\n\nbefore resuming execution.\n\nOperand membermask specifies a 32-bit integer which is a mask indicating threads participating\n\nin barrier where the bit position corresponds to thread\u2019s laneid.\n\nshfl.sync exchanges register data between threads in membermask.\n\nEach thread in the currently executing warp will compute a source lane index j based on input\n\noperands b and c and the mode. If the computed source lane index j is in range, the\n\nthread will copy the input operand a from lane j into its own destination register d;\n\notherwise, the thread will simply copy its own input a to destination d. The optional\n\ndestination predicate p is set to True if the computed source lane is in range, and\n\notherwise set to False.\n\nNote that an out of range value of b may still result in a valid computed source lane index\n\nj. 
In this case, a data transfer occurs and the destination predicate p is True.\n\nNote that results are undefined if a thread sources a register from an inactive thread or a thread\n\nthat is not in membermask.\n\nOperand b specifies a source lane or source lane offset, depending on the mode.\n\nOperand c contains two packed values specifying a mask for logically splitting warps into\n\nsub-segments and an upper bound for clamping the source lane index.\n\nThe behavior of shfl.sync is undefined if the executing thread is not in the membermask.\n\nNote\n\nFor .target sm_6x or below, all threads in membermask must execute the same shfl.sync\n\ninstruction in convergence, and only threads belonging to some membermask can be active when\n\nthe shfl.sync instruction is executed. Otherwise, the behavior is undefined.\n\nSemantics\n\n// wait for all threads in membermask to arrive\n\nwait_for_specified_threads(membermask);\n\nlane[4:0]  = [Thread].laneid;  // position of thread in warp\n\nbval[4:0] = b[4:0];            // source lane or lane offset (0..31)\n\ncval[4:0] = c[4:0];            // clamp value\n\nsegmask[4:0] = c[12:8];\n\n// get value of source register a if thread is active and\n\n// guard predicate true, else unpredictable\n\nif (isActive(Thread) && isGuardPredicateTrue(Thread)) {\n\n    SourceA[lane] = a;\n\n} else {\n\n    // Value of SourceA[lane] is unpredictable for\n\n    // inactive/predicated-off threads in warp\n\n}\n\nmaxLane = (lane[4:0] & segmask[4:0]) | (cval[4:0] & ~segmask[4:0]);\n\nminLane = (lane[4:0] & segmask[4:0]);\n\nswitch (.mode) {\n\n    case .up:    j = lane - bval; pval = (j >= maxLane); break;\n\n    case .down:  j = lane + bval; pval = (j <= maxLane); break;\n\n    case .bfly:  j = lane ^ bval; pval = (j <= maxLane); break;\n\n    case .idx:   j = minLane  | (bval[4:0] & ~segmask[4:0]);\n\n                                 pval = (j <= maxLane); break;\n\n}\n\nif (!pval) j = lane;  // copy from own lane\n\nd = SourceA[j];       // 
copy input a from lane j\n\nif (dest predicate selected)\n\n    p = pval;\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.0.\n\nTarget ISA Notes\n\nRequires sm_30 or higher.\n\nExamples\n\nshfl.sync.up.b32  Ry|p, Rx, 0x1,  0x0, 0xffffffff;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync"
            };

        case "shl":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shl\" target=\"_blank\" rel=\"noopener noreferrer\">shl <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Logic and Shift Instructions: shl</h1><section id=\"logic-and-shift-instructions-shl\">\n\n\n<p>Shift bits left, zero-fill on right.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>shl.type d, a, b;\n\n.type = { .b16, .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Shift <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> left by the amount specified by unsigned 32-bit value in <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a &lt;&lt; b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Shift amounts greater than the register width <em>N</em> are clamped to <em>N</em>.</p>\n<p>The sizes of the destination and first source operand must match, but not necessarily the type. The\n<code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> operand must be a 32-bit value, regardless of the instruction type.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Example</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>shl.b32  q,a,2;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Shift bits left, zero-fill on right.\n\nSyntax\n\nshl.type d, a, b;\n\n.type = { .b16, .b32, .b64 };\n\nDescription\n\nShift a left by the amount specified by unsigned 32-bit value in b.\n\nSemantics\n\nd = a << b;\n\nNotes\n\nShift amounts greater than the register width N are clamped to N.\n\nThe sizes of the destination and first source operand must match, but not necessarily the type. The\n\nb operand must be a 32-bit value, regardless of the instruction type.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExample\n\nshl.b32  q,a,2;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shl"
            };

        case "shr":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shr\" target=\"_blank\" rel=\"noopener noreferrer\">shr <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Logic and Shift Instructions: shr</h1><section id=\"logic-and-shift-instructions-shr\">\n\n\n<p>Shift bits right, sign or zero-fill on left.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>shr.type d, a, b;\n\n.type = { .b16, .b32, .b64,\n          .u16, .u32, .u64,\n          .s16, .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Shift <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> right by the amount specified by unsigned 32-bit value in <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>. Signed shifts fill with\nthe sign bit, unsigned and untyped shifts fill with <code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a &gt;&gt; b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Shift amounts greater than the register width <em>N</em> are clamped to <em>N</em>.</p>\n<p>The sizes of the destination and first source operand must match, but not necessarily the type. 
The\n<code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> operand must be a 32-bit value, regardless of the instruction type.</p>\n<p>Bit-size types are included for symmetry with <code class=\"docutils literal notranslate\"><span class=\"pre\">shl</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Example</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>shr.u16  c,a,2;\nshr.s32  i,i,1;\nshr.b16  k,i,j;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Shift bits right, sign or zero-fill on left.\n\nSyntax\n\nshr.type d, a, b;\n\n.type = { .b16, .b32, .b64,\n\n          .u16, .u32, .u64,\n\n          .s16, .s32, .s64 };\n\nDescription\n\nShift a right by the amount specified by unsigned 32-bit value in b. Signed shifts fill with\n\nthe sign bit, unsigned and untyped shifts fill with 0.\n\nSemantics\n\nd = a >> b;\n\nNotes\n\nShift amounts greater than the register width N are clamped to N.\n\nThe sizes of the destination and first source operand must match, but not necessarily the type. The\n\nb operand must be a 32-bit value, regardless of the instruction type.\n\nBit-size types are included for symmetry with shl.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExample\n\nshr.u16  c,a,2;\n\nshr.s32  i,i,1;\n\nshr.b16  k,i,j;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-shr"
            };

        case "sin":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sin\" target=\"_blank\" rel=\"noopener noreferrer\">sin(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: sin</h1><section id=\"floating-point-instructions-sin\">\n\n\n<p>Find the sine of a value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>sin.approx{.ftz}.f32  d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Find the sine of the angle <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> (in radians).</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = sin(a);\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sin.approx.f32</span></code> implements a fast approximation to sine.</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 63%\"/>\n<col style=\"width: 38%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Input</p></th>\n<th class=\"head\"><p>Result</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>-Inf</p></td>\n<td><p>NaN</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>-subnormal</p></td>\n<td><p>-0.0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>-0.0</p></td>\n<td><p>-0.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+0.0</p></td>\n<td><p>+0.0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>+subnormal</p></td>\n<td><p>+0.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+Inf</p></td>\n<td><p>NaN</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p>The maximum absolute error is 2<sup>-20.9</sup> in 
quadrant 00.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sin.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p>Subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sin.f32</span></code> introduced in PTX ISA version 1.0. Explicit modifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code>\nintroduced in PTX ISA version 1.4.</p>\n<p>For PTX ISA version 1.4 and later, the .approx modifier is required.</p>\n<p>For PTX ISA versions 1.0 through 1.3, <code class=\"docutils literal notranslate\"><span class=\"pre\">sin.f32</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">sin.approx.ftz.f32</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>sin.approx.ftz.f32  sa, a;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Find the sine of a value.\n\nSyntax\n\nsin.approx{.ftz}.f32  d, a;\n\nDescription\n\nFind the sine of the angle a (in radians).\n\nSemantics\n\nd = sin(a);\n\nNotes\n\nsin.approx.f32 implements a fast approximation to sine.\n\n\n\nInput\n\nResult\n\n-Inf\n\nNaN\n\n-subnormal\n\n-0.0\n\n-0.0\n\n-0.0\n\n+0.0\n\n+0.0\n\n+subnormal\n\n+0.0\n\n+Inf\n\nNaN\n\nNaN\n\nNaN\n\nThe maximum absolute error is 2-20.9 in quadrant 00.\n\nSubnormal numbers:\n\nsm_20+By default, subnormal numbers are supported.\n\nsin.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nsm_1xSubnormal inputs and results to sign-preserving zero.\n\nPTX ISA Notes\n\nsin.f32 introduced in PTX ISA version 1.0. Explicit modifiers .approx and .ftz\n\nintroduced in PTX ISA version 1.4.\n\nFor PTX ISA version 1.4 and later, the .approx modifier is required.\n\nFor PTX ISA versions 1.0 through 1.3, sin.f32 defaults to sin.approx.ftz.f32.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nsin.approx.ftz.f32  sa, a;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sin"
            };

        case "slct":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-slct\" target=\"_blank\" rel=\"noopener noreferrer\">slct <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Comparison and Selection Instructions: slct</h1><section id=\"comparison-and-selection-instructions-slct\">\n\n\n<p>Select one source operand, based on the sign of the third operand.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>slct.dtype.s32        d, a, b, c;\nslct{.ftz}.dtype.f32  d, a, b, c;\n\n.dtype = { .b16, .b32, .b64,\n           .u16, .u32, .u64,\n           .s16, .s32, .s64,\n                 .f32, .f64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Conditional selection. If <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> \u2265 0, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is stored in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, otherwise <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is stored in\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. 
Operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> are treated as a bitsize type of the same width as the first\ninstruction type; operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> must match the second instruction type (<code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>). The\nselected input is copied to the output without modification.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = (c &gt;= 0) ? a : b;\n</pre></div>\n</div>\n<p><strong>Floating Point Notes</strong></p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> comparisons, negative zero equals zero.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">slct.ftz.dtype.f32</span></code> flushes subnormal values of operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> to sign-preserving zero, and\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is selected.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">slct.dtype.f32</span></code> flushes subnormal values of operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> to sign-preserving zero, and operand\n<code class=\"docutils literal 
notranslate\"><span class=\"pre\">a</span></code> is selected.</p>\n</dd>\n</dl>\n<p>Modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code> applies only to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> comparisons.</p>\n<p>If operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>, the comparison is unordered and operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is selected.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">slct.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>slct.u32.s32  x, y, z, val;\nslct.ftz.u64.f32  A, B, C, fval;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Select one source operand, based on the sign of the third operand.\n\nSyntax\n\nslct.dtype.s32        d, a, b, c;\n\nslct{.ftz}.dtype.f32  d, a, b, c;\n\n.dtype = { .b16, .b32, .b64,\n\n           .u16, .u32, .u64,\n\n           .s16, .s32, .s64,\n\n                 .f32, .f64 };\n\nDescription\n\nConditional selection. If c \u2265 0, a is stored in d, otherwise b is stored in\n\nd. Operands d, a, and b are treated as a bitsize type of the same width as the first\n\ninstruction type; operand c must match the second instruction type (.s32 or .f32). The\n\nselected input is copied to the output without modification.\n\nSemantics\n\nd = (c >= 0) ? a : b;\n\nFloating Point Notes\n\nFor .f32 comparisons, negative zero equals zero.\n\nSubnormal numbers:\n\nsm_20+By default, subnormal numbers are supported.\n\nslct.ftz.dtype.f32 flushes subnormal values of operand c to sign-preserving zero, and\n\noperand a is selected.\n\nsm_1xslct.dtype.f32 flushes subnormal values of operand c to sign-preserving zero, and operand\n\na is selected.\n\nModifier .ftz applies only to .f32 comparisons.\n\nIf operand c is NaN, the comparison is unordered and operand b is selected.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nslct.f64 requires sm_13 or higher.\n\nExamples\n\nslct.u32.s32  x, y, z, val;\n\nslct.ftz.u64.f32  A, B, C, fval;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-slct"
            };

        case "smid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-smid\" target=\"_blank\" rel=\"noopener noreferrer\">smid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %smid</h1><section id=\"special-registers-smid\">\n\n\n<p>SM identifier.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %smid;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register that returns the processor (SM) identifier on which a\nparticular thread is executing. The SM identifier ranges from <code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code> to <code class=\"docutils literal notranslate\"><span class=\"pre\">%nsmid-1</span></code>. The SM\nidentifier numbering is not guaranteed to be contiguous.</p>\n<p><strong>Notes</strong></p>\n<p>Note that <code class=\"docutils literal notranslate\"><span class=\"pre\">%smid</span></code> is volatile and returns the location of a thread at the moment when read, but\nits value may change during execution, e.g. due to rescheduling of threads following\npreemption. <code class=\"docutils literal notranslate\"><span class=\"pre\">%smid</span></code> is intended mainly to enable profiling and diagnostic code to sample and log\ninformation such as work place mapping and load distribution.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32  %r, %smid;\n</pre></div>\n</div>\n</section>",
                "tooltip": "SM identifier.\n\nSyntax (predefined)\n\n.sreg .u32 %smid;\n\nDescription\n\nA predefined, read-only special register that returns the processor (SM) identifier on which a\n\nparticular thread is executing. The SM identifier ranges from 0 to %nsmid-1. The SM\n\nidentifier numbering is not guaranteed to be contiguous.\n\nNotes\n\nNote that %smid is volatile and returns the location of a thread at the moment when read, but\n\nits value may change during execution, e.g. due to rescheduling of threads following\n\npreemption. %smid is intended mainly to enable profiling and diagnostic code to sample and log\n\ninformation such as work place mapping and load distribution.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u32  %r, %smid;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-smid"
            };

        case "sqrt":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sqrt\" target=\"_blank\" rel=\"noopener noreferrer\">sqrt(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: sqrt</h1><section id=\"floating-point-instructions-sqrt\">\n\n\n<p>Take the square root of a value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>sqrt.approx{.ftz}.f32  d, a; // fast, approximate square root\nsqrt.rnd{.ftz}.f32     d, a; // IEEE 754 compliant rounding\nsqrt.rnd.f64           d, a; // IEEE 754 compliant rounding\n\n.rnd = { .rn, .rz, .rm, .rp };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute sqrt(<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>) and store the result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = sqrt(a);\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sqrt.approx.f32</span></code> implements a fast approximation to square root.</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 63%\"/>\n<col style=\"width: 38%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Input</p></th>\n<th class=\"head\"><p>Result</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>-Inf</p></td>\n<td><p>NaN</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>-normal</p></td>\n<td><p>NaN</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>-subnormal</p></td>\n<td><p>-0.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>-0.0</p></td>\n<td><p>-0.0</p></td>\n</tr>\n<tr 
class=\"row-even\"><td><p>+0.0</p></td>\n<td><p>+0.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+subnormal</p></td>\n<td><p>+0.0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>+Inf</p></td>\n<td><p>+Inf</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p><strong>Square root with IEEE 754 compliant rounding:</strong></p>\n<p>Rounding modifiers (no default):</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt><dd><p>mantissa LSB rounds to nearest even</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt><dd><p>mantissa LSB rounds towards zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code></dt><dd><p>mantissa LSB rounds towards negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt><dd><p>mantissa LSB rounds towards positive infinity</p>\n</dd>\n</dl>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sqrt.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sqrt.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sqrt.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sqrt.f32</span></code> and <code class=\"docutils literal notranslate\"><span 
class=\"pre\">sqrt.f64</span></code> introduced in PTX ISA version 1.0. <code class=\"docutils literal notranslate\"><span class=\"pre\">sqrt.rn.f64</span></code> and explicit\nmodifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.ftz</span></code> were introduced in PTX ISA version 1.4. General rounding\nmodifiers were added in PTX ISA version 2.0.</p>\n<p>For PTX ISA version 1.4 and later, one of <code class=\"docutils literal notranslate\"><span class=\"pre\">.approx</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.rnd</span></code> is required.</p>\n<p>For PTX ISA versions 1.0 through 1.3, <code class=\"docutils literal notranslate\"><span class=\"pre\">sqrt.f32</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">sqrt.approx.ftz.f32</span></code>, and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">sqrt.f64</span></code> defaults to <code class=\"docutils literal notranslate\"><span class=\"pre\">sqrt.rn.f64</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sqrt.approx.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sqrt.rnd.f32</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sqrt.rn.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher, or <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span> <span class=\"pre\">map_f64_to_f32</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sqrt.{rz,rm,rp}.f64</span></code> requires <code 
class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>sqrt.approx.ftz.f32  r,x;\nsqrt.rn.ftz.f32      r,x;\nsqrt.rn.f64          r,x;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Take the square root of a value.\n\nSyntax\n\nsqrt.approx{.ftz}.f32  d, a; // fast, approximate square root\n\nsqrt.rnd{.ftz}.f32     d, a; // IEEE 754 compliant rounding\n\nsqrt.rnd.f64           d, a; // IEEE 754 compliant rounding\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nCompute sqrt(a) and store the result in d.\n\nSemantics\n\nd = sqrt(a);\n\nNotes\n\nsqrt.approx.f32 implements a fast approximation to square root.\n\n\n\nInput\n\nResult\n\n-Inf\n\nNaN\n\n-normal\n\nNaN\n\n-subnormal\n\n-0.0\n\n-0.0\n\n-0.0\n\n+0.0\n\n+0.0\n\n+subnormal\n\n+0.0\n\n+Inf\n\n+Inf\n\nNaN\n\nNaN\n\nSquare root with IEEE 754 compliant rounding:\n\nRounding modifiers (no default):\n\n.rnmantissa LSB rounds to nearest even\n\n.rzmantissa LSB rounds towards zero\n\n.rmmantissa LSB rounds towards negative infinity\n\n.rpmantissa LSB rounds towards positive infinity\n\nSubnormal numbers:\n\nsm_20+By default, subnormal numbers are supported.\n\nsqrt.ftz.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nsm_1xsqrt.f64 supports subnormal numbers.\n\nsqrt.f32 flushes subnormal inputs and results to sign-preserving zero.\n\nPTX ISA Notes\n\nsqrt.f32 and sqrt.f64 introduced in PTX ISA version 1.0. sqrt.rn.f64 and explicit\n\nmodifiers .approx and .ftz were introduced in PTX ISA version 1.4. General rounding\n\nmodifiers were added in PTX ISA version 2.0.\n\nFor PTX ISA version 1.4 and later, one of .approx or .rnd is required.\n\nFor PTX ISA versions 1.0 through 1.3, sqrt.f32 defaults to sqrt.approx.ftz.f32, and\n\nsqrt.f64 defaults to sqrt.rn.f64.\n\nTarget ISA Notes\n\nsqrt.approx.f32 supported on all target architectures.\n\nsqrt.rnd.f32 requires sm_20 or higher.\n\nsqrt.rn.f64 requires sm_13 or higher, or .target map_f64_to_f32.\n\nsqrt.{rz,rm,rp}.f64 requires sm_20 or higher.\n\nExamples\n\nsqrt.approx.ftz.f32  r,x;\n\nsqrt.rn.ftz.f32      r,x;\n\nsqrt.rn.f64          r,x;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sqrt"
            };

        case "st":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st\" target=\"_blank\" rel=\"noopener noreferrer\">st <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st-async\" target=\"_blank\" rel=\"noopener noreferrer\">st.async <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Data Movement and Conversion Instructions: st</h1><section id=\"data-movement-and-conversion-instructions-st\">\n\n\n<p>Store a register variable to an addressable state space variable.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>st{.weak}{.ss}{.cop}{.level::cache_hint}{.vec}.type   [a], b{, cache-policy};\nst{.weak}{.ss}{.level::eviction_priority}{.level::cache_hint}{.vec}.type\n                                                      [a], b{, cache-policy};\nst.volatile{.ss}{.vec}.type                           [a], b;\nst.relaxed.scope{.ss}{.level::eviction_priority}{.level::cache_hint}{.vec}.type\n                                                      [a], b{, cache-policy};\nst.release.scope{.ss}{.level::eviction_priority}{.level::cache_hint}{.vec}.type\n                                                      [a], b{, cache-policy};\n\n.ss =                       { .global, .local, .param, .shared{::cta, ::cluster} };\n.level::eviction_priority = { .L1::evict_normal, .L1::evict_unchanged,\n                              .L1::evict_first, .L1::evict_last, .L1::no_allocate };\n.level::cache_hint =        { .L2::cache_hint };\n.cop =                      { .wb, .cg, .cs, .wt };\n.sem =                      { .relaxed, .release };\n.scope =                    { 
.cta, .cluster, .gpu, .sys };\n.vec =                      { .v2, .v4 };\n.type =                     { .b8, .b16, .b32, .b64,\n                              .u8, .u16, .u32, .u64,\n                              .s8, .s16, .s32, .s64,\n                              .f32, .f64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Store the value of register variable <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> in the location specified by the destination address\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> in specified state space. If no state space is given, perform the store using <a class=\"reference external\" href=\"#generic-addressing\">Generic\nAddressing</a>. Stores to const memory are illegal.</p>\n<p>If no sub-qualifier is specified with <code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> state space, then <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> is assumed by default.</p>\n<p>Supported addressing modes for operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and alignment requirements are described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses\nas Operands</a></p>\n<p>Instruction <code class=\"docutils literal notranslate\"><span class=\"pre\">st.param</span></code> used for passing arguments to device function cannot be predicated. 
See\n<a class=\"reference external\" href=\"#parameter-state-space\">Parameter State Space</a> and <a class=\"reference external\" href=\"#function-declarations-and-definitions\">Function Declarations and\nDefinitions</a> for descriptions of the proper use\nof <code class=\"docutils literal notranslate\"><span class=\"pre\">st.param</span></code>.</p>\n<p>The qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> indicate memory synchronization as described in the\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>. The <code class=\"docutils literal notranslate\"><span class=\"pre\">.scope</span></code> qualifier\nindicates the set of threads with which an <code class=\"docutils literal notranslate\"><span class=\"pre\">st.relaxed</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">st.release</span></code> instruction can directly\nsynchronize<sup>1</sup>. The <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> qualifier indicates a memory instruction with no\nsynchronization. The effects of this instruction become visible to other threads only when\nsynchronization is established by other means.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.volatile</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> qualifiers are mutually exclusive. 
When\nnone of these is specified, the <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> qualifier is assumed by default.</p>\n<p>An <code class=\"docutils literal notranslate\"><span class=\"pre\">st.volatile</span></code> operation is always performed and it will not be reordered with respect to other\n<code class=\"docutils literal notranslate\"><span class=\"pre\">volatile</span></code> operations to the same memory location. <code class=\"docutils literal notranslate\"><span class=\"pre\">st.volatile</span></code> has the same memory\nsynchronization semantics as <code class=\"docutils literal notranslate\"><span class=\"pre\">st.relaxed.sys</span></code>.</p>\n<p>The qualifiers <code class=\"docutils literal notranslate\"><span class=\"pre\">.volatile</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> may be used only with <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> spaces and with generic addressing, where the address points to <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> or\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared</span></code> space. Cache operations are not permitted with these qualifiers.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> specifies the eviction policy that will be used during\nmemory access.</p>\n<p>When the optional argument <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is specified, the qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is\nrequired. 
The 64-bit operand <code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> specifies the cache eviction policy that may be used\nduring the memory access.</p>\n<p>The qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> is only supported for <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space and for generic\naddressing where the address points to the <code class=\"docutils literal notranslate\"><span class=\"pre\">.global</span></code> state space.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">cache-policy</span></code> is a hint to the cache subsystem and may not always be respected. It is treated as\na performance hint only, and does not change the memory consistency behavior of the program.</p>\n<p><sup>1</sup> This synchronization is further extended to other threads through the transitive nature of\n<em>causality order</em>, as described in the memory consistency model.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a;                // named variable d\n*(&amp;a+immOffset) = b;            // variable-plus-offset\n*a = b;               // register\n*(a+immOffset) = b;   // register-plus-offset\n*(immAddr) = b;       // immediate address\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> must be in the <code class=\"docutils literal notranslate\"><span class=\"pre\">.reg</span></code> state space.</p>\n<p>A source register wider than the specified type may be used. The lower <code class=\"docutils literal notranslate\"><span class=\"pre\">n</span></code> bits corresponding to\nthe instruction-type width are stored to memory. 
See\n<a class=\"reference internal\" href=\"#operand-size-exceeding-instruction-type-size-relaxed-type-checking-rules-source-operands\"><span class=\"std std-numref\">Table 24</span></a>\nfor a description of these relaxed type-checking rules.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> data resulting from a <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> instruction may be stored using <code class=\"docutils literal notranslate\"><span class=\"pre\">st.b16</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> data may be stored using <code class=\"docutils literal notranslate\"><span class=\"pre\">st.b32</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>st introduced in PTX ISA version 1.0. <code class=\"docutils literal notranslate\"><span class=\"pre\">st.volatile</span></code> introduced in PTX ISA version 1.1.</p>\n<p>Generic addressing and cache operations introduced in PTX ISA version 2.0.</p>\n<p>Support for scope qualifier, <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> qualifiers introduced in PTX ISA\nversion 6.0.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> qualifiers introduced in PTX\nISA version 7.4.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope qualifier introduced in PTX ISA version 7.8.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> and <code class=\"docutils literal notranslate\"><span 
class=\"pre\">::cluster</span></code> sub-qualifiers introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">st.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>Support for scope qualifier, <code class=\"docutils literal notranslate\"><span class=\"pre\">.relaxed</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.weak</span></code> qualifiers require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or\nhigher.</p>\n<p>Generic addressing requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Cache operations require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::eviction_priority</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.level::cache_hint</span></code> qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> or higher.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope qualifier requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span class=\"pre\">::cta</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p>Sub-qualifier <code class=\"docutils literal notranslate\"><span 
class=\"pre\">::cluster</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>st.global.f32    [a],b;\nst.local.b32     [q+4],a;\nst.global.v4.s32 [p],Q;\nst.local.b32     [q+-8],a; // negative offset\nst.local.s32     [100],r7; // immediate address\n\ncvt.f16.f32      %r,%r;    // %r is 32-bit register\nst.b16           [fs],%r;  // store lower\nst.global.relaxed.sys.u32 [gbl], %r0;\nst.shared.release.cta.u32 [sh], %r1;\nst.global.relaxed.cluster.u32 [gbl], %r2;\nst.shared::cta.release.cta.u32 [sh + 4], %r1;\nst.shared::cluster.u32 [sh + 8], %r1;\n\nst.global.L1::no_allocate.f32 [p], a;\n\ncreatepolicy.fractional.L2::evict_last.b64 cache-policy, 0.25;\nst.global.L2::cache_hint.b32  [a], b, cache-policy;\n</pre></div>\n</div>\n</section>\n<h1>Data Movement and Conversion Instructions: st.async</h1><section id=\"data-movement-and-conversion-instructions-st-async\">\n\n\n<p>Asynchronous store operation on shared memory.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>st.async{.weak}{.ss}{.completion_mechanism}{.vec}.type [a], b, [mbar];\n\n.ss   =                 { .shared::cluster };\n.type =                 { .b32, .b64,\n                          .u32, .u64,\n                          .s32, .s64,\n                          .f32, .f64 };\n.vec  =                 { .v2, .v4 };\n.completion_mechanism = { .mbarrier::complete_tx::bytes };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">st.async</span></code> is a non-blocking instruction which initiates an asynchronous store operation that\nstores the value specified by source operand register <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> to the 
destination memory location\nspecified by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>The modifier <code class=\"docutils literal notranslate\"><span class=\"pre\">.completion_mechanism</span></code> specifies that upon completion of the asynchronous operation,\n<a class=\"reference external\" href=\"#parallel-synchronization-and-communication-instructions-mbarrier-complete-tx-operation\">complete-tx</a>\noperation, with <code class=\"docutils literal notranslate\"><span class=\"pre\">completeCount</span></code> argument equal to amount of data stored in bytes, will be\nperformed on the <em>mbarrier object</em> specified by the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code>.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> represents destination address and must be a register or of the form <code class=\"docutils literal notranslate\"><span class=\"pre\">register</span> <span class=\"pre\">+</span>\n<span class=\"pre\">immOff</span></code> as described in <a class=\"reference external\" href=\"#addresses-as-operands\">Addresses as Operands</a>.</p>\n<p>The shared memory addresses of destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and the <em>mbarrier object</em> <code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code>, must\nmeet all of the following conditions:</p>\n<ul class=\"simple\">\n<li><p>They belong to the same CTA.</p></li>\n<li><p>They are different to the CTA of the executing thread but must be within the same cluster.</p></li>\n</ul>\n<p>Otherwise, the behavior is undefined.</p>\n<p>The state space of the address <code class=\"docutils literal notranslate\"><span class=\"pre\">{.ss}</span></code>, if specified, is applicable to both operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> 
and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code>. If not specified, then <a class=\"reference external\" href=\"#generic-addressing\">Generic Addressing</a> is used for\nboth <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">mbar</span></code>. If the generic addresses specified do not fall within the address window of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.shared::cluster</span></code> state space, then the behaviour is undefined.</p>\n<p>The store operation in <code class=\"docutils literal notranslate\"><span class=\"pre\">st.async</span></code> is treated as a weak memory operation and the <em>complete_tx</em>\noperation on the mbarrier has <code class=\"docutils literal notranslate\"><span class=\"pre\">.release</span></code> semantics at the <code class=\"docutils literal notranslate\"><span class=\"pre\">.cluster</span></code> scope as described in the\n<a class=\"reference external\" href=\"#memory-consistency-model\">Memory Consistency Model</a>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 8.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>st.async.shared::cluster.mbarrier::complete_tx::bytes.u32 [addr], b, [mbar_addr]\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Data Movement and Conversion Instructions: st\n\n\n\nStore a register variable to an addressable state space variable.\n\nSyntax\n\nst{.weak}{.ss}{.cop}{.level::cache_hint}{.vec}.type   [a], b{, cache-policy};\n\nst{.weak}{.ss}{.level::eviction_priority}{.level::cache_hint}{.vec}.type\n\n                                                      [a], b{, cache-policy};\n\nst.volatile{.ss}{.vec}.type                           [a], b;\n\nst.relaxed.scope{.ss}{.level::evicti...\n\n=====Data Movement and Conversion Instructions: st.async\n\n\n\nAsynchronous store operation on shared memory.\n\nSyntax\n\nst.async{.weak}{.ss}{.completion_mechanism}{.vec}.type [a], b, [mbar];\n\n.ss   =                 { .shared::cluster };\n\n.type =                 { .b32, .b64,\n\n                          .u32, .u64,\n\n                          .s32, .s64,\n\n                          .f32, .f64 };\n\n.vec  =                 { .v2, .v4 };\n\n.completion_mechanism = { .mbarrier::... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st"
            };

        // PTX `stackrestore` (Stack Manipulation Instructions): documentation
        // entry returned for this opcode. The object carries three strings:
        // `html`, `tooltip` and `url`.
        // NOTE(review): this file lives under `generated/`, so this text is
        // presumably produced by a generator script — confirm before hand-editing.
        case "stackrestore":
            return {
                // Full HTML body: a "For more information" link to the NVIDIA
                // PTX reference followed by the instruction's documentation
                // section (syntax, description, semantics, notes, examples).
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stackrestore\" target=\"_blank\" rel=\"noopener noreferrer\">stackrestore <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Stack Manipulation Instructions: stackrestore</h1><section id=\"stack-manipulation-instructions-stackrestore\">\n\n\n<p>Update the stack pointer with a new value.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>stackrestore.type  a;\n\n.type = { .u32, .u64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Sets the current stack pointer to source register <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>When <code class=\"docutils literal notranslate\"><span class=\"pre\">stackrestore</span></code> is used with operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> written by a prior <code class=\"docutils literal notranslate\"><span class=\"pre\">stacksave</span></code> instruction, it\nwill effectively restore the state of stack as it was before <code class=\"docutils literal notranslate\"><span class=\"pre\">stacksave</span></code> was executed. Note that\nif <code class=\"docutils literal notranslate\"><span class=\"pre\">stackrestore</span></code> is used with an arbitrary value of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, it may cause corruption of stack\npointer. 
This implies that the correct use of this feature requires that <code class=\"docutils literal notranslate\"><span class=\"pre\">stackrestore.type</span> <span class=\"pre\">a</span></code> is\nused after <code class=\"docutils literal notranslate\"><span class=\"pre\">stacksave.type</span> <span class=\"pre\">a</span></code> without redefining the value of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> between them.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> has the same type as the instruction type.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>stackptr = a;\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.3.</p>\n<dl class=\"simple\">\n<dt>Preview Feature:</dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">stackrestore</span></code> is a preview feature in PTX ISA version 7.3. All details are subject to change\nwith no guarantees of backward compatibility on future PTX ISA versions or SM architectures.</p>\n</dd>\n</dl>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">stackrestore</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_52</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .u32 ra;\nstacksave.u32 ra;\n// Code that may modify stack pointer\n...\nstackrestore.u32 ra;\n</pre></div>\n</div>\n</section>",
                // Plain-text rendering of the same documentation (no markup);
                // the string ends with a literal " ..." marker.
                "tooltip": "Update the stack pointer with a new value.\n\nSyntax\n\nstackrestore.type  a;\n\n.type = { .u32, .u64 };\n\nDescription\n\nSets the current stack pointer to source register a.\n\nWhen stackrestore is used with operand a written by a prior stacksave instruction, it\n\nwill effectively restore the state of stack as it was before stacksave was executed. Note that\n\nif stackrestore is used with an arbitrary value of a, it may cause corruption of stack\n\npointer. This implies that the correct use of this feature requires that stackrestore.type a is\n\nused after stacksave.type a without redefining the value of a between them.\n\nOperand a has the same type as the instruction type.\n\nSemantics\n\nstackptr = a;\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.3.\n\nPreview Feature:stackrestore is a preview feature in PTX ISA version 7.3. All details are subject to change\n\nwith no guarantees of backward compatibility on future PTX ISA versions or SM architectures.\n\nTarget ISA Notes\n\nstackrestore requires sm_52 or higher.\n\nExamples\n\n.reg .u32 ra;\n\nstacksave.u32 ra;\n\n// Code that may modify stack pointer\n\n...\n\nstackrestore.u32 ra;\n\n ...",
                // Anchor of the `stackrestore` section in NVIDIA's PTX ISA docs.
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stackrestore"
            };

        // PTX `stacksave` (Stack Manipulation Instructions): documentation
        // entry returned for this opcode. The object carries three strings:
        // `html`, `tooltip` and `url`.
        // NOTE(review): this file lives under `generated/`, so this text is
        // presumably produced by a generator script — confirm before hand-editing.
        case "stacksave":
            return {
                // Full HTML body: a "For more information" link to the NVIDIA
                // PTX reference followed by the instruction's documentation
                // section (syntax, description, semantics, notes, examples).
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stacksave\" target=\"_blank\" rel=\"noopener noreferrer\">stacksave <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Stack Manipulation Instructions: stacksave</h1><section id=\"stack-manipulation-instructions-stacksave\">\n\n\n<p>Save the value of stack pointer into a register.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>stacksave.type  d;\n\n.type = { .u32, .u64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Copies the current value of stack pointer into the destination register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. Pointer returned by\n<code class=\"docutils literal notranslate\"><span class=\"pre\">stacksave</span></code> can be used in a subsequent <code class=\"docutils literal notranslate\"><span class=\"pre\">stackrestore</span></code> instruction to restore the stack\npointer. 
If <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is modified prior to use in <code class=\"docutils literal notranslate\"><span class=\"pre\">stackrestore</span></code> instruction, it may corrupt data in\nthe stack.</p>\n<p>Destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> has the same type as the instruction type.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = stackptr;\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.3.</p>\n<dl class=\"simple\">\n<dt>Preview Feature:</dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">stacksave</span></code> is a preview feature in PTX ISA version 7.3. All details are subject to change with\nno guarantees of backward compatibility on future PTX ISA versions or SM architectures.</p>\n</dd>\n</dl>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">stacksave</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_52</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.reg .u32 rd;\nstacksave.u32 rd;\n\n.reg .u64 rd1;\nstacksave.u64 rd1;\n</pre></div>\n</div>\n</section>",
                // Plain-text rendering of the same documentation (no markup);
                // the string ends with a literal " ..." marker.
                "tooltip": "Save the value of stack pointer into a register.\n\nSyntax\n\nstacksave.type  d;\n\n.type = { .u32, .u64 };\n\nDescription\n\nCopies the current value of stack pointer into the destination register d. Pointer returned by\n\nstacksave can be used in a subsequent stackrestore instruction to restore the stack\n\npointer. If d is modified prior to use in stackrestore instruction, it may corrupt data in\n\nthe stack.\n\nDestination operand d has the same type as the instruction type.\n\nSemantics\n\nd = stackptr;\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.3.\n\nPreview Feature:stacksave is a preview feature in PTX ISA version 7.3. All details are subject to change with\n\nno guarantees of backward compatibility on future PTX ISA versions or SM architectures.\n\nTarget ISA Notes\n\nstacksave requires sm_52 or higher.\n\nExamples\n\n.reg .u32 rd;\n\nstacksave.u32 rd;\n\n.reg .u64 rd1;\n\nstacksave.u64 rd1;\n\n ...",
                // Anchor of the `stacksave` section in NVIDIA's PTX ISA docs.
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-stacksave"
            };

        case "sub":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sub\" target=\"_blank\" rel=\"noopener noreferrer\">sub(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-sub\" target=\"_blank\" rel=\"noopener noreferrer\">sub(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sub\" target=\"_blank\" rel=\"noopener noreferrer\">sub(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-sub-cc\" target=\"_blank\" rel=\"noopener noreferrer\">sub.cc <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: sub</h1><section id=\"floating-point-instructions-sub\">\n\n\n<p>Subtract one value from another.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>sub{.rnd}{.ftz}{.sat}.f32  d, a, b;\nsub{.rnd}.f64              d, a, b;\n\n.rnd = { .rn, .rz, .rm, .rp };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs subtraction and writes the resulting value into a destination register.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a - b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Rounding modifiers:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal 
notranslate\"><span class=\"pre\">.rn</span></code></dt><dd><p>mantissa LSB rounds to nearest even</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt><dd><p>mantissa LSB rounds towards zero</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code></dt><dd><p>mantissa LSB rounds towards negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt><dd><p>mantissa LSB rounds towards positive infinity</p>\n</dd>\n</dl>\n<p>The default value of rounding modifier is <code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>. Note that a <code class=\"docutils literal notranslate\"><span class=\"pre\">sub</span></code> instruction with an explicit\nrounding modifier is treated conservatively by the code optimizer. A <code class=\"docutils literal notranslate\"><span class=\"pre\">sub</span></code> instruction with no\nrounding modifier defaults to round-to-nearest-even and may be optimized aggressively by the code\noptimizer. 
In particular, <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">sub</span></code> sequences with no rounding modifiers may be optimized to\nuse fused-multiply-add instructions on the target device.</p>\n<p>Subnormal numbers:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20+</span></code></dt><dd><p>By default, subnormal numbers are supported.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sub.ftz.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sub.f64</span></code> supports subnormal numbers.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sub.f32</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n</dl>\n<p>Saturation modifier:</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sub.sat.f32</span></code> clamps the result to [0.0, 1.0]. 
NaN results are flushed to <code class=\"docutils literal notranslate\"><span class=\"pre\">+0.0f</span></code>.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sub.f32</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sub.f64</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>Rounding modifiers have the following target requirements:</p>\n<dl>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.rz</span></code></dt><dd><p>available for all targets</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rm</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.rp</span></code></dt><dd><p>for <code class=\"docutils literal notranslate\"><span class=\"pre\">sub.f64</span></code>, requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> or higher.</p>\n<p>for <code class=\"docutils literal notranslate\"><span class=\"pre\">sub.f32</span></code>, requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n</dd>\n</dl>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>sub.f32 c,a,b;\nsub.rn.ftz.f32  f1,f2,f3;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: sub</h1><section id=\"half-precision-floating-point-instructions-sub\">\n\n\n<p>Subtract two values.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>sub{.rnd}{.ftz}{.sat}.f16   d, a, 
b;\nsub{.rnd}{.ftz}{.sat}.f16x2 d, a, b;\n\nsub{.rnd}.bf16   d, a, b;\nsub{.rnd}.bf16x2 d, a, b;\n\n.rnd = { .rn };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs subtraction and writes the resulting value into a destination register.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, forms input vectors by half word values from source\noperands. Half-word operands are then subtracted in parallel to produce <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>\nresult in destination.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> instruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type. 
For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>\ninstruction type, operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b16</span></code> type. For <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type,\noperands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (type == f16 || type == bf16) {\n    d = a - b;\n} else if (type == f16x2 || type == bf16x2) {\n    fA[0] = a[0:15];\n    fA[1] = a[16:31];\n    fB[0] = b[0:15];\n    fB[1] = b[16:31];\n    for (i = 0; i &lt; 2; i++) {\n         d[i] = fA[i] - fB[i];\n    }\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Rounding modifiers:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code></dt><dd><p>mantissa LSB rounds to nearest even</p>\n</dd>\n</dl>\n<p>The default value of rounding modifier is <code class=\"docutils literal notranslate\"><span class=\"pre\">.rn</span></code>. Note that a <code class=\"docutils literal notranslate\"><span class=\"pre\">sub</span></code> instruction with an explicit\nrounding modifier is treated conservatively by the code optimizer. 
A <code class=\"docutils literal notranslate\"><span class=\"pre\">sub</span></code> instruction with no\nrounding modifier defaults to round-to-nearest-even and may be optimized aggressively by the code\noptimizer. In particular, <code class=\"docutils literal notranslate\"><span class=\"pre\">mul</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">sub</span></code> sequences with no rounding modifiers may be optimized to\nuse fused-multiply-add instructions on the target device.</p>\n<dl class=\"simple\">\n<dt>Subnormal numbers:</dt><dd><p>By default, subnormal numbers are supported.\n<code class=\"docutils literal notranslate\"><span class=\"pre\">sub.ftz.{f16,</span> <span class=\"pre\">f16x2}</span></code> flushes subnormal inputs and results to sign-preserving zero.</p>\n</dd>\n<dt>Saturation modifier:</dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sub.sat.{f16,</span> <span class=\"pre\">f16x2}</span></code> clamps the result to [0.0, 1.0]. 
<code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code> results are flushed to <code class=\"docutils literal notranslate\"><span class=\"pre\">+0.0f</span></code>.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 4.2.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sub{.rnd}.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">sub{.rnd}.bf16x2</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_53</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sub{.rnd}.bf16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">sub{.rnd}.bf16x2</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// scalar f16 subtractions\nsub.f16        d0, a0, b0;\nsub.rn.f16     d1, a1, b1;\nsub.bf16       bd0, ba0, bb0;\nsub.rn.bf16    bd1, ba1, bb1;\n\n// SIMD f16 subtraction\ncvt.rn.f16.f32 h0, f0;\ncvt.rn.f16.f32 h1, f1;\ncvt.rn.f16.f32 h2, f2;\ncvt.rn.f16.f32 h3, f3;\nmov.b32  p1, {h0, h1};   // pack two f16 to 32bit f16x2\nmov.b32  p2, {h2, h3};   // pack two f16 to 32bit f16x2\nsub.f16x2  p3, p1, p2;   // SIMD f16x2 subtraction\n\n// SIMD bf16 subtraction\ncvt.rn.bf16x2.f32 p4, f4, f5; // Convert two f32 into packed bf16x2\ncvt.rn.bf16x2.f32 p5, f6, f7; // Convert two f32 into packed bf16x2\nsub.bf16x2  p6, p4, p5;       // SIMD bf16x2 subtraction\n\n// SIMD fp16 subtraction\nld.global.b32   f0, [addr];     // load 32 bit which hold packed f16x2\nld.global.b32   f1, [addr + 4]; // load 32 bit which hold packed f16x2\nsub.f16x2       f2, f0, f1;     // SIMD f16x2 
subtraction\n\n// SIMD bf16 subtraction\nld.global.b32   f3, [addr + 8];  // load 32 bit which hold packed bf16x2\nld.global.b32   f4, [addr + 12]; // load 32 bit which hold packed bf16x2\nsub.bf16x2      f5, f3, f4;      // SIMD bf16x2 subtraction\n</pre></div>\n</div>\n</section>\n<h1>Integer Arithmetic Instructions: sub</h1><section id=\"integer-arithmetic-instructions-sub\">\n\n\n<p>Subtract one value from another.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>sub.type       d, a, b;\nsub{.sat}.s32  d, a, b;     // .sat applies only to .s32\n\n.type = { .u16, .u32, .u64,\n          .s16, .s32, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs subtraction and writes the resulting value into a destination register.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a - b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>Saturation modifier:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.sat</span></code></dt><dd><p>limits result to <code class=\"docutils literal notranslate\"><span class=\"pre\">MININT..MAXINT</span></code> (no overflow) for the size of the operation. 
Applies only to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> type.</p>\n</dd>\n</dl>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>sub.s32 c,a,b;\n</pre></div>\n</div>\n</section>\n<h1>Extended-Precision Arithmetic Instructions: sub.cc</h1><section id=\"extended-precision-arithmetic-instructions-sub-cc\">\n\n\n<p>Subtract one value from another, with borrow-out.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>sub.cc.type  d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs integer subtraction and writes the borrow-out value into the condition code register.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a - b;\n</pre></div>\n</div>\n<p>borrow-out written to <code class=\"docutils literal notranslate\"><span class=\"pre\">CC.CF</span></code></p>\n<p><strong>Notes</strong></p>\n<p>No integer rounding modifiers.</p>\n<p>No saturation.</p>\n<p>Behavior is the same for unsigned and signed integers.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">sub.cc</span></code> introduced in PTX ISA version 1.2.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">sub.cc</span></code> introduced in PTX ISA version 4.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">sub.cc</span></code> is supported on all target architectures.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span 
class=\"pre\">sub.cc</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>@p  sub.cc.u32   x1,y1,z1;   // extended-precision subtraction\n@p  subc.cc.u32  x2,y2,z2;   // of two 128-bit values\n@p  subc.cc.u32  x3,y3,z3;\n@p  subc.u32     x4,y4,z4;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: sub\n\n\n\nSubtract one value from another.\n\nSyntax\n\nsub{.rnd}{.ftz}{.sat}.f32  d, a, b;\n\nsub{.rnd}.f64              d, a, b;\n\n.rnd = { .rn, .rz, .rm, .rp };\n\nDescription\n\nPerforms subtraction and writes the resulting value into a destination register.\n\nSemantics\n\nd = a - b;\n\nNotes\n\nRounding modifiers:\n\n.rnmantissa LSB rounds to nearest even\n\n.rzmantissa LSB rounds towards zero\n\n.rmmantissa LSB rounds towards negative in...\n\n=====Half Precision Floating Point Instructions: sub\n\n\n\nSubtract two values.\n\nSyntax\n\nsub{.rnd}{.ftz}{.sat}.f16   d, a, b;\n\nsub{.rnd}{.ftz}{.sat}.f16x2 d, a, b;\n\nsub{.rnd}.bf16   d, a, b;\n\nsub{.rnd}.bf16x2 d, a, b;\n\n.rnd = { .rn };\n\nDescription\n\nPerforms subtraction and writes the resulting value into a destination register.\n\nFor .f16x2 and .bf16x2 instruction type, forms input vectors by half word values from source\n\noperands. Half-word operands are then subtra...\n\n=====Integer Arithmetic Instructions: sub\n\n\n\nSubtract one value from another.\n\nSyntax\n\nsub.type       d, a, b;\n\nsub{.sat}.s32  d, a, b;     // .sat applies only to .s32\n\n.type = { .u16, .u32, .u64,\n\n          .s16, .s32, .s64 };\n\nDescription\n\nPerforms subtraction and writes the resulting value into a destination register.\n\nSemantics\n\nd = a - b;\n\nNotes\n\nSaturation modifier:\n\n.satlimits result to MININT..MAXINT (no overflow) for the size of the operation....\n\n=====Extended-Precision Arithmetic Instructions: sub.cc\n\n\n\nSubtract one value from another, with borrow-out.\n\nSyntax\n\nsub.cc.type  d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n\nDescription\n\nPerforms integer subtraction and writes the borrow-out value into the condition code register.\n\nSemantics\n\nd = a - b;\n\nborrow-out written to CC.CF\n\nNotes\n\nNo integer rounding modifiers.\n\nNo saturation.\n\nBehavior is the same for unsigned and signed integers.\n\nPTX ISA 
Notes\n\n32-bit s... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sub"
            };

        case "subc":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-subc\" target=\"_blank\" rel=\"noopener noreferrer\">subc <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Extended-Precision Arithmetic Instructions: subc</h1><section id=\"extended-precision-arithmetic-instructions-subc\">\n\n\n<p>Subtract one value from another, with borrow-in and optional borrow-out.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>subc{.cc}.type  d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Performs integer subtraction with borrow-in and optionally writes the borrow-out value into the\ncondition code register.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a  - (b + CC.CF);\n</pre></div>\n</div>\n<p>if <code class=\"docutils literal notranslate\"><span class=\"pre\">.cc</span></code> specified, borrow-out written to <code class=\"docutils literal notranslate\"><span class=\"pre\">CC.CF</span></code></p>\n<p><strong>Notes</strong></p>\n<p>No integer rounding modifiers.</p>\n<p>No saturation.</p>\n<p>Behavior is the same for unsigned and signed integers.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">subc</span></code> introduced in PTX ISA version 1.2.</p>\n<p>64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">subc</span></code> introduced in PTX ISA version 4.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>32-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">subc</span></code> is supported on all target architectures.</p>\n<p>64-bit <code class=\"docutils literal 
notranslate\"><span class=\"pre\">subc</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>@p  sub.cc.u32   x1,y1,z1;   // extended-precision subtraction\n@p  subc.cc.u32  x2,y2,z2;   // of two 128-bit values\n@p  subc.cc.u32  x3,y3,z3;\n@p  subc.u32     x4,y4,z4;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Subtract one value from another, with borrow-in and optional borrow-out.\n\nSyntax\n\nsubc{.cc}.type  d, a, b;\n\n.type = { .u32, .s32, .u64, .s64 };\n\nDescription\n\nPerforms integer subtraction with borrow-in and optionally writes the borrow-out value into the\n\ncondition code register.\n\nSemantics\n\nd = a  - (b + CC.CF);\n\nif .cc specified, borrow-out written to CC.CF\n\nNotes\n\nNo integer rounding modifiers.\n\nNo saturation.\n\nBehavior is the same for unsigned and signed integers.\n\nPTX ISA Notes\n\n32-bit subc introduced in PTX ISA version 1.2.\n\n64-bit subc introduced in PTX ISA version 4.3.\n\nTarget ISA Notes\n\n32-bit subc is supported on all target architectures.\n\n64-bit subc requires sm_20 or higher.\n\nExamples\n\n@p  sub.cc.u32   x1,y1,z1;   // extended-precision subtraction\n\n@p  subc.cc.u32  x2,y2,z2;   // of two 128-bit values\n\n@p  subc.cc.u32  x3,y3,z3;\n\n@p  subc.u32     x4,y4,z4;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-subc"
            };

        case "suld":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suld\" target=\"_blank\" rel=\"noopener noreferrer\">suld <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Surface Instructions: suld</h1><section id=\"surface-instructions-suld\">\n\n\n<p>Load from surface memory.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>suld.b.geom{.cop}.vec.dtype.clamp  d, [a, b];  // unformatted\n\n.geom  = { .1d, .2d, .3d, .a1d, .a2d };\n.cop   = { .ca, .cg, .cs, .cv };               // cache operation\n.vec   = { none, .v2, .v4 };\n.dtype = { .b8 , .b16, .b32, .b64 };\n.clamp = { .trap, .clamp, .zero };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">suld.b.{1d,2d,3d}</span></code></p>\n<p>Load from surface memory using a surface coordinate vector. The instruction loads data from the\nsurface named by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> at coordinates given by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> into destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. Operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is a <code class=\"docutils literal notranslate\"><span class=\"pre\">.surfref</span></code> variable or <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> register. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is a scalar or singleton tuple\nfor 1d surfaces; is a two-element vector for 2d surfaces; and is a four-element vector for 3d\nsurfaces, where the fourth element is ignored. 
Coordinate elements are of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">suld.b</span></code> performs an unformatted load of binary data. The lowest dimension coordinate represents a\nbyte offset into the surface and is not scaled, and the size of the data transfer matches the size\nof destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">suld.b.{a1d,a2d}</span></code></p>\n<p>Surface layer selection, followed by a load from the selected surface. The instruction first selects\na surface layer from the surface array named by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> using the index given by the first\nelement of the array coordinate vector <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>. The instruction then loads data from the selected\nsurface at coordinates given by the remaining elements of operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> into destination\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is a <code class=\"docutils literal notranslate\"><span class=\"pre\">.surfref</span></code> variable or <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> register. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is a bit-size\ntype vector or tuple containing an index into the array of surfaces followed by coordinates within\nthe selected surface, as follows:</p>\n<p>For 1d surface arrays, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v2.b32</span></code>. The first element is interpreted as an\nunsigned integer index (<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>) into the surface array, and the second element is interpreted as a\n1d surface coordinate of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>.</p>\n<p>For 2d surface arrays, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.b32</span></code>. The first element is interpreted as an\nunsigned integer index (<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>) into the surface array, and the next two elements are interpreted\nas 2d surface coordinates of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>. The fourth element is ignored.</p>\n<p>A surface base address is assumed to be aligned to a 16 byte boundary, and the address given by the\ncoordinate vector must be naturally aligned to a multiple of the access size. 
If an address is not\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.clamp</span></code> field specifies how to handle out-of-bounds addresses:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.trap</span></code></dt><dd><p>causes an execution trap on out-of-bounds addresses</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.clamp</span></code></dt><dd><p>loads data at the nearest surface location (sized appropriately)</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.zero</span></code></dt><dd><p>loads zero for out-of-bounds addresses</p>\n</dd>\n</dl>\n<p><strong>Indirect surface access</strong></p>\n<p>Beginning with PTX ISA version 3.1, indirect surface access is supported for target architecture\n<code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher. 
In indirect access, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is a <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> register holding the address of\na <code class=\"docutils literal notranslate\"><span class=\"pre\">.surfref</span></code> variable.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">suld.b.trap</span></code> introduced in PTX ISA version 1.5.</p>\n<p>Additional clamp modifiers and cache operations introduced in PTX ISA version 2.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">suld.b.3d</span></code> and<code class=\"docutils literal notranslate\"><span class=\"pre\">suld.b.{a1d,a2d}</span></code> introduced in PTX ISA version 3.0.</p>\n<p>Indirect surface access introduced in PTX ISA version 3.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">suld.b</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code> targets support only the <code class=\"docutils literal notranslate\"><span class=\"pre\">.trap</span></code> clamping modifier.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">suld.3d</span></code> and<code class=\"docutils literal notranslate\"><span class=\"pre\">suld.{a1d,a2d}</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Indirect surface access requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Cache operations require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>suld.b.1d.v4.b32.trap  
{s1,s2,s3,s4}, [surf_B, {x}];\nsuld.b.3d.v2.b64.trap  {r1,r2}, [surf_A, {x,y,z,w}];\nsuld.b.a1d.v2.b32      {r0,r1}, [surf_C, {idx,x}];\nsuld.b.a2d.b32         r0, [surf_D, {idx,x,y,z}];  // z ignored\n</pre></div>\n</div>\n</section>",
                "tooltip": "Load from surface memory.\n\nSyntax\n\nsuld.b.geom{.cop}.vec.dtype.clamp  d, [a, b];  // unformatted\n\n.geom  = { .1d, .2d, .3d, .a1d, .a2d };\n\n.cop   = { .ca, .cg, .cs, .cv };               // cache operation\n\n.vec   = { none, .v2, .v4 };\n\n.dtype = { .b8 , .b16, .b32, .b64 };\n\n.clamp = { .trap, .clamp, .zero };\n\nDescription\n\nsuld.b.{1d,2d,3d}\n\nLoad from surface memory using a surface coordinate vector. The instruction loads data from the\n\nsurface named by operand a at coordinates given by operand b into destination d. Operand\n\na is a .surfref variable or .u64 register. Operand b is a scalar or singleton tuple\n\nfor 1d surfaces; is a two-element vector for 2d surfaces; and is a four-element vector for 3d\n\nsurfaces, where the fourth element is ignored. Coordinate elements are of type .s32.\n\nsuld.b performs an unformatted load of binary data. The lowest dimension coordinate represents a\n\nbyte offset into the surface and is not scaled, and the size of the data transfer matches the size\n\nof destination operand d.\n\nsuld.b.{a1d,a2d}\n\nSurface layer selection, followed by a load from the selected surface. The instruction first selects\n\na surface layer from the surface array named by operand a using the index given by the first\n\nelement of the array coordinate vector b. The instruction then loads data from the selected\n\nsurface at coordinates given by the remaining elements of operand b into destination\n\nd. Operand a is a .surfref variable or .u64 register. Operand b is a bit-size\n\ntype vector or tuple containing an index into the array of surfaces followed by coordinates within\n\nthe selected surface, as follows:\n\nFor 1d surface arrays, operand b has type .v2.b32. The first element is interpreted as an\n\nunsigned integer index (.u32) into the surface array, and the second element is interpreted as a\n\n1d surface coordinate of type .s32.\n\nFor 2d surface arrays, operand b has type .v4.b32. 
The first element is interpreted as an\n\nunsigned integer index (.u32) into the surface array, and the next two elements are interpreted\n\nas 2d surface coordinates of type .s32. The fourth element is ignored.\n\nA surface base address is assumed to be aligned to a 16 byte boundary, and the address given by the\n\ncoordinate vector must be naturally aligned to a multiple of the access size. If an address is not\n\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\n\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.\n\nThe .clamp field specifies how to handle out-of-bounds addresses:\n\n.trapcauses an execution trap on out-of-bounds addresses\n\n.clamploads data at the nearest surface location (sized appropriately)\n\n.zeroloads zero for out-of-bounds addresses\n\nIndirect surface access\n\nBeginning with PTX ISA version 3.1, indirect surface access is supported for target architecture\n\nsm_20 or higher. In indirect access, operand a is a .u64 register holding the address of\n\na .surfref variable.\n\nPTX ISA Notes\n\nsuld.b.trap introduced in PTX ISA version 1.5.\n\nAdditional clamp modifiers and cache operations introduced in PTX ISA version 2.0.\n\nsuld.b.3d andsuld.b.{a1d,a2d} introduced in PTX ISA version 3.0.\n\nIndirect surface access introduced in PTX ISA version 3.1.\n\nTarget ISA Notes\n\nsuld.b supported on all target architectures.\n\nsm_1x targets support only the .trap clamping modifier.\n\nsuld.3d andsuld.{a1d,a2d} require sm_20 or higher.\n\nIndirect surface access requires sm_20 or higher.\n\nCache operations require sm_20 or higher.\n\nExamples\n\nsuld.b.1d.v4.b32.trap  {s1,s2,s3,s4}, [surf_B, {x}];\n\nsuld.b.3d.v2.b64.trap  {r1,r2}, [surf_A, {x,y,z,w}];\n\nsuld.b.a1d.v2.b32      {r0,r1}, [surf_C, {idx,x}];\n\nsuld.b.a2d.b32         r0, [surf_D, {idx,x,y,z}];  // z ignored\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suld"
            };

        case "suq":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suq\" target=\"_blank\" rel=\"noopener noreferrer\">suq <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Surface Instructions: suq</h1><section id=\"surface-instructions-suq\">\n\n\n<p>Query a surface attribute.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>suq.query.b32   d, [a];\n\n.query = { .width, .height, .depth,\n           .channel_data_type, .channel_order,\n           .array_size, .memory_layout };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Query an attribute of a surface. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is a <code class=\"docutils literal notranslate\"><span class=\"pre\">.surfref</span></code> variable or a <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> register.</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 12%\"/>\n<col style=\"width: 88%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Query</p></th>\n<th class=\"head\"><p>Returns</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.width</span></code></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.height</span></code></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.depth</span></code></p>\n</td>\n<td><p>value in elements</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.channel_data_type</span></code></p></td>\n<td><p>Unsigned integer corresponding to source language\u2019s channel data type enumeration. 
If the source language combines channel data type and channel order into a single enumeration type, that value is returned for both <code class=\"docutils literal notranslate\"><span class=\"pre\">channel_data_type</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">channel_order</span></code> queries.</p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.channel_order</span></code></p></td>\n<td><p>Unsigned integer corresponding to source language\u2019s channel order enumeration. If the source language combines channel data type and channel order into a single enumeration type, that value is returned for both <code class=\"docutils literal notranslate\"><span class=\"pre\">channel_data_type</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">channel_order</span></code> queries.</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.array_size</span></code></p></td>\n<td><p>For a surface array, number of surfaces in array, 0 otherwise.</p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.memory_layout</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">1</span></code> for surface with linear memory layout; <code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code> otherwise</p></td>\n</tr>\n</tbody>\n</table>\n<p><strong>Indirect surface access</strong></p>\n<p>Beginning with PTX ISA version 3.1, indirect surface access is supported for target architecture\n<code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher. 
In indirect access, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is a <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> register holding the address of\na <code class=\"docutils literal notranslate\"><span class=\"pre\">.surfref</span></code> variable.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.5.</p>\n<p>Channel data type and channel order queries added in PTX ISA version 2.1.</p>\n<p>Indirect surface access introduced in PTX ISA version 3.1.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.array_size</span></code> query was added in PTX ISA version 4.1.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.memory_layout</span></code> query was added in PTX ISA version 4.2.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p>Indirect surface access requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>suq.width.b32       %r1, [surf_A];\n</pre></div>\n</div>\n</section>",
                "tooltip": "Query a surface attribute.\n\nSyntax\n\nsuq.query.b32   d, [a];\n\n.query = { .width, .height, .depth,\n\n           .channel_data_type, .channel_order,\n\n           .array_size, .memory_layout };\n\nDescription\n\nQuery an attribute of a surface. Operand a is a .surfref variable or a .u64 register.\n\n\n\nQuery\n\nReturns\n\n.width\n\n.height\n\n.depth\n\nvalue in elements\n\n.channel_data_type\n\nUnsigned integer corresponding to source language\u2019s channel data type enumeration. If the source language combines channel data type and channel order into a single enumeration type, that value is returned for both channel_data_type and channel_order queries.\n\n.channel_order\n\nUnsigned integer corresponding to source language\u2019s channel order enumeration. If the source language combines channel data type and channel order into a single enumeration type, that value is returned for both channel_data_type and channel_order queries.\n\n.array_size\n\nFor a surface array, number of surfaces in array, 0 otherwise.\n\n.memory_layout\n\n1 for surface with linear memory layout; 0 otherwise\n\nIndirect surface access\n\nBeginning with PTX ISA version 3.1, indirect surface access is supported for target architecture\n\nsm_20 or higher. In indirect access, operand a is a .u64 register holding the address of\n\na .surfref variable.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.5.\n\nChannel data type and channel order queries added in PTX ISA version 2.1.\n\nIndirect surface access introduced in PTX ISA version 3.1.\n\nThe .array_size query was added in PTX ISA version 4.1.\n\nThe .memory_layout query was added in PTX ISA version 4.2.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nIndirect surface access requires sm_20 or higher.\n\nExamples\n\nsuq.width.b32       %r1, [surf_A];\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suq"
            };

        case "sured":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sured\" target=\"_blank\" rel=\"noopener noreferrer\">sured <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Surface Instructions: sured</h1><section id=\"surface-instructions-sured\">\n\n\n<p>Reduce surface memory.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>sured.b.op.geom.ctype.clamp  [a,b],c; // byte addressing\nsured.p.op.geom.ctype.clamp  [a,b],c; // sample addressing\n\n.op    = { .add, .min, .max, .and, .or };\n.geom  = { .1d, .2d, .3d };\n.ctype = { .u32, .u64, .s32, .b32, .s64 };  // for sured.b\n.ctype = { .b32, .b64 };                    // for sured.p\n.clamp = { .trap, .clamp, .zero };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Reduction to surface memory using a surface coordinate vector. The instruction performs a reduction\noperation with data from operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> to the surface named by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> at coordinates given by\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is a <code class=\"docutils literal notranslate\"><span class=\"pre\">.surfref</span></code> variable or <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> register. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is a\nscalar or singleton tuple for 1d surfaces; is a two-element vector for 2d surfaces; and is a\nfour-element vector for 3d surfaces, where the fourth element is ignored. 
Coordinate elements are of\ntype <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sured.b</span></code> performs an unformatted reduction on <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">.s64</span></code>\ndata. The lowest dimension coordinate represents a byte offset into the surface and is not\nscaled. Operation <code class=\"docutils literal notranslate\"><span class=\"pre\">add</span></code> applies to <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> types; <code class=\"docutils literal notranslate\"><span class=\"pre\">min</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">max</span></code>\napply to <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.s64</span></code> types; operations <code class=\"docutils literal notranslate\"><span class=\"pre\">and</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">or</span></code> apply to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> type.</p>\n<p><code class=\"docutils literal 
notranslate\"><span class=\"pre\">sured.p</span></code> performs a reduction on sample-addressed data. The lowest dimension coordinate\nrepresents a sample offset rather than a byte offset. The instruction type <code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code> is restricted to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">min</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">max</span></code> operations. For type <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code>, the data is interpreted as <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>\nbased on the surface sample format as follows: if the surface format contains <code class=\"docutils literal notranslate\"><span class=\"pre\">UINT</span></code> data, then\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> is assumed; if the surface format contains <code class=\"docutils literal notranslate\"><span class=\"pre\">SINT</span></code> data, then <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> is assumed. 
For\ntype <code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code>, if the surface format contains <code class=\"docutils literal notranslate\"><span class=\"pre\">UINT</span></code> data, then <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> is assumed; if the\nsurface format contains <code class=\"docutils literal notranslate\"><span class=\"pre\">SINT</span></code> data, then <code class=\"docutils literal notranslate\"><span class=\"pre\">.s64</span></code> is assumed.</p>\n<p>A surface base address is assumed to be aligned to a 16 byte boundary, and the address given by the\ncoordinate vector must be naturally aligned to a multiple of the access size. If an address is not\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.clamp</span></code> field specifies how to handle out-of-bounds addresses:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.trap</span></code></dt><dd><p>causes an execution trap on out-of-bounds addresses</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.clamp</span></code></dt><dd><p>stores data at the nearest surface location (sized appropriately)</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.zero</span></code></dt><dd><p>drops stores to out-of-bounds addresses</p>\n</dd>\n</dl>\n<p><strong>Indirect surface access</strong></p>\n<p>Beginning with PTX ISA version 3.1, indirect surface access is supported for target architecture\n<code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher. 
In indirect access, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is a <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> register holding the address of\na <code class=\"docutils literal notranslate\"><span class=\"pre\">.surfref</span></code> variable.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p>Indirect surface access introduced in PTX ISA version 3.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.s64</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code> types with <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code> operations introduced in PTX ISA version\n8.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>sured requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Indirect surface access requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.s64</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.b64</span></code> types with <code class=\"docutils literal notranslate\"><span class=\"pre\">.min</span></code>/<code class=\"docutils literal notranslate\"><span class=\"pre\">.max</span></code> operations requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_50</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>sured.b.add.2d.u32.trap  [surf_A, {x,y}], 
r1;\nsured.p.min.1d.u32.trap  [surf_B, {x}], r1;\nsured.b.max.1d.u64.trap  [surf_C, {x}], r1;\nsured.p.min.1d.b64.trap  [surf_D, {x}], r1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Reduce surface memory.\n\nSyntax\n\nsured.b.op.geom.ctype.clamp  [a,b],c; // byte addressing\n\nsured.p.op.geom.ctype.clamp  [a,b],c; // sample addressing\n\n.op    = { .add, .min, .max, .and, .or };\n\n.geom  = { .1d, .2d, .3d };\n\n.ctype = { .u32, .u64, .s32, .b32, .s64 };  // for sured.b\n\n.ctype = { .b32, .b64 };                    // for sured.p\n\n.clamp = { .trap, .clamp, .zero };\n\nDescription\n\nReduction to surface memory using a surface coordinate vector. The instruction performs a reduction\n\noperation with data from operand c to the surface named by operand a at coordinates given by\n\noperand b. Operand a is a .surfref variable or .u64 register. Operand b is a\n\nscalar or singleton tuple for 1d surfaces; is a two-element vector for 2d surfaces; and is a\n\nfour-element vector for 3d surfaces, where the fourth element is ignored. Coordinate elements are of\n\ntype .s32.\n\nsured.b performs an unformatted reduction on .u32, .s32, .b32, .u64, or .s64\n\ndata. The lowest dimension coordinate represents a byte offset into the surface and is not\n\nscaled. Operation add applies to .u32, .u64, and .s32 types; min and max\n\napply to .u32, .s32, .u64 and .s64 types; operations and and or apply to\n\n.b32 type.\n\nsured.p performs a reduction on sample-addressed data. The lowest dimension coordinate\n\nrepresents a sample offset rather than a byte offset. The instruction type .b64 is restricted to\n\nmin and max operations. For type .b32, the data is interpreted as .u32 or .s32\n\nbased on the surface sample format as follows: if the surface format contains UINT data, then\n\n.u32 is assumed; if the surface format contains SINT data, then .s32 is assumed. 
For\n\ntype .b64, if the surface format contains UINT data, then .u64 is assumed; if the\n\nsurface format contains SINT data, then .s64 is assumed.\n\nA surface base address is assumed to be aligned to a 16 byte boundary, and the address given by the\n\ncoordinate vector must be naturally aligned to a multiple of the access size. If an address is not\n\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\n\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.\n\nThe .clamp field specifies how to handle out-of-bounds addresses:\n\n.trapcauses an execution trap on out-of-bounds addresses\n\n.clampstores data at the nearest surface location (sized appropriately)\n\n.zerodrops stores to out-of-bounds addresses\n\nIndirect surface access\n\nBeginning with PTX ISA version 3.1, indirect surface access is supported for target architecture\n\nsm_20 or higher. In indirect access, operand a is a .u64 register holding the address of\n\na .surfref variable.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nIndirect surface access introduced in PTX ISA version 3.1.\n\n.u64/.s64/.b64 types with .min/.max operations introduced in PTX ISA version\n\n8.1.\n\nTarget ISA Notes\n\nsured requires sm_20 or higher.\n\nIndirect surface access requires sm_20 or higher.\n\n.u64/.s64/.b64 types with .min/.max operations requires sm_50 or higher.\n\nExamples\n\nsured.b.add.2d.u32.trap  [surf_A, {x,y}], r1;\n\nsured.p.min.1d.u32.trap  [surf_B, {x}], r1;\n\nsured.b.max.1d.u64.trap  [surf_C, {x}], r1;\n\nsured.p.min.1d.b64.trap  [surf_D, {x}], r1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sured"
            };

        case "sust":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust\" target=\"_blank\" rel=\"noopener noreferrer\">sust <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Surface Instructions: sust</h1><section id=\"surface-instructions-sust\">\n\n\n<p>Store to surface memory.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>sust.b.{1d,2d,3d}{.cop}.vec.ctype.clamp  [a, b], c;  // unformatted\nsust.p.{1d,2d,3d}.vec.b32.clamp          [a, b], c;  // formatted\n\nsust.b.{a1d,a2d}{.cop}.vec.ctype.clamp   [a, b], c;  // unformatted\n\n.cop   = { .wb, .cg, .cs, .wt };                     // cache operation\n.vec   = { none, .v2, .v4 };\n.ctype = { .b8 , .b16, .b32, .b64 };\n.clamp = { .trap, .clamp, .zero };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sust.{1d,2d,3d}</span></code></p>\n<p>Store to surface memory using a surface coordinate vector. The instruction stores data from operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> to the surface named by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> at coordinates given by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is\na <code class=\"docutils literal notranslate\"><span class=\"pre\">.surfref</span></code> variable or <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> register. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is a scalar or singleton tuple for 1d\nsurfaces; is a two-element vector for 2d surfaces; and is a four-element vector for 3d surfaces,\nwhere the fourth element is ignored. Coordinate elements are of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sust.b</span></code> performs an unformatted store of binary data. The lowest dimension coordinate represents\na byte offset into the surface and is not scaled. The size of the data transfer matches the size of\nsource operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sust.p</span></code> performs a formatted store of a vector of 32-bit data values to a surface sample. The\nsource vector elements are interpreted left-to-right as <code class=\"docutils literal notranslate\"><span class=\"pre\">R</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">G</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">B</span></code>, and <code class=\"docutils literal notranslate\"><span class=\"pre\">A</span></code> surface\ncomponents. These elements are written to the corresponding surface sample components. Source\nelements that do not occur in the surface sample are ignored. Surface sample components that do not\noccur in the source vector will be written with an unpredictable value. 
The lowest dimension\ncoordinate represents a sample offset rather than a byte offset.</p>\n<p>The source data interpretation is based on the surface sample format as follows: If the surface\nformat contains <code class=\"docutils literal notranslate\"><span class=\"pre\">UNORM</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">SNORM</span></code>, or <code class=\"docutils literal notranslate\"><span class=\"pre\">FLOAT</span></code> data, then <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> is assumed; if the surface\nformat contains <code class=\"docutils literal notranslate\"><span class=\"pre\">UINT</span></code> data, then <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> is assumed; if the surface format contains <code class=\"docutils literal notranslate\"><span class=\"pre\">SINT</span></code>\ndata, then <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> is assumed. The source data is then converted from this type to the surface\nsample format.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sust.b.{a1d,a2d}</span></code></p>\n<p>Surface layer selection, followed by an unformatted store to the selected surface. The instruction\nfirst selects a surface layer from the surface array named by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> using the index given by\nthe first element of the array coordinate vector <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>. The instruction then stores the data in\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> to the selected surface at coordinates given by the remaining elements of operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is a .surfref variable or <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> register. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is a bit-size type\nvector or tuple containing an index into the array of surfaces followed by coordinates within the\nselected surface, as follows:</p>\n<ul class=\"simple\">\n<li><p>For 1d surface arrays, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v2.b32</span></code>. The first element is interpreted as an\nunsigned integer index (<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>) into the surface array, and the second element is interpreted as\na 1d surface coordinate of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>.</p></li>\n<li><p>For 2d surface arrays, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.b32</span></code>. The first element is interpreted as an\nunsigned integer index (<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>) into the surface array, and the next two elements are\ninterpreted as 2d surface coordinates of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>. The fourth element is ignored.</p></li>\n</ul>\n<p>A surface base address is assumed to be aligned to a 16 byte boundary, and the address given by the\ncoordinate vector must be naturally aligned to a multiple of the access size. 
If an address is not\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.clamp</span></code> field specifies how to handle out-of-bounds addresses:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.trap</span></code></dt><dd><p>causes an execution trap on out-of-bounds addresses</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.clamp</span></code></dt><dd><p>stores data at the nearest surface location (sized appropriately)</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.zero</span></code></dt><dd><p>drops stores to out-of-bounds addresses</p>\n</dd>\n</dl>\n<p><strong>Indirect surface access</strong></p>\n<p>Beginning with PTX ISA version 3.1, indirect surface access is supported for target architecture\n<code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher. 
In indirect access, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is a <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> register holding the address of\na <code class=\"docutils literal notranslate\"><span class=\"pre\">.surfref</span></code> variable.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sust.b.trap</span></code> introduced in PTX ISA version 1.5.\u00a0 <code class=\"docutils literal notranslate\"><span class=\"pre\">sust.p</span></code>, additional clamp modifiers, and\ncache operations introduced in PTX ISA version 2.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sust.b.3d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">sust.b.{a1d,a2d}</span></code> introduced in PTX ISA version 3.0.</p>\n<p>Indirect surface access introduced in PTX ISA version 3.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sust.b</span></code> supported on all target architectures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_1x</span></code> targets support only the <code class=\"docutils literal notranslate\"><span class=\"pre\">.trap</span></code> clamping modifier.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sust.3d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">sust.{a1d,a2d}</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">sust.p</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Indirect surface access requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> 
or higher.</p>\n<p>Cache operations require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>sust.p.1d.v4.b32.trap  [surf_B, {x}], {f1,f2,f3,f4};\nsust.b.3d.v2.b64.trap  [surf_A, {x,y,z,w}], {r1,r2};\nsust.b.a1d.v2.b64      [surf_C, {idx,x}], {r1,r2};\nsust.b.a2d.b32         [surf_D, {idx,x,y,z}], r0;  // z ignored\n</pre></div>\n</div>\n</section>",
                "tooltip": "Store to surface memory.\n\nSyntax\n\nsust.b.{1d,2d,3d}{.cop}.vec.ctype.clamp  [a, b], c;  // unformatted\n\nsust.p.{1d,2d,3d}.vec.b32.clamp          [a, b], c;  // formatted\n\nsust.b.{a1d,a2d}{.cop}.vec.ctype.clamp   [a, b], c;  // unformatted\n\n.cop   = { .wb, .cg, .cs, .wt };                     // cache operation\n\n.vec   = { none, .v2, .v4 };\n\n.ctype = { .b8 , .b16, .b32, .b64 };\n\n.clamp = { .trap, .clamp, .zero };\n\nDescription\n\nsust.{1d,2d,3d}\n\nStore to surface memory using a surface coordinate vector. The instruction stores data from operand\n\nc to the surface named by operand a at coordinates given by operand b. Operand a is\n\na .surfref variable or .u64 register. Operand b is a scalar or singleton tuple for 1d\n\nsurfaces; is a two-element vector for 2d surfaces; and is a four-element vector for 3d surfaces,\n\nwhere the fourth element is ignored. Coordinate elements are of type .s32.\n\nsust.b performs an unformatted store of binary data. The lowest dimension coordinate represents\n\na byte offset into the surface and is not scaled. The size of the data transfer matches the size of\n\nsource operand c.\n\nsust.p performs a formatted store of a vector of 32-bit data values to a surface sample. The\n\nsource vector elements are interpreted left-to-right as R, G, B, and A surface\n\ncomponents. These elements are written to the corresponding surface sample components. Source\n\nelements that do not occur in the surface sample are ignored. Surface sample components that do not\n\noccur in the source vector will be written with an unpredictable value. 
The lowest dimension\n\ncoordinate represents a sample offset rather than a byte offset.\n\nThe source data interpretation is based on the surface sample format as follows: If the surface\n\nformat contains UNORM, SNORM, or FLOAT data, then .f32 is assumed; if the surface\n\nformat contains UINT data, then .u32 is assumed; if the surface format contains SINT\n\ndata, then .s32 is assumed. The source data is then converted from this type to the surface\n\nsample format.\n\nsust.b.{a1d,a2d}\n\nSurface layer selection, followed by an unformatted store to the selected surface. The instruction\n\nfirst selects a surface layer from the surface array named by operand a using the index given by\n\nthe first element of the array coordinate vector b. The instruction then stores the data in\n\noperand c to the selected surface at coordinates given by the remaining elements of operand\n\nb. Operand a is a .surfref variable or .u64 register. Operand b is a bit-size type\n\nvector or tuple containing an index into the array of surfaces followed by coordinates within the\n\nselected surface, as follows:\n\nFor 1d surface arrays, operand b has type .v2.b32. The first element is interpreted as an\n\nunsigned integer index (.u32) into the surface array, and the second element is interpreted as\n\na 1d surface coordinate of type .s32.\n\nFor 2d surface arrays, operand b has type .v4.b32. The first element is interpreted as an\n\nunsigned integer index (.u32) into the surface array, and the next two elements are\n\ninterpreted as 2d surface coordinates of type .s32. The fourth element is ignored.\n\nA surface base address is assumed to be aligned to a 16 byte boundary, and the address given by the\n\ncoordinate vector must be naturally aligned to a multiple of the access size. 
If an address is not\n\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\n\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.\n\nThe .clamp field specifies how to handle out-of-bounds addresses:\n\n.trapcauses an execution trap on out-of-bounds addresses\n\n.clampstores data at the nearest surface location (sized appropriately)\n\n.zerodrops stores to out-of-bounds addresses\n\nIndirect surface access\n\nBeginning with PTX ISA version 3.1, indirect surface access is supported for target architecture\n\nsm_20 or higher. In indirect access, operand a is a .u64 register holding the address of\n\na .surfref variable.\n\nPTX ISA Notes\n\nsust.b.trap introduced in PTX ISA version 1.5.\u00a0 sust.p ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-sust"
            };

        case "szext":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-szext\" target=\"_blank\" rel=\"noopener noreferrer\">szext(int) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Integer Arithmetic Instructions: szext</h1><section id=\"integer-arithmetic-instructions-szext\">\n\n\n<p>Sign-extend or Zero-extend.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>szext.mode.type  d, a, b;\n\n.mode = { .clamp, .wrap };\n.type = { .u32, .s32 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Sign-extends or zero-extends an N-bit value from operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> where N is specified in operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>. The resulting value is stored in the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p>For the <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> instruction type, the value in <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is treated as an N-bit signed value and the\nmost significant bit of this N-bit value is replicated up to bit 31. For the <code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code> instruction\ntype, the value in <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is treated as an N-bit unsigned number and is zero-extended to 32\nbits. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is an unsigned 32-bit value.</p>\n<p>If the value of N is 0, then the result of <code class=\"docutils literal notranslate\"><span class=\"pre\">szext</span></code> is 0. If the value of N is 32 or higher, then\nthe result of <code class=\"docutils literal notranslate\"><span class=\"pre\">szext</span></code> depends upon the value of the <code class=\"docutils literal notranslate\"><span class=\"pre\">.mode</span></code> qualifier as follows:</p>\n<ul class=\"simple\">\n<li><p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.mode</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">.clamp</span></code>, then the result is the same as the source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p></li>\n<li><p>If <code class=\"docutils literal notranslate\"><span class=\"pre\">.mode</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">.wrap</span></code>, then the result is computed using the wrapped value of N.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>b1        = b &amp; 0x1f;\ntoo_large = (b &gt;= 32 &amp;&amp; .mode == .clamp) ? true : false;\nmask      = too_large ? 0 : (~0) &lt;&lt; b1;\nsign_pos  = (b1 - 1) &amp; 0x1f;\n\nif (b1 == 0 || too_large || .type != .s32) {\n    sign_bit = false;\n} else {\n    sign_bit = (a &gt;&gt; sign_pos) &amp; 1;\n}\nd = (a &amp; ~mask) | (sign_bit ? 
mask | 0);\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.6.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">szext</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>szext.clamp.s32 rd, ra, rb;\nszext.wrap.u32  rd, 0xffffffff, 0; // Result is 0.\n</pre></div>\n</div>\n</section>",
                "tooltip": "Sign-extend or Zero-extend.\n\nSyntax\n\nszext.mode.type  d, a, b;\n\n.mode = { .clamp, .wrap };\n\n.type = { .u32, .s32 };\n\nDescription\n\nSign-extends or zero-extends an N-bit value from operand a where N is specified in operand\n\nb. The resulting value is stored in the destination operand d.\n\nFor the .s32 instruction type, the value in a is treated as an N-bit signed value and the\n\nmost significant bit of this N-bit value is replicated up to bit 31. For the .u32 instruction\n\ntype, the value in a is treated as an N-bit unsigned number and is zero-extended to 32\n\nbits. Operand b is an unsigned 32-bit value.\n\nIf the value of N is 0, then the result of szext is 0. If the value of N is 32 or higher, then\n\nthe result of szext depends upon the value of the .mode qualifier as follows:\n\nIf .mode is .clamp, then the result is the same as the source operand a.\n\nIf .mode is .wrap, then the result is computed using the wrapped value of N.\n\nSemantics\n\nb1        = b & 0x1f;\n\ntoo_large = (b >= 32 && .mode == .clamp) ? true : false;\n\nmask      = too_large ? 0 : (~0) << b1;\n\nsign_pos  = (b1 - 1) & 0x1f;\n\nif (b1 == 0 || too_large || .type != .s32) {\n\n    sign_bit = false;\n\n} else {\n\n    sign_bit = (a >> sign_pos) & 1;\n\n}\n\nd = (a & ~mask) | (sign_bit ? mask | 0);\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 7.6.\n\nTarget ISA Notes\n\nszext requires sm_70 or higher.\n\nExamples\n\nszext.clamp.s32 rd, ra, rb;\n\nszext.wrap.u32  rd, 0xffffffff, 0; // Result is 0.\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-szext"
            };

        case "tanh":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-tanh\" target=\"_blank\" rel=\"noopener noreferrer\">tanh(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>, <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-tanh\" target=\"_blank\" rel=\"noopener noreferrer\">tanh(fp16) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: tanh</h1><section id=\"floating-point-instructions-tanh\">\n\n\n<p>Find the hyperbolic tangent of a value (in radians)</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>tanh.approx.f32 d, a;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Take hyperbolic tangent value of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>The operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> are of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = tanh(a);\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tanh.approx.f32</span></code> implements a fast approximation to FP32 hyperbolic-tangent.</p>\n<p>Results of <code class=\"docutils literal notranslate\"><span class=\"pre\">tanh</span></code> for various corner-case inputs are as follows:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 
43%\"/>\n<col style=\"width: 57%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Input</p></th>\n<th class=\"head\"><p>Result</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>-Inf</p></td>\n<td><p>-1.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>-subnormal</p></td>\n<td><p>Same as input</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>-0.0</p></td>\n<td><p>-0.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+0.0</p></td>\n<td><p>+0.0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>+subnormal</p></td>\n<td><p>Same as input</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+Inf</p></td>\n<td><p>1.0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p>The subnormal numbers are supported.</p>\n<div class=\"admonition note\">\n<p class=\"admonition-title\">Note</p>\n<p>The subnormal inputs gets passed through to the output since the value of <code class=\"docutils literal notranslate\"><span class=\"pre\">tanh(x)</span></code> for small\nvalues of <code class=\"docutils literal notranslate\"><span class=\"pre\">x</span></code> is approximately the same as <code class=\"docutils literal notranslate\"><span class=\"pre\">x</span></code>.</p>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_75</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>tanh.approx.f32 sa, a;\n</pre></div>\n</div>\n</section>\n<h1>Half Precision Floating Point Instructions: tanh</h1><section id=\"half-precision-floating-point-instructions-tanh\">\n\n\n<p>Find the hyperbolic tangent of a value (in radians)</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>tanh.approx.type d, 
a;\n\n.type = {.f16, .f16x2, .bf16, .bf16x2}\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Take hyperbolic tangent value of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>.</p>\n<p>The type of operands <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> are as specified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.type</span></code>.</p>\n<p>For <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code> instruction type, each of the half-word operands are operated in\nparallel and the results are packed appropriately into a <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16x2</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>if (.type == .f16 || .type == .bf16) {\n  d = tanh(a)\n} else if (.type == .f16x2 || .type == .bf16x2) {\n  fA[0] = a[0:15];\n  fA[1] = a[16:31];\n  d[0] = tanh(fA[0])\n  d[1] = tanh(fA[1])\n}\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tanh.approx.{f16,</span> <span class=\"pre\">f16x2,</span> <span class=\"pre\">bf16,</span> <span class=\"pre\">bf16x2}</span></code> implements an approximate hyperbolic tangent in the\ntarget format.</p>\n<p>Results of <code class=\"docutils literal notranslate\"><span class=\"pre\">tanh</span></code> for various corner-case inputs are as follows:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 45%\"/>\n<col style=\"width: 55%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th 
class=\"head\"><p>Input</p></th>\n<th class=\"head\"><p>Result</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p>-Inf</p></td>\n<td><p>-1.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>-0.0</p></td>\n<td><p>-0.0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>+0.0</p></td>\n<td><p>+0.0</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p>+Inf</p></td>\n<td><p>1.0</p></td>\n</tr>\n<tr class=\"row-even\"><td><p>NaN</p></td>\n<td><p>NaN</p></td>\n</tr>\n</tbody>\n</table>\n<p>The maximum absolute error for <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> type is 2-10.987. The maximum absolute error for <code class=\"docutils literal notranslate\"><span class=\"pre\">.bf16</span></code>\ntype is 2-8.</p>\n<p>The subnormal numbers are supported.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 7.0.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tanh.approx.{bf16/bf16x2}</span></code> introduced in PTX ISA version 7.8.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_75</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tanh.approx.{bf16/bf16x2}</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>tanh.approx.f16    h1, h0;\ntanh.approx.f16x2  hd1, hd0;\ntanh.approx.bf16   b1, b0;\ntanh.approx.bf16x2 hb1, hb0;\n</pre></div>\n</div>\n</section>",
                "tooltip": "=====Floating Point Instructions: tanh\n\n\n\nFind the hyperbolic tangent of a value (in radians)\n\nSyntax\n\ntanh.approx.f32 d, a;\n\nDescription\n\nTake hyperbolic tangent value of a.\n\nThe operands d and a are of type .f32.\n\nSemantics\n\nd = tanh(a);\n\nNotes\n\ntanh.approx.f32 implements a fast approximation to FP32 hyperbolic-tangent.\n\nResults of tanh for various corner-case inputs are as follows:\n\n\n\nInput\n\nResult\n\n-Inf\n\n-1.0\n\n-subnormal\n\nSame as input\n\n-0.0\n\n-0.0\n\n+0.0\n\n...\n\n=====Half Precision Floating Point Instructions: tanh\n\n\n\nFind the hyperbolic tangent of a value (in radians)\n\nSyntax\n\ntanh.approx.type d, a;\n\n.type = {.f16, .f16x2, .bf16, .bf16x2}\n\nDescription\n\nTake hyperbolic tangent value of a.\n\nThe type of operands d and a are as specified by .type.\n\nFor .f16x2 or .bf16x2 instruction type, each of the half-word operands are operated in\n\nparallel and the results are packed appropriately into a .f16x2 or .bf16x2.\n\nSemantics\n\nif... ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-tanh"
            };

        case "target":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#ptx-module-directives-target\" target=\"_blank\" rel=\"noopener noreferrer\">target <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>PTX Module Directives: .target</h1><section id=\"ptx-module-directives-target\">\n\n\n<p>Architecture and Platform target.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.target stringlist         // comma separated list of target specifiers\nstring = { sm_90a, sm_90,               // sm_9x target architectures\n           sm_80, sm_86, sm_87, sm_89,  // sm_8x target architectures\n           sm_70, sm_72, sm_75,         // sm_7x target architectures\n           sm_60, sm_61, sm_62,         // sm_6x target architectures\n           sm_50, sm_52, sm_53,         // sm_5x target architectures\n           sm_30, sm_32, sm_35, sm_37,  // sm_3x target architectures\n           sm_20,                       // sm_2x target architectures\n           sm_10, sm_11, sm_12, sm_13,  // sm_1x target architectures\n           texmode_unified, texmode_independent,   // texturing mode\n           debug,                                  // platform option\n           map_f64_to_f32 };                       // platform option\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Specifies the set of features in the target architecture for which the current PTX code was\ngenerated. In general, generations of SM architectures follow an <em>onion layer</em> model, where each\ngeneration adds new features and retains all features of previous generations. 
The onion layer model\nallows the PTX code generated for a given target to be run on later generation devices.</p>\n<p>Target architectures with suffix \u201c<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>\u201d, such as <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90a</span></code>, include architecture-accelerated\nfeatures that are supported on the specified architecture only, hence such targets do not follow the\nonion layer model. Therefore, PTX code generated for such targets cannot be run on later generation\ndevices. Architecture-accelerated features can only be used with targets that support these\nfeatures.</p>\n<p><strong>Semantics</strong></p>\n<p>Each PTX module must begin with a <code class=\"docutils literal notranslate\"><span class=\"pre\">.version</span></code> directive, immediately followed by a <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span></code>\ndirective containing a target architecture and optional platform options. A <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span></code> directive\nspecifies a single target architecture, but subsequent <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span></code> directives can be used to change\nthe set of target features allowed during parsing. 
A program with multiple <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span></code> directives\nwill compile and run only on devices that support all features of the highest-numbered architecture\nlisted in the program.</p>\n<p>PTX features are checked against the specified target architecture, and an error is generated if an\nunsupported feature is used.\u00a0 The following table summarizes the features in PTX that vary according\nto target architecture.</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 13%\"/>\n<col style=\"width: 88%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Target</p></th>\n<th class=\"head\"><p>Description</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code></p></td>\n<td><p>Baseline feature set for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> architecture.</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90a</span></code></p></td>\n<td><p>Adds support for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90a</span></code> accelerated <code class=\"docutils literal notranslate\"><span class=\"pre\">wgmma</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">setmaxnreg</span></code> instructions.</p></td>\n</tr>\n</tbody>\n</table>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 13%\"/>\n<col style=\"width: 87%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Target</p></th>\n<th class=\"head\"><p>Description</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code></p></td>\n<td><p>Baseline feature set for <code class=\"docutils literal 
notranslate\"><span class=\"pre\">sm_80</span></code> architecture.</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_86</span></code></p></td>\n<td><p>Adds support for <code class=\"docutils literal notranslate\"><span class=\"pre\">.xorsign</span></code> modifier on <code class=\"docutils literal notranslate\"><span class=\"pre\">min</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">max</span></code> instructions.</p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_87</span></code></p></td>\n<td><p>Baseline feature set for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_86</span></code> architecture.</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_89</span></code></p></td>\n<td><p>Baseline feature set for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_86</span></code> architecture.</p></td>\n</tr>\n</tbody>\n</table>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 11%\"/>\n<col style=\"width: 89%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Target</p></th>\n<th class=\"head\"><p>Description</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code></p></td>\n<td><p>Baseline feature set for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> architecture.</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_72</span></code></p></td>\n<td><p>Adds support for integer multiplicand and accumulator matrices in <code class=\"docutils literal notranslate\"><span class=\"pre\">wmma</span></code> instructions.</p>\n<p>Adds support for <code class=\"docutils 
literal notranslate\"><span class=\"pre\">cvt.pack</span></code> instruction.</p>\n</td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_75</span></code></p></td>\n<td><p>Adds support for sub-byte integer and single-bit multiplicant matrices in <code class=\"docutils literal notranslate\"><span class=\"pre\">wmma</span></code> instructions.</p>\n<p>Adds support for <code class=\"docutils literal notranslate\"><span class=\"pre\">ldmatrix</span></code> instruction.</p>\n<p>Adds support for <code class=\"docutils literal notranslate\"><span class=\"pre\">movmatrix</span></code> instruction.</p>\n<p>Adds support for <code class=\"docutils literal notranslate\"><span class=\"pre\">tanh</span></code> instruction.</p>\n</td>\n</tr>\n</tbody>\n</table>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 18%\"/>\n<col style=\"width: 82%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Target</p></th>\n<th class=\"head\"><p>Description</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_60</span></code></p></td>\n<td><p>Baseline feature set for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_60</span></code> architecture.</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_61</span></code></p></td>\n<td><p>Adds support for <code class=\"docutils literal notranslate\"><span class=\"pre\">dp2a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">dp4a</span></code> instructions.</p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_62</span></code></p></td>\n<td><p>Baseline feature set for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_61</span></code> 
architecture.</p></td>\n</tr>\n</tbody>\n</table>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 11%\"/>\n<col style=\"width: 89%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Target</p></th>\n<th class=\"head\"><p>Description</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_50</span></code></p></td>\n<td><p>Baseline feature set for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_50</span></code> architecture.</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_52</span></code></p></td>\n<td><p>Baseline feature set for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_50</span></code> architecture.</p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_53</span></code></p></td>\n<td><p>Adds support for arithmetic, comparsion and texture instructions for <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> types.</p></td>\n</tr>\n</tbody>\n</table>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 17%\"/>\n<col style=\"width: 83%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Target</p></th>\n<th class=\"head\"><p>Description</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code></p></td>\n<td><p>Baseline feature set for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> architecture.</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_32</span></code></p></td>\n<td><p>Adds 64-bit 
<code class=\"docutils literal notranslate\"><span class=\"pre\">{atom,red}.{and,or,xor,min,max}</span></code>\ninstructions.</p>\n<p>Adds <code class=\"docutils literal notranslate\"><span class=\"pre\">shf</span></code> instruction.</p>\n<p>Adds <code class=\"docutils literal notranslate\"><span class=\"pre\">ld.global.nc</span></code> instruction.</p>\n</td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_35</span></code></p></td>\n<td><p>Adds support for CUDA Dynamic Parallelism.</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_37</span></code></p></td>\n<td><p>Baseline feature set for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_35</span></code> architecture.</p></td>\n</tr>\n</tbody>\n</table>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 19%\"/>\n<col style=\"width: 81%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Target</p></th>\n<th class=\"head\"><p>Description</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code></p></td>\n<td><p>Baseline feature set for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> architecture.</p></td>\n</tr>\n</tbody>\n</table>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 14%\"/>\n<col style=\"width: 86%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Target</p></th>\n<th class=\"head\"><p>Description</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_10</span></code></p></td>\n<td><p>Baseline feature set for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_10</span></code> architecture.</p>\n<p>Requires <code 
class=\"docutils literal notranslate\"><span class=\"pre\">map_f64_to_f32</span></code> if any <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> instructions used.</p>\n</td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_11</span></code></p></td>\n<td><p>Adds 64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">{atom,red}.{and,or,xor,min,max}</span></code> instructions.</p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">map_f64_to_f32</span></code> if any <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> instructions used.</p>\n</td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_12</span></code></p></td>\n<td><p>Adds <code class=\"docutils literal notranslate\"><span class=\"pre\">{atom,red}.shared</span></code>, 64-bit <code class=\"docutils literal notranslate\"><span class=\"pre\">{atom,red}.global</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vote</span></code>\ninstructions.</p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">map_f64_to_f32</span></code> if any <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> instructions used.</p>\n</td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code></p></td>\n<td><p>Adds double-precision support, including expanded rounding modifiers.</p>\n<p>Disallows use of <code class=\"docutils literal notranslate\"><span class=\"pre\">map_f64_to_f32</span></code>.</p>\n</td>\n</tr>\n</tbody>\n</table>\n<p>The texturing mode is specified for an entire module and cannot be changed within the module.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span></code> debug option declares that the PTX file 
contains DWARF debug information, and\nsubsequent compilation of PTX will retain information needed for source-level debugging. If the\ndebug option is declared, an error message is generated if no DWARF information is found in the\nfile. The debug option requires PTX ISA version 3.0 or later.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">map_f64_to_f32</span></code> indicates that all double-precision instructions map to single-precision\nregardless of the target architecture. This enables high-level language compilers to compile\nprograms containing type double to target device that do not support double-precision\noperations. Note that <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> storage remains as 64-bits, with only half being used by instructions\nconverted from <code class=\"docutils literal notranslate\"><span class=\"pre\">.f64</span></code> to <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>.</p>\n<p><strong>Notes</strong></p>\n<p>Targets of the form <code class=\"docutils literal notranslate\"><span class=\"pre\">compute_xx</span></code> are also accepted as synonyms for <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_xx</span></code> targets.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p>Target strings <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_10</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_11</span></code> introduced in PTX ISA version 1.0.</p>\n<p>Target strings <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_12</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_13</span></code> introduced in PTX ISA version 1.2.</p>\n<p>Texturing mode introduced in PTX ISA version 1.5.</p>\n<p>Target string <code class=\"docutils literal notranslate\"><span 
class=\"pre\">sm_20</span></code> introduced in PTX ISA version 2.0.</p>\n<p>Target string <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> introduced in PTX ISA version 3.0.</p>\n<p>Platform option <code class=\"docutils literal notranslate\"><span class=\"pre\">debug</span></code> introduced in PTX ISA version 3.0.</p>\n<p>Target string <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_35</span></code> introduced in PTX ISA version 3.1.</p>\n<p>Target strings <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_32</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_50</span></code> introduced in PTX ISA version 4.0.</p>\n<p>Target strings <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_37</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_52</span></code> introduced in PTX ISA version 4.1.</p>\n<p>Target string <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_53</span></code> introduced in PTX ISA version 4.2.</p>\n<p>Target string <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_60</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_61</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_62</span></code> introduced in PTX ISA version 5.0.</p>\n<p>Target string <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_70</span></code> introduced in PTX ISA version 6.0.</p>\n<p>Target string <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_72</span></code> introduced in PTX ISA version 6.1.</p>\n<p>Target string <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_75</span></code> introduced in PTX ISA version 6.3.</p>\n<p>Target string <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_80</span></code> introduced in PTX ISA version 7.0.</p>\n<p>Target 
string <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_86</span></code> introduced in PTX ISA version 7.1.</p>\n<p>Target string <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_87</span></code> introduced in PTX ISA version 7.4.</p>\n<p>Target string <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_89</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Target string <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90</span></code> introduced in PTX ISA version 7.8.</p>\n<p>Target string <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_90a</span></code> introduced in PTX ISA version 8.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.target</span></code> directive is supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.target sm_10       // baseline target architecture\n.target sm_13       // supports double-precision\n.target sm_20, texmode_independent\n.target sm_90       // baseline target architecture\n.target sm_90a      // PTX using arch accelerated features\n</pre></div>\n</div>\n</section>",
                "tooltip": "Architecture and Platform target.\n\nSyntax\n\n.target stringlist         // comma separated list of target specifiers\n\nstring = { sm_90a, sm_90,               // sm_9x target architectures\n\n           sm_80, sm_86, sm_87, sm_89,  // sm_8x target architectures\n\n           sm_70, sm_72, sm_75,         // sm_7x target architectures\n\n           sm_60, sm_61, sm_62,         // sm_6x target architectures\n\n           sm_50, sm_52, sm_53,         // sm_5x target architectures\n\n           sm_30, sm_32, sm_35, sm_37,  // sm_3x target architectures\n\n           sm_20,                       // sm_2x target architectures\n\n           sm_10, sm_11, sm_12, sm_13,  // sm_1x target architectures\n\n           texmode_unified, texmode_independent,   // texturing mode\n\n           debug,                                  // platform option\n\n           map_f64_to_f32 };                       // platform option\n\nDescription\n\nSpecifies the set of features in the target architecture for which the current PTX code was\n\ngenerated. In general, generations of SM architectures follow an onion layer model, where each\n\ngeneration adds new features and retains all features of previous generations. The onion layer model\n\nallows the PTX code generated for a given target to be run on later generation devices.\n\nTarget architectures with suffix \u201ca\u201d, such as sm_90a, include architecture-accelerated\n\nfeatures that are supported on the specified architecture only, hence such targets do not follow the\n\nonion layer model. Therefore, PTX code generated for such targets cannot be run on later generation\n\ndevices. Architecture-accelerated features can only be used with targets that support these\n\nfeatures.\n\nSemantics\n\nEach PTX module must begin with a .version directive, immediately followed by a .target\n\ndirective containing a target architecture and optional platform options. 
A .target directive\n\nspecifies a single target architecture, but subsequent .target directives can be used to change\n\nthe set of target features allowed during parsing. A program with multiple .target directives\n\nwill compile and run only on devices that support all features of the highest-numbered architecture\n\nlisted in the program.\n\nPTX features are checked against the specified target architecture, and an error is generated if an\n\nunsupported feature is used.\u00a0 The following table summarizes the features in PTX that vary according\n\nto target architecture.\n\n\n\nTarget\n\nDescription\n\nsm_90\n\nBaseline feature set for sm_90 architecture.\n\nsm_90a\n\nAdds support for sm_90a accelerated wgmma and setmaxnreg instructions.\n\n\n\n\n\nTarget\n\nDescription\n\nsm_80\n\nBaseline feature set for sm_80 architecture.\n\nsm_86\n\nAdds support for .xorsign modifier on min and max instructions.\n\nsm_87\n\nBaseline feature set for sm_86 architecture.\n\nsm_89\n\nBaseline feature set for sm_86 architecture.\n\n\n\n\n\nTarget\n\nDescription\n\nsm_70\n\nBaseline feature set for sm_70 architecture.\n\nsm_72\n\nAdds support for integer multiplicand and accumulator matrices in wmma instructions.\n\nAdds support for cvt.pack instruction.\n\nsm_75\n\nAdds support for sub-byte integer and single-bit multiplicant matrices in wmma instructions.\n\nAdds support for ldmatrix instruction.\n\nAdds support for movmatrix instruction.\n\nAdds support for tanh instruction.\n\n\n\n\n\nTarget\n\nDescription\n\nsm_60\n\nBaseline feature set for sm_60 architecture.\n\nsm_61\n\nAdds support for dp2a and dp4a instructions.\n\nsm_62\n\nBaseline feature set for sm_61 architecture.\n\n\n\n\n\nTarget\n\nDescription\n\nsm_50\n\nBaseline feature set for sm_50 architecture.\n\nsm_52\n\nBaseline feature set for sm_50 architecture.\n\nsm_53\n\nAdds support for arithmetic, comparsion and texture instructions for .f16 and .f16x2 types.\n\n\n\n\n\nTarget\n\nDescription\n\nsm_30\n\nBaseline 
feature set for sm_30 architecture.\n\nsm_32\n\nAdds 64-bit {atom,red}.{and,or,xor,min,max}\n\ninstructions.\n\nAdds shf instruction.\n\nAdds ld.global.nc instruction.\n\nsm_35\n\nAdds support for CUDA Dynamic Parallelism.\n\nsm_37\n\nBaseline feature set for sm_35 architecture.\n\n\n\n\n\nTarget\n\nDescription\n\nsm_20\n\nBaseline feature set for sm_20 architecture.\n\n\n\n\n\nTarget\n\nDescription\n\nsm_10\n\nBaseline feature se ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#ptx-module-directives-target"
            };

        case "testp":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-testp\" target=\"_blank\" rel=\"noopener noreferrer\">testp(fp) <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Floating Point Instructions: testp</h1><section id=\"floating-point-instructions-testp\">\n\n\n<p>Test floating-point property.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>testp.op.type  p, a;  // result is .pred\n\n.op   = { .finite, .infinite,\n          .number, .notanumber,\n          .normal, .subnormal };\n.type = { .f32, .f64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">testp</span></code> tests common properties of floating-point numbers and returns a predicate value of <code class=\"docutils literal notranslate\"><span class=\"pre\">1</span></code>\nif <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code> if <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>.</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">testp.finite</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if the input is not infinite or <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code></p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">testp.infinite</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if the input is positive or negative infinity</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span 
class=\"pre\">testp.number</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if the input is not <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code></p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">testp.notanumber</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if the input is <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code></p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">testp.normal</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if the input is a normal number (not <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>, not infinity)</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">testp.subnormal</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if the input is a subnormal number (not <code class=\"docutils literal notranslate\"><span class=\"pre\">NaN</span></code>, not infinity)</p>\n</dd>\n</dl>\n<p>As a special case, positive and negative zero are considered normal numbers.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>testp.notanumber.f32  isnan, f0;\ntestp.infinite.f64    p, X;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Test floating-point property.\n\nSyntax\n\ntestp.op.type  p, a;  // result is .pred\n\n.op   = { .finite, .infinite,\n\n          .number, .notanumber,\n\n          .normal, .subnormal };\n\n.type = { .f32, .f64 };\n\nDescription\n\ntestp tests common properties of floating-point numbers and returns a predicate value of 1\n\nif True and 0 if False.\n\ntestp.finiteTrue if the input is not infinite or NaN\n\ntestp.infiniteTrue if the input is positive or negative infinity\n\ntestp.numberTrue if the input is not NaN\n\ntestp.notanumberTrue if the input is NaN\n\ntestp.normalTrue if the input is a normal number (not NaN, not infinity)\n\ntestp.subnormalTrue if the input is a subnormal number (not NaN, not infinity)\n\nAs a special case, positive and negative zero are considered normal numbers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\nExamples\n\ntestp.notanumber.f32  isnan, f0;\n\ntestp.infinite.f64    p, X;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-testp"
            };

        case "tex":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex\" target=\"_blank\" rel=\"noopener noreferrer\">tex <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Texture Instructions: tex</h1><section id=\"texture-instructions-tex\">\n\n\n<p>Perform a texture memory lookup.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>tex.geom.v4.dtype.ctype  d, [a, c] {, e} {, f};\ntex.geom.v4.dtype.ctype  d[|p], [a, b, c] {, e} {, f};  // explicit sampler\n\ntex.geom.v2.f16x2.ctype  d[|p], [a, c] {, e} {, f};\ntex.geom.v2.f16x2.ctype  d[|p], [a, b, c] {, e} {, f};  // explicit sampler\n\n// mipmaps\ntex.base.geom.v4.dtype.ctype   d[|p], [a, {b,} c] {, e} {, f};\ntex.level.geom.v4.dtype.ctype  d[|p], [a, {b,} c], lod {, e} {, f};\ntex.grad.geom.v4.dtype.ctype   d[|p], [a, {b,} c], dPdx, dPdy {, e} {, f};\n\ntex.base.geom.v2.f16x2.ctype   d[|p], [a, {b,} c] {, e} {, f};\ntex.level.geom.v2.f16x2.ctype  d[|p], [a, {b,} c], lod {, e} {, f};\ntex.grad.geom.v2.f16x2.ctype   d[|p], [a, {b,} c], dPdx, dPdy {, e} {, f};\n\n.geom  = { .1d, .2d, .3d, .a1d, .a2d, .cube, .acube, .2dms, .a2dms };\n.dtype = { .u32, .s32, .f16,  .f32 };\n.ctype = {       .s32, .f32 };          // .cube, .acube require .f32\n                                        // .2dms, .a2dms require .s32\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tex.{1d,2d,3d}</span></code></p>\n<p>Texture lookup using a texture coordinate vector. 
The instruction loads data from the texture named\nby operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> at coordinates given by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> into destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is a\nscalar or singleton tuple for 1d textures; is a two-element vector for 2d textures; and is a\nfour-element vector for 3d textures, where the fourth element is ignored. An optional texture\nsampler <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> may be specified. If no sampler is specified, the sampler behavior is a property of\nthe named texture. The optional destination predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is set to <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if data from texture\nat specified coordinates is resident in memory, <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code> otherwise. When optional destination\npredicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is set to <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>, data loaded will be all zeros. Memory residency of Texture Data\nat specified coordinates is dependent on execution environment setup using Driver API calls, prior\nto kernel launch. Refer to Driver API documentation for more details including any\nsystem/implementation specific behavior.</p>\n<p>An optional operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> may be specified. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> is a vector of <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> values that\nspecifies coordinate offset. Offset is applied to coordinates before doing texture lookup. Offset\nvalue is in the range of -8 to +7. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> is a singleton tuple for 1d textures; is a two\nelement vector 2d textures; and is four-element vector for 3d textures, where the fourth element is\nignored.</p>\n<p>An optional operand <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> may be specified for <code class=\"docutils literal notranslate\"><span class=\"pre\">depth</span> <span class=\"pre\">textures</span></code>. Depth textures are special type\nof textures which hold data from the depth buffer. Depth buffer contains depth information of each\npixel. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> scalar value that specifies depth compare value for depth\ntextures. Each element fetched from texture is compared against value given in <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> operand. If\ncomparison passes, result is 1.0; otherwise result is 0.0. These per-element comparison results are\nused for the filtering. 
When using depth compare operand, the elements in texture coordinate vector\n<code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> type.</p>\n<p>Depth compare operand is not supported for <code class=\"docutils literal notranslate\"><span class=\"pre\">3d</span></code> textures.</p>\n<p>The instruction returns a two-element vector for destination type <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>. For all other\ndestination types, the instruction returns a four-element vector. Coordinates may be given in either\nsigned 32-bit integer or 32-bit floating point form.</p>\n<p>A texture base address is assumed to be aligned to a 16 byte boundary, and the address given by the\ncoordinate vector must be naturally aligned to a multiple of the access size. If an address is not\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tex.{a1d,a2d}</span></code></p>\n<p>Texture array selection, followed by texture lookup. The instruction first selects a texture from\nthe texture array named by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> using the index given by the first element of the array\ncoordinate vector <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>. The instruction then loads data from the selected texture at coordinates\ngiven by the remaining elements of operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> into destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is a bit-size\ntype vector or tuple containing an index into the array of textures followed by coordinates within\nthe selected texture, as follows:</p>\n<ul class=\"simple\">\n<li><p>For 1d texture arrays, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v2.b32</span></code>. The first element is interpreted as an\nunsigned integer index (<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>) into the texture array, and the second element is interpreted as\na 1d texture coordinate of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.ctype</span></code>.</p></li>\n<li><p>For 2d texture arrays, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.b32</span></code>. The first element is interpreted as an\nunsigned integer index (<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>) into the texture array, and the next two elements are\ninterpreted as 2d texture coordinates of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.ctype</span></code>. The fourth element is ignored.</p></li>\n</ul>\n<p>An optional texture sampler <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> may be specified. If no sampler is specified, the sampler behavior\nis a property of the named texture.</p>\n<p>An optional operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> may be specified. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> is a vector of <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> values that\nspecifies coordinate offset. 
Offset is applied to coordinates before doing texture lookup. Offset\nvalue is in the range of -8 to +7. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> is a singleton tuple for 1d texture arrays; and is\na two element vector 2d texture arrays.</p>\n<p>An optional operand <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> may be specified for depth textures arrays. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>\nscalar value that specifies depth compare value for depth textures. When using depth compare\noperand, the coordinates in texture coordinate vector <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> type.</p>\n<p>The instruction returns a two-element vector for destination type <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code>. For all other\ndestination types, the instruction returns a four-element vector. The texture array index is a\n32-bit unsigned integer, and texture coordinate elements are 32-bit signed integer or floating point\nvalues.</p>\n<p>The optional destination predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is set to <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if data from texture at specified\ncoordinates is resident in memory, <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code> otherwise. When optional destination predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is\nset to <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>, data loaded will be all zeros. 
Memory residency of Texture Data at specified\ncoordinates is dependent on execution environment setup using Driver API calls, prior to kernel\nlaunch. Refer to Driver API documentation for more details including any system/implementation\nspecific behavior.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tex.cube</span></code></p>\n<p><em>Cubemap</em> texture lookup. The instruction loads data from the cubemap texture named by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code>\nat coordinates given by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> into destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. Cubemap textures are special\ntwo-dimensional layered textures consisting of six layers that represent the faces of a cube. All\nlayers in a cubemap are of the same size and are square (i.e., width equals height).</p>\n<p>When accessing a cubemap, the texture coordinate vector <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.f32</span></code>, and comprises\nthree floating-point coordinates (<code class=\"docutils literal notranslate\"><span class=\"pre\">s</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">t</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">r</span></code>) and a fourth padding argument which is\nignored. Coordinates (<code class=\"docutils literal notranslate\"><span class=\"pre\">s</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">t</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">r</span></code>) are projected onto one of the six cube faces. 
The (<code class=\"docutils literal notranslate\"><span class=\"pre\">s</span></code>,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">t</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">r</span></code>) coordinates can be thought of as a direction vector emanating from the center of the\ncube. Of the three coordinates (<code class=\"docutils literal notranslate\"><span class=\"pre\">s</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">t</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">r</span></code>), the coordinate of the largest magnitude (the\nmajor axis) selects the cube face. Then, the other two coordinates (the minor axes) are divided by\nthe absolute value of the major axis to produce a new (<code class=\"docutils literal notranslate\"><span class=\"pre\">s</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">t</span></code>) coordinate pair to lookup into\nthe selected cube face.</p>\n<p>An optional texture sampler <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> may be specified. If no sampler is specified, the sampler behavior\nis a property of the named texture.</p>\n<p>Offset vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> is not supported for cubemap textures.</p>\n<p>an optional operand <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> may be specified for cubemap depth textures. 
operand <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>\nscalar value that specifies depth compare value for cubemap depth textures.</p>\n<p>The optional destination predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is set to <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if data from texture at specified\ncoordinates is resident in memory, <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code> otherwise. When optional destination predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is\nset to <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>, data loaded will be all zeros. Memory residency of Texture Data at specified\ncoordinates is dependent on execution environment setup using Driver API calls, prior to kernel\nlaunch. Refer to Driver API documentation for more details including any system/implementation\nspecific behavior.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tex.acube</span></code></p>\n<p>Cubemap array selection, followed by cubemap lookup. The instruction first selects a cubemap texture\nfrom the cubemap array named by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> using the index given by the first element of the\narray coordinate vector <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>. 
The instruction then loads data from the selected cubemap texture at\ncoordinates given by the remaining elements of operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> into destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p>\n<p><em>Cubemap array</em> textures consist of an array of cubemaps, i.e., the total number of layers is a\nmultiple of six. When accessing a cubemap array texture, the coordinate vector <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> has type\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.b32</span></code>. The first element is interpreted as an unsigned integer index (<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>) into the\ncubemap array, and the remaining three elements are interpreted as floating-point cubemap\ncoordinates (<code class=\"docutils literal notranslate\"><span class=\"pre\">s</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">t</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">r</span></code>), used to lookup in the selected cubemap as described above.</p>\n<p>An optional texture sampler <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> may be specified. If no sampler is specified, the sampler behavior\nis a property of the named texture.</p>\n<p>Offset vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> is not supported for cubemap texture arrays.</p>\n<p>An optional operand <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> may be specified for cubemap depth texture arrays. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> is\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> scalar value that specifies depth compare value for cubemap depth textures.</p>\n<p>The optional destination predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is set to <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if data from texture at specified\ncoordinates is resident in memory, <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code> otherwise. When optional destination predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is\nset to <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>, data loaded will be all zeros. Memory residency of Texture Data at specified\ncoordinates is dependent on execution environment setup using Driver API calls, prior to kernel\nlaunch. Refer to Driver API documentation for more details including any system/implementation\nspecific behavior.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tex.2dms</span></code></p>\n<p>Multi-sample texture lookup using a texture coordinate vector. Multi-sample textures consist of\nmultiple samples per data element. The instruction loads data from the texture named by operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> from sample number given by first element of the operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>, at coordinates given by\nremaining elements of operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> into destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. 
When accessing a multi-sample texture,\ntexture coordinate vector <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> has type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.b32</span></code>. The first element in operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is\ninterpreted as unsigned integer sample number (<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>), and the next two elements are interpreted\nas signed integer (<code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>) 2d texture coordinates. The fourth element is ignored. An optional\ntexture sampler <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> may be specified. If no sampler is specified, the sampler behavior is a\nproperty of the named texture.</p>\n<p>An optional operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> may be specified. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> is a vector of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v2.s32</span></code> that\nspecifies coordinate offset. Offset is applied to coordinates before doing texture lookup. Offset\nvalue is in the range of -8 to +7.</p>\n<p>Depth compare operand <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> is not supported for multi-sample textures.</p>\n<p>The optional destination predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is set to <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if data from texture at specified\ncoordinates is resident in memory, <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code> otherwise. 
When optional destination predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is\nset to <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>, data loaded will be all zeros. Memory residency of Texture Data at specified\ncoordinates is dependent on execution environment setup using Driver API calls, prior to kernel\nlaunch. Refer to Driver API documentation for more details including any system/implementation\nspecific behavior.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tex.a2dms</span></code></p>\n<p>Multi-sample texture array selection, followed by multi-sample texture lookup. The instruction first\nselects a multi-sample texture from the multi-sample texture array named by operand a using the\nindex given by the first element of the array coordinate vector <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>. The instruction then loads\ndata from the selected multi-sample texture from sample number given by second element of the\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>, at coordinates given by remaining elements of operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> into destination\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. When accessing a multi-sample texture array, texture coordinate vector <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> has type\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.b32</span></code>. 
The first element in operand c is interpreted as unsigned integer sampler number, the\nsecond element is interpreted as unsigned integer index (<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>) into the multi-sample texture\narray and the next two elements are interpreted as signed integer (<code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code>) 2d texture\ncoordinates. An optional texture sampler <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> may be specified. If no sampler is specified, the\nsampler behavior is a property of the named texture.</p>\n<p>An optional operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> may be specified. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> is a vector of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v2.s32</span></code> values\nthat specifies coordinate offset. Offset is applied to coordinates before doing texture\nlookup. Offset value is in the range of -8 to +7.</p>\n<p>Depth compare operand <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> is not supported for multi-sample texture arrays.</p>\n<p>The optional destination predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is set to <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if data from texture at specified\ncoordinates is resident in memory, <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code> otherwise. When optional destination predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is\nset to <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>, data loaded will be all zeros. 
Memory residency of Texture Data at specified\ncoordinates is dependent on execution environment setup using Driver API calls, prior to kernel\nlaunch. Refer to Driver API documentation for more details including any system/implementation\nspecific behavior.</p>\n<p><strong>Mipmaps</strong></p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.base</span></code> (lod zero)</dt><dd><p>Pick level 0 (base level). This is the default if no mipmap mode is specified. No additional arguments.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.level</span></code> (lod explicit)</dt><dd><p>Requires an additional 32-bit scalar argument, <code class=\"docutils literal notranslate\"><span class=\"pre\">lod</span></code>, which contains the LOD to fetch from. The\ntype of <code class=\"docutils literal notranslate\"><span class=\"pre\">lod</span></code> follows <code class=\"docutils literal notranslate\"><span class=\"pre\">.ctype</span></code> (either <code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code>). Geometries <code class=\"docutils literal notranslate\"><span class=\"pre\">.2dms</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.a2dms</span></code> are not supported in this mode.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.grad</span></code> (lod gradient)</dt><dd><p>Requires two <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> vectors, <code class=\"docutils literal notranslate\"><span class=\"pre\">dPdx</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">dPdy</span></code>, that specify the partials. 
The vectors are\nsingletons for 1d and a1d textures; are two-element vectors for 2d and a2d textures; and are\nfour-element vectors for 3d, cube and acube textures, where the fourth element is ignored for 3d\nand cube geometries. Geometries <code class=\"docutils literal notranslate\"><span class=\"pre\">.2dms</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.a2dms</span></code> are not supported in this mode.</p>\n</dd>\n</dl>\n<p>For mipmap texture lookup, an optional operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> may be specified. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> is a vector of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.s32</span></code> that specifies coordinate offset. Offset is applied to coordinates before doing texture\nlookup. Offset value is in the range of -8 to +7. Offset vector operand is not supported for cube\nand cubemap geometries.</p>\n<p>An optional operand <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> may be specified for mipmap textures. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> scalar\nvalue that specifies depth compare value for depth textures. 
When using depth compare operand, the\ncoordinates in texture coordinate vector <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> have <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> type.</p>\n<p>The optional destination predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is set to <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if data from texture at specified\ncoordinates is resident in memory, <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code> otherwise. When optional destination predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is\nset to <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>, data loaded will be all zeros. Memory residency of Texture Data at specified\ncoordinates is dependent on execution environment setup using Driver API calls, prior to kernel\nlaunch. Refer to Driver API documentation for more details including any system/implementation\nspecific behavior.</p>\n<p>Depth compare operand is not supported for <code class=\"docutils literal notranslate\"><span class=\"pre\">3d</span></code> textures.</p>\n<p><strong>Indirect texture access</strong></p>\n<p>Beginning with PTX ISA version 3.1, indirect texture access is supported in unified mode for target\narchitecture <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher. 
In indirect access, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is a <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> register holding\nthe address of a <code class=\"docutils literal notranslate\"><span class=\"pre\">.texref</span></code> variable.</p>\n<p><strong>Notes</strong></p>\n<p>For compatibility with prior versions of PTX, the square brackets are not required and <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4</span></code>\ncoordinate vectors are allowed for any geometry, with the extra elements being ignored.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Unified mode texturing introduced in PTX ISA version 1.0. Extension using opaque <code class=\"docutils literal notranslate\"><span class=\"pre\">.texref</span></code> and\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.samplerref</span></code> types and independent mode texturing introduced in PTX ISA version 1.5.</p>\n<p>Texture arrays <code class=\"docutils literal notranslate\"><span class=\"pre\">tex.{a1d,a2d}</span></code> introduced in PTX ISA version 2.3.</p>\n<p>Cubemaps and cubemap arrays introduced in PTX ISA version 3.0.</p>\n<p>Support for mipmaps introduced in PTX ISA version 3.1.</p>\n<p>Indirect texture access introduced in PTX ISA version 3.1.</p>\n<p>Multi-sample textures and multi-sample texture arrays introduced in PTX ISA version 3.2.</p>\n<p>Support for textures returning <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> data introduced in PTX ISA version 4.2.</p>\n<p>Support for <code class=\"docutils literal notranslate\"><span class=\"pre\">tex.grad.{cube,</span> <span class=\"pre\">acube}</span></code> introduced in PTX ISA version 4.3.</p>\n<p>Offset vector operand introduced in PTX ISA version 4.3.</p>\n<p>Depth compare operand 
introduced in PTX ISA version 4.3.</p>\n<p>Support for optional destination predicate introduced in PTX ISA version 7.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p>The cubemap array geometry (<code class=\"docutils literal notranslate\"><span class=\"pre\">.acube</span></code>) requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Mipmaps require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Indirect texture access requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Multi-sample textures and multi-sample texture arrays require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p>Texture fetch returning <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">.f16x2</span></code> data require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_53</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tex.grad.{cube,</span> <span class=\"pre\">acube}</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Offset vector operand requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p>Depth compare operand requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p>Support for optional destination predicate requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_60</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span> // Example of unified mode 
texturing\n // - f4 is required to pad four-element tuple and is ignored\n tex.3d.v4.s32.s32  {r1,r2,r3,r4}, [tex_a,{f1,f2,f3,f4}];\n\n // Example of independent mode texturing\n tex.1d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a,smpl_x,{f1}];\n\n // Example of 1D texture array, independent texturing mode\n tex.a1d.v4.s32.s32 {r1,r2,r3,r4}, [tex_a,smpl_x,{idx,s1}];\n\n // Example of 2D texture array, unified texturing mode\n // - f3 is required to pad four-element tuple and is ignored\n tex.a2d.v4.s32.f32 {r1,r2,r3,r4}, [tex_a,{idx,f1,f2,f3}];\n\n // Example of cubemap array, unified textureing mode\n tex.acube.v4.f32.f32 {r0,r1,r2,r3}, [tex_cuarray,{idx,f1,f2,f3}];\n\n // Example of multi-sample texture, unified texturing mode\n tex.2dms.v4.s32.s32 {r0,r1,r2,r3}, [tex_ms,{sample,r6,r7,r8}];\n\n // Example of multi-sample texture, independent texturing mode\n tex.2dms.v4.s32.s32 {r0,r1,r2,r3}, [tex_ms, smpl_x,{sample,r6,r7,r8}];\n\n // Example of multi-sample texture array, unified texturing mode\n tex.a2dms.v4.s32.s32 {r0,r1,r2,r3}, [tex_ams,{idx,sample,r6,r7}];\n\n // Example of texture returning .f16 data\n tex.1d.v4.f16.f32  {h1,h2,h3,h4}, [tex_a,smpl_x,{f1}];\n\n // Example of texture returning .f16x2 data\n tex.1d.v2.f16x2.f32  {h1,h2}, [tex_a,smpl_x,{f1}];\n\n // Example of 3d texture array access with tex.grad,unified texturing mode\n tex.grad.3d.v4.f32.f32 {%f4,%f5,%f6,%f7},[tex_3d,{%f0,%f0,%f0,%f0}],\n                 {fl0,fl1,fl2,fl3},{fl0,fl1,fl2,fl3};\n\n// Example of cube texture array access with tex.grad,unified texturing mode\n tex.grad.cube.v4.f32.f32{%f4,%f5,%f6,%f7},[tex_cube,{%f0,%f0,%f0,%f0}],\n                 {fl0,fl1,fl2,fl3},{fl0,fl1,fl2,fl3};\n\n // Example of 1d texture lookup with offset, unified texturing mode\n tex.1d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a, {f1}], {r5};\n\n // Example of 2d texture array lookup with offset, unified texturing mode\n tex.a2d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a,{idx,f1,f2}], {f5,f6};\n\n // Example of 2d mipmap texture 
lookup with offset, unified texturing mode\n tex.level.2d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a,{f1,f2}],\n                          flvl, {r7, r8};\n\n // Example of 2d depth texture lookup with compare, unified texturing mode\n tex.1d.v4.f32.f32  {f1,f2,f3,f4}, [tex_a, {f1}], f0;\n\n // Example of depth 2d texture array lookup with offset, compare\n tex.a2d.v4.s32.f32  {f0,f1,f2,f3}, [tex_a,{idx,f4,f5}], {r5,r6}, f6;\n\n // Example of destination predicate use\n tex.3d.v4.s32.s32 {r1,r2,r3,r4}|p, [tex_a,{f1,f2,f3,f4}];\n</pre></div>\n</div>\n</section>",
                "tooltip": "Perform a texture memory lookup.\n\nSyntax\n\ntex.geom.v4.dtype.ctype  d, [a, c] {, e} {, f};\n\ntex.geom.v4.dtype.ctype  d[|p], [a, b, c] {, e} {, f};  // explicit sampler\n\ntex.geom.v2.f16x2.ctype  d[|p], [a, c] {, e} {, f};\n\ntex.geom.v2.f16x2.ctype  d[|p], [a, b, c] {, e} {, f};  // explicit sampler\n\n// mipmaps\n\ntex.base.geom.v4.dtype.ctype   d[|p], [a, {b,} c] {, e} {, f};\n\ntex.level.geom.v4.dtype.ctype  d[|p], [a, {b,} c], lod {, e} {, f};\n\ntex.grad.geom.v4.dtype.ctype   d[|p], [a, {b,} c], dPdx, dPdy {, e} {, f};\n\ntex.base.geom.v2.f16x2.ctype   d[|p], [a, {b,} c] {, e} {, f};\n\ntex.level.geom.v2.f16x2.ctype  d[|p], [a, {b,} c], lod {, e} {, f};\n\ntex.grad.geom.v2.f16x2.ctype   d[|p], [a, {b,} c], dPdx, dPdy {, e} {, f};\n\n.geom  = { .1d, .2d, .3d, .a1d, .a2d, .cube, .acube, .2dms, .a2dms };\n\n.dtype = { .u32, .s32, .f16,  .f32 };\n\n.ctype = {       .s32, .f32 };          // .cube, .acube require .f32\n\n                                        // .2dms, .a2dms require .s32\n\nDescription\n\ntex.{1d,2d,3d}\n\nTexture lookup using a texture coordinate vector. The instruction loads data from the texture named\n\nby operand a at coordinates given by operand c into destination d. Operand c is a\n\nscalar or singleton tuple for 1d textures; is a two-element vector for 2d textures; and is a\n\nfour-element vector for 3d textures, where the fourth element is ignored. An optional texture\n\nsampler b may be specified. If no sampler is specified, the sampler behavior is a property of\n\nthe named texture. The optional destination predicate p is set to True if data from texture\n\nat specified coordinates is resident in memory, False otherwise. When optional destination\n\npredicate p is set to False, data loaded will be all zeros. Memory residency of Texture Data\n\nat specified coordinates is dependent on execution environment setup using Driver API calls, prior\n\nto kernel launch. 
Refer to Driver API documentation for more details including any\n\nsystem/implementation specific behavior.\n\nAn optional operand e may be specified. Operand e is a vector of .s32 values that\n\nspecifies coordinate offset. Offset is applied to coordinates before doing texture lookup. Offset\n\nvalue is in the range of -8 to +7. Operand e is a singleton tuple for 1d textures; is a two\n\nelement vector 2d textures; and is four-element vector for 3d textures, where the fourth element is\n\nignored.\n\nAn optional operand f may be specified for depth textures. Depth textures are special type\n\nof textures which hold data from the depth buffer. Depth buffer contains depth information of each\n\npixel. Operand f is .f32 scalar value that specifies depth compare value for depth\n\ntextures. Each element fetched from texture is compared against value given in f operand. If\n\ncomparison passes, result is 1.0; otherwise result is 0.0. These per-element comparison results are\n\nused for the filtering. When using depth compare operand, the elements in texture coordinate vector\n\nc have .f32 type.\n\nDepth compare operand is not supported for 3d textures.\n\nThe instruction returns a two-element vector for destination type .f16x2. For all other\n\ndestination types, the instruction returns a four-element vector. Coordinates may be given in either\n\nsigned 32-bit integer or 32-bit floating point form.\n\nA texture base address is assumed to be aligned to a 16 byte boundary, and the address given by the\n\ncoordinate vector must be naturally aligned to a multiple of the access size. If an address is not\n\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\n\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.\n\ntex.{a1d,a2d}\n\nTexture array selection, followed by texture lookup. 
The instruction first selects a texture from\n\nthe texture array named by operand a using the index given by the first element of the array\n\ncoordinate vector c. The instruction then loads data from the selected texture at coordinates\n\ngiven by the remaining elements of operand c into destination d. Operand c is a bit-size\n\ntype vect ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex"
            };

        case "tid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-tid\" target=\"_blank\" rel=\"noopener noreferrer\">tid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %tid</h1><section id=\"special-registers-tid\">\n\n\n<p>Thread identifier within a CTA.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .v4 .u32 %tid;                  // thread id vector\n.sreg .u32 %tid.x, %tid.y, %tid.z;    // thread id components\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only, per-thread special register initialized with the thread identifier within\nthe CTA. The <code class=\"docutils literal notranslate\"><span class=\"pre\">%tid</span></code> special register contains a 1D, 2D, or 3D vector to match the CTA shape; the\n<code class=\"docutils literal notranslate\"><span class=\"pre\">%tid</span></code> value in unused dimensions is <code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code>. The fourth element is unused and always returns\nzero. 
The number of threads in each dimension are specified by the predefined special register\n<code class=\"docutils literal notranslate\"><span class=\"pre\">%ntid</span></code>.</p>\n<p>Every thread in the CTA has a unique <code class=\"docutils literal notranslate\"><span class=\"pre\">%tid</span></code>.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%tid</span></code> component values range from <code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code> through <code class=\"docutils literal notranslate\"><span class=\"pre\">%ntid-1</span></code> in each CTA dimension.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">%tid.y</span> <span class=\"pre\">==</span> <span class=\"pre\">%tid.z</span> <span class=\"pre\">==</span> <span class=\"pre\">0</span></code> in 1D CTAs. <code class=\"docutils literal notranslate\"><span class=\"pre\">%tid.z</span> <span class=\"pre\">==</span> <span class=\"pre\">0</span></code> in 2D CTAs.</p>\n<p>It is guaranteed that:</p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>0  &lt;=  %tid.x &lt;  %ntid.x\n0  &lt;=  %tid.y &lt;  %ntid.y\n0  &lt;=  %tid.z &lt;  %ntid.z\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0 with type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.u16</span></code>.</p>\n<p>Redefined as type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v4.u32</span></code> in PTX ISA version 2.0. 
For compatibility with legacy PTX code, 16-bit\n<code class=\"docutils literal notranslate\"><span class=\"pre\">mov</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">cvt</span></code> instructions may be used to read the lower 16-bits of each component of\n<code class=\"docutils literal notranslate\"><span class=\"pre\">%tid</span></code>.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32      %r1,%tid.x;  // move tid.x to %rh\n\n// legacy code accessing 16-bit components of %tid\nmov.u16      %rh,%tid.x;\ncvt.u32.u16  %r2,%tid.z;  // zero-extend tid.z to %r2\n</pre></div>\n</div>\n</section>",
                "tooltip": "Thread identifier within a CTA.\n\nSyntax (predefined)\n\n.sreg .v4 .u32 %tid;                  // thread id vector\n\n.sreg .u32 %tid.x, %tid.y, %tid.z;    // thread id components\n\nDescription\n\nA predefined, read-only, per-thread special register initialized with the thread identifier within\n\nthe CTA. The %tid special register contains a 1D, 2D, or 3D vector to match the CTA shape; the\n\n%tid value in unused dimensions is 0. The fourth element is unused and always returns\n\nzero. The number of threads in each dimension are specified by the predefined special register\n\n%ntid.\n\nEvery thread in the CTA has a unique %tid.\n\n%tid component values range from 0 through %ntid-1 in each CTA dimension.\n\n%tid.y == %tid.z == 0 in 1D CTAs. %tid.z == 0 in 2D CTAs.\n\nIt is guaranteed that:\n\n0  <=  %tid.x <  %ntid.x\n\n0  <=  %tid.y <  %ntid.y\n\n0  <=  %tid.z <  %ntid.z\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0 with type .v4.u16.\n\nRedefined as type .v4.u32 in PTX ISA version 2.0. For compatibility with legacy PTX code, 16-bit\n\nmov and cvt instructions may be used to read the lower 16-bits of each component of\n\n%tid.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u32      %r1,%tid.x;  // move tid.x to %rh\n\n// legacy code accessing 16-bit components of %tid\n\nmov.u16      %rh,%tid.x;\n\ncvt.u32.u16  %r2,%tid.z;  // zero-extend tid.z to %r2\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-tid"
            };

        case "tld4":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tld4\" target=\"_blank\" rel=\"noopener noreferrer\">tld4 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Texture Instructions: tld4</h1><section id=\"texture-instructions-tld4\">\n\n\n<p>Perform a texture fetch of the 4-texel bilerp footprint.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>tld4.comp.2d.v4.dtype.f32    d[|p], [a, c] {, e} {, f};\ntld4.comp.geom.v4.dtype.f32  d[|p], [a, b, c] {, e} {, f};  // explicit sampler\n\n.comp  = { .r, .g, .b, .a };\n.geom  = { .2d, .a2d, .cube, .acube };\n.dtype = { .u32, .s32, .f32 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Texture fetch of the 4-texel bilerp footprint using a texture coordinate vector. The instruction\nloads the bilerp footprint from the texture named by operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> at coordinates given by operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> into vector destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. The texture component fetched for each texel sample is\nspecified by <code class=\"docutils literal notranslate\"><span class=\"pre\">.comp</span></code>. The four texel samples are placed into destination vector <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> in\ncounter-clockwise order starting at lower left.</p>\n<p>An optional texture sampler <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> may be specified. 
If no sampler is specified, the sampler behavior\nis a property of the named texture.</p>\n<p>The optional destination predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is set to <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if data from texture at specified\ncoordinates is resident in memory, <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code> otherwise. When optional destination predicate <code class=\"docutils literal notranslate\"><span class=\"pre\">p</span></code> is\nset to <code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>, data loaded will be all zeros. Memory residency of Texture Data at specified\ncoordinates is dependent on execution environment setup using Driver API calls, prior to kernel\nlaunch. Refer to Driver API documentation for more details including any system/implementation\nspecific behavior.</p>\n<p>An optional operand <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> may be specified for <em>depth textures</em>. Depth textures are special type of\ntextures which hold data from the depth buffer. Depth buffer contains depth information of each\npixel. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> is <code class=\"docutils literal notranslate\"><span class=\"pre\">.f32</span></code> scalar value that specifies depth compare value for depth\ntextures. Each element fetched from texture is compared against value given in <code class=\"docutils literal notranslate\"><span class=\"pre\">f</span></code> operand. If\ncomparison passes, result is 1.0; otherwise result is 0.0. These per-element comparison results are\nused for the filtering.</p>\n<p>A texture base address is assumed to be aligned to a 16 byte boundary, and the address given by the\ncoordinate vector must be naturally aligned to a multiple of the access size. 
If an address is not\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tld4.2d</span></code></p>\n<p>For 2D textures, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> specifies coordinates as a two-element, 32-bit floating-point vector.</p>\n<p>An optional operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> may be specified. Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> is a vector of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v2.s32</span></code> that\nspecifies coordinate offset. Offset is applied to coordinates before doing texture fetch. Offset\nvalue is in the range of -8 to +7.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tld4.a2d</span></code></p>\n<p>Texture array selection, followed by <code class=\"docutils literal notranslate\"><span class=\"pre\">tld4</span></code> texture fetch of 2d texture. For 2d texture arrays\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is a four element, 32-bit vector. The first element in operand c is interpreted as an\nunsigned integer index (<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>) into the texture array, and the next two elements are interpreted\nas 32-bit floating point coordinates of 2d texture. The fourth element is ignored.</p>\n<p>An optional operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> may be specified. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> is a vector of type <code class=\"docutils literal notranslate\"><span class=\"pre\">.v2.s32</span></code> that\nspecifies coordinate offset. Offset is applied to coordinates before doing texture fetch. Offset\nvalue is in the range of -8 to +7.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tld4.cube</span></code></p>\n<p>For cubemap textures, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> specifies four-element vector which comprises three\nfloating-point coordinates (s, t, r) and a fourth padding argument which is ignored.</p>\n<p>Cubemap textures are special two-dimensional layered textures consisting of six layers that\nrepresent the faces of a cube. All layers in a cubemap are of the same size and are square (i.e.,\nwidth equals height).</p>\n<p>Coordinates (s, t, r) are projected onto one of the six cube faces. The (s, t, r) coordinates can be\nthought of as a direction vector emanating from the center of the cube. Of the three coordinates (s,\nt, r), the coordinate of the largest magnitude (the major axis) selects the cube face. Then, the\nother two coordinates (the minor axes) are divided by the absolute value of the major axis to\nproduce a new (s, t) coordinate pair to lookup into the selected cube face.</p>\n<p>Offset vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> is not supported for cubemap textures.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tld4.acube</span></code></p>\n<p>Cubemap array selection, followed by <code class=\"docutils literal notranslate\"><span class=\"pre\">tld4</span></code> texture fetch of cubemap texture. 
The first element in\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is interpreted as an unsigned integer index (<code class=\"docutils literal notranslate\"><span class=\"pre\">.u32</span></code>) into the cubemap texture array,\nand the remaining three elements are interpreted as floating-point cubemap coordinates (s, t, r),\nused to lookup in the selected cubemap.</p>\n<p>Offset vector operand <code class=\"docutils literal notranslate\"><span class=\"pre\">e</span></code> is not supported for cubemap texture arrays.</p>\n<p><strong>Indirect texture access</strong></p>\n<p>Beginning with PTX ISA version 3.1, indirect texture access is supported in unified mode for target\narchitecture <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher. In indirect access, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is a <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> register holding\nthe address of a <code class=\"docutils literal notranslate\"><span class=\"pre\">.texref</span></code> variable.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.2.</p>\n<p>Indirect texture access introduced in PTX ISA version 3.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tld4.{a2d,cube,acube}</span></code> introduced in PTX ISA version 4.3.</p>\n<p>Offset vector operand introduced in PTX ISA version 4.3.</p>\n<p>Depth compare operand introduced in PTX ISA version 4.3.</p>\n<p>Support for optional destination predicate introduced in PTX ISA version 7.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tld4</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Indirect texture access requires <code class=\"docutils literal 
notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">tld4.{a2d,cube,acube}</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p>Offset vector operand requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p>Depth compare operand requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p>Support for optional destination predicate requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_60</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>//Example of unified mode texturing\ntld4.r.2d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a,{f1,f2}];\n\n// Example of independent mode texturing\ntld4.r.2d.v4.u32.f32  {u1,u2,u3,u4}, [tex_a,smpl_x,{f1,f2}];\n\n// Example of unified mode texturing using offset\ntld4.r.2d.v4.s32.f32  {r1,r2,r3,r4}, [tex_a,{f1,f2}], {r5, r6};\n\n// Example of unified mode texturing using compare\ntld4.r.2d.v4.f32.f32  {f1,f2,f3,f4}, [tex_a,{f5,f6}], f7;\n\n// Example of optional destination predicate\ntld4.r.2d.v4.f32.f32 {f1,f2,f3,f4}|p, [tex_a,{f5,f6}], f7;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Perform a texture fetch of the 4-texel bilerp footprint.\n\nSyntax\n\ntld4.comp.2d.v4.dtype.f32    d[|p], [a, c] {, e} {, f};\n\ntld4.comp.geom.v4.dtype.f32  d[|p], [a, b, c] {, e} {, f};  // explicit sampler\n\n.comp  = { .r, .g, .b, .a };\n\n.geom  = { .2d, .a2d, .cube, .acube };\n\n.dtype = { .u32, .s32, .f32 };\n\nDescription\n\nTexture fetch of the 4-texel bilerp footprint using a texture coordinate vector. The instruction\n\nloads the bilerp footprint from the texture named by operand a at coordinates given by operand\n\nc into vector destination d. The texture component fetched for each texel sample is\n\nspecified by .comp. The four texel samples are placed into destination vector d in\n\ncounter-clockwise order starting at lower left.\n\nAn optional texture sampler b may be specified. If no sampler is specified, the sampler behavior\n\nis a property of the named texture.\n\nThe optional destination predicate p is set to True if data from texture at specified\n\ncoordinates is resident in memory, False otherwise. When optional destination predicate p is\n\nset to False, data loaded will be all zeros. Memory residency of Texture Data at specified\n\ncoordinates is dependent on execution environment setup using Driver API calls, prior to kernel\n\nlaunch. Refer to Driver API documentation for more details including any system/implementation\n\nspecific behavior.\n\nAn optional operand f may be specified for depth textures. Depth textures are special type of\n\ntextures which hold data from the depth buffer. Depth buffer contains depth information of each\n\npixel. Operand f is .f32 scalar value that specifies depth compare value for depth\n\ntextures. Each element fetched from texture is compared against value given in f operand. If\n\ncomparison passes, result is 1.0; otherwise result is 0.0. 
These per-element comparison results are\n\nused for the filtering.\n\nA texture base address is assumed to be aligned to a 16 byte boundary, and the address given by the\n\ncoordinate vector must be naturally aligned to a multiple of the access size. If an address is not\n\nproperly aligned, the resulting behavior is undefined; i.e., the access may proceed by silently\n\nmasking off low-order address bits to achieve proper rounding, or the instruction may fault.\n\ntld4.2d\n\nFor 2D textures, operand c specifies coordinates as a two-element, 32-bit floating-point vector.\n\nAn optional operand e may be specified. Operand e is a vector of type .v2.s32 that\n\nspecifies coordinate offset. Offset is applied to coordinates before doing texture fetch. Offset\n\nvalue is in the range of -8 to +7.\n\ntld4.a2d\n\nTexture array selection, followed by tld4 texture fetch of 2d texture. For 2d texture arrays\n\noperand c is a four element, 32-bit vector. The first element in operand c is interpreted as an\n\nunsigned integer index (.u32) into the texture array, and the next two elements are interpreted\n\nas 32-bit floating point coordinates of 2d texture. The fourth element is ignored.\n\nAn optional operand e may be specified. Operand e is a vector of type .v2.s32 that\n\nspecifies coordinate offset. Offset is applied to coordinates before doing texture fetch. Offset\n\nvalue is in the range of -8 to +7.\n\ntld4.cube\n\nFor cubemap textures, operand c specifies four-element vector which comprises three\n\nfloating-point coordinates (s, t, r) and a fourth padding argument which is ignored.\n\nCubemap textures are special two-dimensional layered textures consisting of six layers that\n\nrepresent the faces of a cube. All layers in a cubemap are of the same size and are square (i.e.,\n\nwidth equals height).\n\nCoordinates (s, t, r) are projected onto one of the six cube faces. 
The (s, t, r) coordinates can be\n\nthought of as a direction vector emanating from the center of the cube. Of the three coordinates (s,\n\nt, r), the coordinate of the largest magnitude (the major axis) selects the cube face. Then, the\n\nother two coordinates (the minor axes) are divided by the absolute value of the major axis to\n\nproduce a new (s, t) coordinate pair to lookup into the selected cube face.\n\nOffset vector opera ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tld4"
            };

        case "total_smem_size":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-total-smem-size\" target=\"_blank\" rel=\"noopener noreferrer\">total_smem_size <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %total_smem_size</h1><section id=\"special-registers-total-smem-size\">\n\n\n<p>Total size of shared memory used by a CTA of a kernel.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %total_smem_size;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register initialized with total size of shared memory allocated\n(statically and dynamically, excluding the shared memory reserved for the NVIDIA system software\nuse) for the CTA of a kernel at launch time.</p>\n<p>Size is returned in multiples of shared memory allocation unit size supported by target\narchitecture.</p>\n<p>Allocation unit values are as follows:</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 55%\"/>\n<col style=\"width: 45%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Target architecture</p></th>\n<th class=\"head\"><p>Shared memory allocation unit size</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_2x</span></code></p></td>\n<td><p>128 bytes</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_3x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_5x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_6x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_7x</span></code></p></td>\n<td><p>256 
bytes</p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">sm_8x</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_9x</span></code></p></td>\n<td><p>128 bytes</p></td>\n</tr>\n</tbody>\n</table>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 4.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32  %r, %total_smem_size;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Total size of shared memory used by a CTA of a kernel.\n\nSyntax (predefined)\n\n.sreg .u32 %total_smem_size;\n\nDescription\n\nA predefined, read-only special register initialized with total size of shared memory allocated\n\n(statically and dynamically, excluding the shared memory reserved for the NVIDIA system software\n\nuse) for the CTA of a kernel at launch time.\n\nSize is returned in multiples of shared memory allocation unit size supported by target\n\narchitecture.\n\nAllocation unit values are as follows:\n\n\n\nTarget architecture\n\nShared memory allocation unit size\n\nsm_2x\n\n128 bytes\n\nsm_3x, sm_5x, sm_6x, sm_7x\n\n256 bytes\n\nsm_8x, sm_9x\n\n128 bytes\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 4.1.\n\nTarget ISA Notes\n\nRequires sm_20 or higher.\n\nExamples\n\nmov.u32  %r, %total_smem_size;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-total-smem-size"
            };

        case "trap":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap\" target=\"_blank\" rel=\"noopener noreferrer\">trap <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Miscellaneous Instructions: trap</h1><section id=\"miscellaneous-instructions-trap\">\n\n\n<p>Perform trap operation.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>trap;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Abort execution and generate an interrupt to the host CPU.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>    trap;\n@p  trap;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Perform trap operation.\n\nSyntax\n\ntrap;\n\nDescription\n\nAbort execution and generate an interrupt to the host CPU.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n    trap;\n\n@p  trap;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-trap"
            };

        case "txq":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-txq\" target=\"_blank\" rel=\"noopener noreferrer\">txq <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Texture Instructions: txq</h1><section id=\"texture-instructions-txq\">\n\n\n<p>Query texture and sampler attributes.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>txq.tquery.b32         d, [a];       // texture attributes\ntxq.level.tlquery.b32  d, [a], lod;  // texture attributes\ntxq.squery.b32         d, [a];       // sampler attributes\n\n.tquery  = { .width, .height, .depth,\n             .channel_data_type, .channel_order,\n             .normalized_coords, .array_size,\n             .num_mipmap_levels, .num_samples};\n\n.tlquery = { .width, .height, .depth };\n\n.squery  = { .force_unnormalized_coords, .filter_mode,\n             .addr_mode_0, addr_mode_1, addr_mode_2 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Query an attribute of a texture or sampler. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is either a <code class=\"docutils literal notranslate\"><span class=\"pre\">.texref</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">.samplerref</span></code> variable, or a <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> register.</p>\n<table class=\"table-no-stripes docutils align-default\">\n<colgroup>\n<col style=\"width: 12%\"/>\n<col style=\"width: 88%\"/>\n</colgroup>\n<thead>\n<tr class=\"row-odd\"><th class=\"head\"><p>Query</p></th>\n<th class=\"head\"><p>Returns</p></th>\n</tr>\n</thead>\n<tbody>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.width</span></code></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.height</span></code></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.depth</span></code></p>\n</td>\n<td><p>value in elements</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.channel_data_type</span></code></p></td>\n<td><p>Unsigned integer corresponding to source language\u2019s channel data type enumeration. If the source language combines channel data type and channel order into a single enumeration type, that value is returned for both <code class=\"docutils literal notranslate\"><span class=\"pre\">channel_data_type</span></code> and channel_order queries.</p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.channel_order</span></code></p></td>\n<td><p>Unsigned integer corresponding to source language\u2019s channel order enumeration. 
If the source language combines channel data type and channel order into a single enumeration type, that value is returned for both <code class=\"docutils literal notranslate\"><span class=\"pre\">channel_data_type</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">channel_order</span></code> queries.</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.normalized_coords</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">1</span></code> (<code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code>) or <code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code> (<code class=\"docutils literal notranslate\"><span class=\"pre\">False</span></code>).</p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.force_unnormalized_coords</span></code></p></td>\n<td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">1</span></code> (<code class=\"docutils literal notranslate\"><span class=\"pre\">True)</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">0</span></code> (<code class=\"docutils literal notranslate\"><span class=\"pre\">False).</span></code> Defined only for <code class=\"docutils literal notranslate\"><span class=\"pre\">.samplerref</span></code> variables in independent texture mode. 
Overrides the <code class=\"docutils literal notranslate\"><span class=\"pre\">normalized_coords</span></code> field of a <code class=\"docutils literal notranslate\"><span class=\"pre\">.texref</span></code> variable used with a <code class=\"docutils literal notranslate\"><span class=\"pre\">.samplerref</span></code> in a <code class=\"docutils literal notranslate\"><span class=\"pre\">tex</span></code> instruction.</p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.filter_mode</span></code></p></td>\n<td><p>Integer from <code class=\"docutils literal notranslate\"><span class=\"pre\">enum</span> <span class=\"pre\">{</span> <span class=\"pre\">nearest,</span> <span class=\"pre\">linear</span> <span class=\"pre\">}</span></code></p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.addr_mode_0</span></code></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.addr_mode_1</span></code></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.addr_mode_2</span></code></p>\n</td>\n<td><p>Integer from <code class=\"docutils literal notranslate\"><span class=\"pre\">enum</span> <span class=\"pre\">{</span> <span class=\"pre\">wrap,</span> <span class=\"pre\">mirror,</span> <span class=\"pre\">clamp_ogl,</span> <span class=\"pre\">clamp_to_edge,</span> <span class=\"pre\">clamp_to_border</span> <span class=\"pre\">}</span></code></p></td>\n</tr>\n<tr class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.array_size</span></code></p></td>\n<td><p>For a texture array, number of textures in array, 0 otherwise.</p></td>\n</tr>\n<tr class=\"row-even\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.num_mipmap_levels</span></code></p></td>\n<td><p>For a mipmapped texture, number of levels of details (LOD), 0 otherwise.</p></td>\n</tr>\n<tr 
class=\"row-odd\"><td><p><code class=\"docutils literal notranslate\"><span class=\"pre\">.num_samples</span></code></p></td>\n<td><p>For a multi-sample texture, number of samples, 0 otherwise.</p></td>\n</tr>\n</tbody>\n</table>\n<p>Texture attributes are queried by supplying a <code class=\"docutils literal notranslate\"><span class=\"pre\">.texref</span></code> argument to <code class=\"docutils literal notranslate\"><span class=\"pre\">txq</span></code>. In unified mode,\nsampler attributes are also accessed via a <code class=\"docutils literal notranslate\"><span class=\"pre\">.texref</span></code> argument, and in independent mode sampler\nattributes are accessed via a separate <code class=\"docutils literal notranslate\"><span class=\"pre\">.samplerref</span></code> argument.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">txq.level</span></code></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">txq.level</span></code> requires an additional 32bit integer argument, <code class=\"docutils literal notranslate\"><span class=\"pre\">lod</span></code>, which specifies LOD and\nqueries requested attribute for the specified LOD.</p>\n<p><strong>Indirect texture access</strong></p>\n<p>Beginning with PTX ISA version 3.1, indirect texture access is supported in unified mode for target\narchitecture <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher. 
In indirect access, operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is a <code class=\"docutils literal notranslate\"><span class=\"pre\">.u64</span></code> register holding\nthe address of a <code class=\"docutils literal notranslate\"><span class=\"pre\">.texref</span></code> variable.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.5.</p>\n<p>Channel data type and channel order queries were added in PTX ISA version 2.1.</p>\n<p>The <code class=\"docutils literal notranslate\"><span class=\"pre\">.force_unnormalized_coords</span></code> query was added in PTX ISA version 2.2.</p>\n<p>Indirect texture access introduced in PTX ISA version 3.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">.array_size</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.num_mipmap_levels</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">.num_samples</span></code> samples queries were added in PTX ISA\nversion 4.1.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">txq.level</span></code> introduced in PTX ISA version 4.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p>Indirect texture access requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Querying the number of mipmap levels requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p>Querying the number of samples requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">txq.level</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text 
notranslate\"><div class=\"highlight\"><pre><span></span>txq.width.b32       %r1, [tex_A];\ntxq.filter_mode.b32 %r1, [tex_A];   // unified mode\ntxq.addr_mode_0.b32 %r1, [smpl_B];  // independent mode\ntxq.level.width.b32 %r1, [tex_A], %r_lod;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Query texture and sampler attributes.\n\nSyntax\n\ntxq.tquery.b32         d, [a];       // texture attributes\n\ntxq.level.tlquery.b32  d, [a], lod;  // texture attributes\n\ntxq.squery.b32         d, [a];       // sampler attributes\n\n.tquery  = { .width, .height, .depth,\n\n             .channel_data_type, .channel_order,\n\n             .normalized_coords, .array_size,\n\n             .num_mipmap_levels, .num_samples};\n\n.tlquery = { .width, .height, .depth };\n\n.squery  = { .force_unnormalized_coords, .filter_mode,\n\n             .addr_mode_0, addr_mode_1, addr_mode_2 };\n\nDescription\n\nQuery an attribute of a texture or sampler. Operand a is either a .texref or .samplerref variable, or a .u64 register.\n\n\n\nQuery\n\nReturns\n\n.width\n\n.height\n\n.depth\n\nvalue in elements\n\n.channel_data_type\n\nUnsigned integer corresponding to source language\u2019s channel data type enumeration. If the source language combines channel data type and channel order into a single enumeration type, that value is returned for both channel_data_type and channel_order queries.\n\n.channel_order\n\nUnsigned integer corresponding to source language\u2019s channel order enumeration. If the source language combines channel data type and channel order into a single enumeration type, that value is returned for both channel_data_type and channel_order queries.\n\n.normalized_coords\n\n1 (True) or 0 (False).\n\n.force_unnormalized_coords\n\n1 (True) or 0 (False). Defined only for .samplerref variables in independent texture mode. 
Overrides the normalized_coords field of a .texref variable used with a .samplerref in a tex instruction.\n\n.filter_mode\n\nInteger from enum { nearest, linear }\n\n.addr_mode_0\n\n.addr_mode_1\n\n.addr_mode_2\n\nInteger from enum { wrap, mirror, clamp_ogl, clamp_to_edge, clamp_to_border }\n\n.array_size\n\nFor a texture array, number of textures in array, 0 otherwise.\n\n.num_mipmap_levels\n\nFor a mipmapped texture, number of levels of details (LOD), 0 otherwise.\n\n.num_samples\n\nFor a multi-sample texture, number of samples, 0 otherwise.\n\nTexture attributes are queried by supplying a .texref argument to txq. In unified mode,\n\nsampler attributes are also accessed via a .texref argument, and in independent mode sampler\n\nattributes are accessed via a separate .samplerref argument.\n\ntxq.level\n\ntxq.level requires an additional 32bit integer argument, lod, which specifies LOD and\n\nqueries requested attribute for the specified LOD.\n\nIndirect texture access\n\nBeginning with PTX ISA version 3.1, indirect texture access is supported in unified mode for target\n\narchitecture sm_20 or higher. 
In indirect access, operand a is a .u64 register holding\n\nthe address of a .texref variable.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.5.\n\nChannel data type and channel order queries were added in PTX ISA version 2.1.\n\nThe .force_unnormalized_coords query was added in PTX ISA version 2.2.\n\nIndirect texture access introduced in PTX ISA version 3.1.\n\n.array_size, .num_mipmap_levels, .num_samples samples queries were added in PTX ISA\n\nversion 4.1.\n\ntxq.level introduced in PTX ISA version 4.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nIndirect texture access requires sm_20 or higher.\n\nQuerying the number of mipmap levels requires sm_20 or higher.\n\nQuerying the number of samples requires sm_30 or higher.\n\ntxq.level requires sm_30 or higher.\n\nExamples\n\ntxq.width.b32       %r1, [tex_A];\n\ntxq.filter_mode.b32 %r1, [tex_A];   // unified mode\n\ntxq.addr_mode_0.b32 %r1, [smpl_B];  // independent mode\n\ntxq.level.width.b32 %r1, [tex_A], %r_lod;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-txq"
            };

        case "vabsdiff":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax\" target=\"_blank\" rel=\"noopener noreferrer\">vabsdiff <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax</h1><section id=\"scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax\">\n\n\n<p>Integer byte/half-word/word addition/subtraction.</p>\n<p><strong>vabsdiff</strong></p>\n<p>Integer byte/half-word/word absolute value of difference.</p>\n<p><strong>vmin, vmax</strong></p>\n<p>Integer byte/half-word/word minimum/maximum.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// 32-bit scalar operation, with optional secondary operation\nvop.dtype.atype.btype{.sat}       d, a{.asel}, b{.bsel};\nvop.dtype.atype.btype{.sat}.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\nvop.dtype.atype.btype{.sat}  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vadd, vsub, vabsdiff, vmin, vmax };\n.dtype = .atype = .btype = { .u32, .s32 };\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.op2   = { .add, .min, .max };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Perform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = partSelectSignExtend( a, atype, asel );\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n    case vadd:     tmp = ta + tb;\n    case vsub:     tmp = ta - tb;\n    case vabsdiff: tmp = | ta - tb |;\n    case vmin:     tmp 
= MIN( ta, tb );\n    case vmax:     tmp = MAX( ta, tb );\n}\n// saturate, taking into account destination type and merge operations\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vadd</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vsub</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vabsdiff</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmin</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmax</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vadd.s32.u32.s32.sat      r1, r2.b0, r3.h0;\nvsub.s32.s32.u32.sat      r1, r2.h1, r3.h1;\nvabsdiff.s32.s32.s32.sat  r1.h0, r2.b0, r3.b2, c;\nvmin.s32.s32.s32.sat.add  r1, r2, r3, c;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer byte/half-word/word addition/subtraction.\n\nvabsdiff\n\nInteger byte/half-word/word absolute value of difference.\n\nvmin, vmax\n\nInteger byte/half-word/word minimum/maximum.\n\nSyntax\n\n// 32-bit scalar operation, with optional secondary operation\n\nvop.dtype.atype.btype{.sat}       d, a{.asel}, b{.bsel};\n\nvop.dtype.atype.btype{.sat}.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\n\nvop.dtype.atype.btype{.sat}  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vadd, vsub, vabsdiff, vmin, vmax };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.op2   = { .add, .min, .max };\n\nDescription\n\nPerform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a, atype, asel );\n\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n\n    case vadd:     tmp = ta + tb;\n\n    case vsub:     tmp = ta - tb;\n\n    case vabsdiff: tmp = | ta - tb |;\n\n    case vmin:     tmp = MIN( ta, tb );\n\n    case vmax:     tmp = MAX( ta, tb );\n\n}\n\n// saturate, taking into account destination type and merge operations\n\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\n\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\n\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvadd, vsub, vabsdiff, vmin, vmax require sm_20 or higher.\n\nExamples\n\nvadd.s32.u32.s32.sat      r1, r2.b0, r3.h0;\n\nvsub.s32.s32.u32.sat      r1, r2.h1, r3.h1;\n\nvabsdiff.s32.s32.s32.sat  r1.h0, r2.b0, r3.b2, c;\n\nvmin.s32.s32.s32.sat.add  r1, r2, r3, c;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax"
            };

        case "vabsdiff2":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2\" target=\"_blank\" rel=\"noopener noreferrer\">vabsdiff2 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2</h1><section id=\"simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2\">\n\n\n<p>Integer dual half-word SIMD addition/subtraction.</p>\n<p><strong>vavrg2</strong></p>\n<p>Integer dual half-word SIMD average.</p>\n<p><strong>vabsdiff2</strong></p>\n<p>Integer dual half-word SIMD absolute value of difference.</p>\n<p><strong>vmin2, vmax2</strong></p>\n<p>Integer dual half-word SIMD minimum/maximum.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// SIMD instruction with secondary SIMD merge operation\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n   .asel defaults to .h10\n   .bsel defaults to .h32\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Two-way SIMD parallel arithmetic operation with secondary operation.</p>\n<p>Elements of each dual half-word source to the operation are selected from any of the four half-words\nin the two source operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> using the <code class=\"docutils literal notranslate\"><span 
class=\"pre\">asel</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">bsel</span></code> modifiers.</p>\n<p>The selected half-words are then operated on in parallel.</p>\n<p>The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.</p>\n<p>For instructions with a secondary SIMD merge operation:</p>\n<ul class=\"simple\">\n<li><p>For half-word positions indicated in mask, the selected half-word results are copied into\ndestination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. For all other positions, the corresponding half-word from source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>\nis copied to <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p>For instructions with a secondary accumulate operation:</p>\n<ul class=\"simple\">\n<li><p>For half-word positions indicated in mask, the selected half-word results are added to operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>, producing a result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract pairs of half-words and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_2( a, b, .asel, .atype );\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i&lt;2; i++) {\n    switch ( vop2 ) {\n       case vadd2:             t[i] = Va[i] + Vb[i];\n       case vsub2:             t[i] = Va[i] - Vb[i];\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) &gt;= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) &gt;&gt; 1;\n                           
    } else {\n                                   t[i] = ( Va[i] + Vb[i] ) &gt;&gt; 1;\n                               }\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i&lt;2; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i&lt;2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vadd2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vsub2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">varvg2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vabsdiff2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmin2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmax2</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vadd2.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer dual half-word SIMD addition/subtraction.\n\nvavrg2\n\nInteger dual half-word SIMD average.\n\nvabsdiff2\n\nInteger dual half-word SIMD absolute value of difference.\n\nvmin2, vmax2\n\nInteger dual half-word SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n\n   .asel defaults to .h10\n\n   .bsel defaults to .h32\n\nDescription\n\nTwo-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each dual half-word source to the operation are selected from any of the four half-words\n\nin the two source operands a and b using the asel and bsel modifiers.\n\nThe selected half-words are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor half-word positions indicated in mask, the selected half-word results are copied into\n\ndestination d. 
For all other positions, the corresponding half-word from source operand c\n\nis copied to d.\n\nFor instructions with a secondary accumulate operation:\n\nFor half-word positions indicated in mask, the selected half-word results are added to operand\n\nc, producing a result in d.\n\nSemantics\n\n// extract pairs of half-words and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_2( a, b, .asel, .atype );\n\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n\n    switch ( vop2 ) {\n\n       case vadd2:             t[i] = Va[i] + Vb[i];\n\n       case vsub2:             t[i] = Va[i] - Vb[i];\n\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n                               } else {\n\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n                               }\n\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n\n    }\n\n    if (.sat) {\n\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n\n    }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n    d = c;\n\n    for (i=0; i<2; i++) {  d += mask[i] ? t[i] : 0;  }\n\n} else {\n\n    d = 0;\n\n    for (i=0; i<2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd2, vsub2, varvg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.\n\nExamples\n\nvadd2.s32.s32.u32.sat  r1, r2, r3, r1;\n\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\n\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2"
            };

        case "vabsdiff4":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4\" target=\"_blank\" rel=\"noopener noreferrer\">vabsdiff4 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4</h1><section id=\"simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4\">\n\n\n<p>Integer quad byte SIMD addition/subtraction.</p>\n<p><strong>vavrg4</strong></p>\n<p>Integer quad byte SIMD average.</p>\n<p><strong>vabsdiff4</strong></p>\n<p>Integer quad byte SIMD absolute value of difference.</p>\n<p><strong>vmin4, vmax4</strong></p>\n<p>Integer quad byte SIMD minimum/maximum.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// SIMD instruction with secondary SIMD merge operation\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\nvop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .b0,\n           .b1, .b10\n           .b2, .b20, .b21, .b210,\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n    defaults to .b3210\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n   .asel defaults to .b3210\n   .bsel defaults to .b7654\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Four-way SIMD parallel arithmetic operation with secondary operation.</p>\n<p>Elements of each quad byte source to the operation are selected from any of the eight bytes in the\ntwo source operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span 
class=\"pre\">b</span></code> using the <code class=\"docutils literal notranslate\"><span class=\"pre\">asel</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">bsel</span></code> modifiers.</p>\n<p>The selected bytes are then operated on in parallel.</p>\n<p>The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.</p>\n<p>For instructions with a secondary SIMD merge operation:</p>\n<ul class=\"simple\">\n<li><p>For byte positions indicated in mask, the selected byte results are copied into destination\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. For all other positions, the corresponding byte from source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is copied to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p>For instructions with a secondary accumulate operation:</p>\n<ul class=\"simple\">\n<li><p>For byte positions indicated in mask, the selected byte results are added to operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>,\nproducing a result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract quads of bytes and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_4( a, b, .asel, .atype );\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\nVc = extractAndSignExt_4( c );\nfor (i=0; i&lt;4; i++) {\n    switch ( vop4 ) {\n        case vadd4:            t[i] = Va[i] + Vb[i];\n        case vsub4:            t[i] = Va[i] - Vb[i];\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) &gt;= 0 ) {\n                                   t[i] = ( Va[i] 
+ Vb[i] + 1 ) &gt;&gt; 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) &gt;&gt; 1;\n                               }\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i&lt;4; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i&lt;4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vadd4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vsub4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">varvg4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vabsdiff4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmin4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmax4</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vadd4.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer quad byte SIMD addition/subtraction.\n\nvavrg4\n\nInteger quad byte SIMD average.\n\nvabsdiff4\n\nInteger quad byte SIMD absolute value of difference.\n\nvmin4, vmax4\n\nInteger quad byte SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\nvop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask  = { .b0,\n\n           .b1, .b10\n\n           .b2, .b20, .b21, .b210,\n\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n\n    defaults to .b3210\n\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n\n   .asel defaults to .b3210\n\n   .bsel defaults to .b7654\n\nDescription\n\nFour-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each quad byte source to the operation are selected from any of the eight bytes in the\n\ntwo source operands a and b using the asel and bsel modifiers.\n\nThe selected bytes are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor byte positions indicated in mask, the selected byte results are copied into destination\n\nd. 
For all other positions, the corresponding byte from source operand c is copied to\n\nd.\n\nFor instructions with a secondary accumulate operation:\n\nFor byte positions indicated in mask, the selected byte results are added to operand c,\n\nproducing a result in d.\n\nSemantics\n\n// extract quads of bytes and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_4( a, b, .asel, .atype );\n\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_4( c );\n\nfor (i=0; i<4; i++) {\n\n    switch ( vop4 ) {\n\n        case vadd4:            t[i] = Va[i] + Vb[i];\n\n        case vsub4:            t[i] = Va[i] - Vb[i];\n\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n                               } else {\n\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n                               }\n\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n\n    }\n\n    if (.sat) {\n\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n\n    }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n    d = c;\n\n    for (i=0; i<4; i++) {  d += mask[i] ? t[i] : 0;  }\n\n} else {\n\n    d = 0;\n\n    for (i=0; i<4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd4, vsub4, varvg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.\n\nExamples\n\nvadd4.s32.s32.u32.sat  r1, r2, r3, r1;\n\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\n\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4"
            };

        // Documentation entry for the PTX `vadd` mnemonic. The returned object carries the
        // rendered HTML, a plain-text tooltip and the URL of the NVIDIA PTX ISA section
        // "Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax".
        // Each string literal must stay on one physical line: a raw line break inside a
        // double-quoted literal is a syntax error.
        case "vadd":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax\" target=\"_blank\" rel=\"noopener noreferrer\">vadd <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax</h1><section id=\"scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax\">\n\n\n<p>Integer byte/half-word/word addition/subtraction.</p>\n<p><strong>vabsdiff</strong></p>\n<p>Integer byte/half-word/word absolute value of difference.</p>\n<p><strong>vmin, vmax</strong></p>\n<p>Integer byte/half-word/word minimum/maximum.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// 32-bit scalar operation, with optional secondary operation\nvop.dtype.atype.btype{.sat}       d, a{.asel}, b{.bsel};\nvop.dtype.atype.btype{.sat}.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\nvop.dtype.atype.btype{.sat}  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vadd, vsub, vabsdiff, vmin, vmax };\n.dtype = .atype = .btype = { .u32, .s32 };\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.op2   = { .add, .min, .max };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Perform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = partSelectSignExtend( a, atype, asel );\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n    case vadd:     tmp = ta + tb;\n    case vsub:     tmp = ta - tb;\n    case vabsdiff: tmp = | ta - tb |;\n    case vmin:     tmp = MIN( ta, tb );\n    case vmax:     tmp = MAX( ta, tb );\n}\n// saturate, taking into account destination type and merge operations\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vadd</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vsub</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vabsdiff</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmin</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmax</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vadd.s32.u32.s32.sat      r1, r2.b0, r3.h0;\nvsub.s32.s32.u32.sat      r1, r2.h1, r3.h1;\nvabsdiff.s32.s32.s32.sat  r1.h0, r2.b0, r3.b2, c;\nvmin.s32.s32.s32.sat.add  r1, r2, r3, c;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer byte/half-word/word addition/subtraction.\n\nvabsdiff\n\nInteger byte/half-word/word absolute value of difference.\n\nvmin, vmax\n\nInteger byte/half-word/word minimum/maximum.\n\nSyntax\n\n// 32-bit scalar operation, with optional secondary operation\n\nvop.dtype.atype.btype{.sat}       d, a{.asel}, b{.bsel};\n\nvop.dtype.atype.btype{.sat}.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\n\nvop.dtype.atype.btype{.sat}  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vadd, vsub, vabsdiff, vmin, vmax };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.op2   = { .add, .min, .max };\n\nDescription\n\nPerform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a, atype, asel );\n\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n\n    case vadd:     tmp = ta + tb;\n\n    case vsub:     tmp = ta - tb;\n\n    case vabsdiff: tmp = | ta - tb |;\n\n    case vmin:     tmp = MIN( ta, tb );\n\n    case vmax:     tmp = MAX( ta, tb );\n\n}\n\n// saturate, taking into account destination type and merge operations\n\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\n\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\n\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvadd, vsub, vabsdiff, vmin, vmax require sm_20 or higher.\n\nExamples\n\nvadd.s32.u32.s32.sat      r1, r2.b0, r3.h0;\n\nvsub.s32.s32.u32.sat      r1, r2.h1, r3.h1;\n\nvabsdiff.s32.s32.s32.sat  r1.h0, r2.b0, r3.b2, c;\n\nvmin.s32.s32.s32.sat.add  r1, r2, r3, c;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax"
            };

        // Documentation entry for the PTX `vadd2` mnemonic. The returned object carries the
        // rendered HTML, a plain-text tooltip and the URL of the NVIDIA PTX ISA section
        // "SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2".
        // Each string literal must stay on one physical line: a raw line break inside a
        // double-quoted literal is a syntax error.
        // NOTE: the "Target ISA Notes" text previously spelled the mnemonic `varvg2`; it is
        // corrected to `vavrg2` here so it matches the syntax section of the same entry.
        case "vadd2":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2\" target=\"_blank\" rel=\"noopener noreferrer\">vadd2 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2</h1><section id=\"simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2\">\n\n\n<p>Integer dual half-word SIMD addition/subtraction.</p>\n<p><strong>vavrg2</strong></p>\n<p>Integer dual half-word SIMD average.</p>\n<p><strong>vabsdiff2</strong></p>\n<p>Integer dual half-word SIMD absolute value of difference.</p>\n<p><strong>vmin2, vmax2</strong></p>\n<p>Integer dual half-word SIMD minimum/maximum.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// SIMD instruction with secondary SIMD merge operation\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n   .asel defaults to .h10\n   .bsel defaults to .h32\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Two-way SIMD parallel arithmetic operation with secondary operation.</p>\n<p>Elements of each dual half-word source to the operation are selected from any of the four half-words\nin the two source operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> using the <code class=\"docutils literal notranslate\"><span class=\"pre\">asel</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">bsel</span></code> modifiers.</p>\n<p>The selected half-words are then operated on in parallel.</p>\n<p>The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.</p>\n<p>For instructions with a secondary SIMD merge operation:</p>\n<ul class=\"simple\">\n<li><p>For half-word positions indicated in mask, the selected half-word results are copied into\ndestination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. For all other positions, the corresponding half-word from source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>\nis copied to <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p>For instructions with a secondary accumulate operation:</p>\n<ul class=\"simple\">\n<li><p>For half-word positions indicated in mask, the selected half-word results are added to operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>, producing a result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract pairs of half-words and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_2( a, b, .asel, .atype );\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i&lt;2; i++) {\n    switch ( vop2 ) {\n       case vadd2:             t[i] = Va[i] + Vb[i];\n       case vsub2:             t[i] = Va[i] - Vb[i];\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) &gt;= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) &gt;&gt; 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) &gt;&gt; 1;\n                               }\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i&lt;2; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i&lt;2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vadd2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vsub2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vavrg2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vabsdiff2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmin2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmax2</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vadd2.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer dual half-word SIMD addition/subtraction.\n\nvavrg2\n\nInteger dual half-word SIMD average.\n\nvabsdiff2\n\nInteger dual half-word SIMD absolute value of difference.\n\nvmin2, vmax2\n\nInteger dual half-word SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n\n   .asel defaults to .h10\n\n   .bsel defaults to .h32\n\nDescription\n\nTwo-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each dual half-word source to the operation are selected from any of the four half-words\n\nin the two source operands a and b using the asel and bsel modifiers.\n\nThe selected half-words are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor half-word positions indicated in mask, the selected half-word results are copied into\n\ndestination d. For all other positions, the corresponding half-word from source operand c\n\nis copied to d.\n\nFor instructions with a secondary accumulate operation:\n\nFor half-word positions indicated in mask, the selected half-word results are added to operand\n\nc, producing a result in d.\n\nSemantics\n\n// extract pairs of half-words and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_2( a, b, .asel, .atype );\n\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n\n    switch ( vop2 ) {\n\n       case vadd2:             t[i] = Va[i] + Vb[i];\n\n       case vsub2:             t[i] = Va[i] - Vb[i];\n\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n                               } else {\n\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n                               }\n\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n\n    }\n\n    if (.sat) {\n\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n\n    }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n    d = c;\n\n    for (i=0; i<2; i++) {  d += mask[i] ? t[i] : 0;  }\n\n} else {\n\n    d = 0;\n\n    for (i=0; i<2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.\n\nExamples\n\nvadd2.s32.s32.u32.sat  r1, r2, r3, r1;\n\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\n\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2"
            };

        // Documentation entry for the PTX `vadd4` mnemonic. The returned object carries the
        // rendered HTML, a plain-text tooltip and the URL of the NVIDIA PTX ISA section
        // "SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4".
        // Each string literal must stay on one physical line: a raw line break inside a
        // double-quoted literal is a syntax error.
        // NOTE: the "Target ISA Notes" text previously spelled the mnemonic `varvg4`; it is
        // corrected to `vavrg4` here so it matches the syntax section of the same entry.
        case "vadd4":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4\" target=\"_blank\" rel=\"noopener noreferrer\">vadd4 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4</h1><section id=\"simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4\">\n\n\n<p>Integer quad byte SIMD addition/subtraction.</p>\n<p><strong>vavrg4</strong></p>\n<p>Integer quad byte SIMD average.</p>\n<p><strong>vabsdiff4</strong></p>\n<p>Integer quad byte SIMD absolute value of difference.</p>\n<p><strong>vmin4, vmax4</strong></p>\n<p>Integer quad byte SIMD minimum/maximum.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// SIMD instruction with secondary SIMD merge operation\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\nvop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .b0,\n           .b1, .b10\n           .b2, .b20, .b21, .b210,\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n    defaults to .b3210\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n   .asel defaults to .b3210\n   .bsel defaults to .b7654\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Four-way SIMD parallel arithmetic operation with secondary operation.</p>\n<p>Elements of each quad byte source to the operation are selected from any of the eight bytes in the\ntwo source operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> using the <code class=\"docutils literal notranslate\"><span class=\"pre\">asel</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">bsel</span></code> modifiers.</p>\n<p>The selected bytes are then operated on in parallel.</p>\n<p>The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.</p>\n<p>For instructions with a secondary SIMD merge operation:</p>\n<ul class=\"simple\">\n<li><p>For byte positions indicated in mask, the selected byte results are copied into destination\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. For all other positions, the corresponding byte from source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is copied to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p>For instructions with a secondary accumulate operation:</p>\n<ul class=\"simple\">\n<li><p>For byte positions indicated in mask, the selected byte results are added to operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>,\nproducing a result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract quads of bytes and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_4( a, b, .asel, .atype );\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\nVc = extractAndSignExt_4( c );\nfor (i=0; i&lt;4; i++) {\n    switch ( vop4 ) {\n        case vadd4:            t[i] = Va[i] + Vb[i];\n        case vsub4:            t[i] = Va[i] - Vb[i];\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) &gt;= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) &gt;&gt; 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) &gt;&gt; 1;\n                               }\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i&lt;4; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i&lt;4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vadd4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vsub4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vavrg4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vabsdiff4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmin4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmax4</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vadd4.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer quad byte SIMD addition/subtraction.\n\nvavrg4\n\nInteger quad byte SIMD average.\n\nvabsdiff4\n\nInteger quad byte SIMD absolute value of difference.\n\nvmin4, vmax4\n\nInteger quad byte SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\nvop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask  = { .b0,\n\n           .b1, .b10\n\n           .b2, .b20, .b21, .b210,\n\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n\n    defaults to .b3210\n\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n\n   .asel defaults to .b3210\n\n   .bsel defaults to .b7654\n\nDescription\n\nFour-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each quad byte source to the operation are selected from any of the eight bytes in the\n\ntwo source operands a and b using the asel and bsel modifiers.\n\nThe selected bytes are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor byte positions indicated in mask, the selected byte results are copied into destination\n\nd. For all other positions, the corresponding byte from source operand c is copied to\n\nd.\n\nFor instructions with a secondary accumulate operation:\n\nFor byte positions indicated in mask, the selected byte results are added to operand c,\n\nproducing a result in d.\n\nSemantics\n\n// extract quads of bytes and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_4( a, b, .asel, .atype );\n\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_4( c );\n\nfor (i=0; i<4; i++) {\n\n    switch ( vop4 ) {\n\n        case vadd4:            t[i] = Va[i] + Vb[i];\n\n        case vsub4:            t[i] = Va[i] - Vb[i];\n\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n                               } else {\n\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n                               }\n\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n\n    }\n\n    if (.sat) {\n\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n\n    }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n    d = c;\n\n    for (i=0; i<4; i++) {  d += mask[i] ? t[i] : 0;  }\n\n} else {\n\n    d = 0;\n\n    for (i=0; i<4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.\n\nExamples\n\nvadd4.s32.s32.u32.sat  r1, r2, r3, r1;\n\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\n\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4"
            };

        case "vavrg2":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2\" target=\"_blank\" rel=\"noopener noreferrer\">vavrg2 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2</h1><section id=\"simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2\">\n\n\n<p>Integer dual half-word SIMD addition/subtraction.</p>\n<p><strong>vavrg2</strong></p>\n<p>Integer dual half-word SIMD average.</p>\n<p><strong>vabsdiff2</strong></p>\n<p>Integer dual half-word SIMD absolute value of difference.</p>\n<p><strong>vmin2, vmax2</strong></p>\n<p>Integer dual half-word SIMD minimum/maximum.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// SIMD instruction with secondary SIMD merge operation\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n   .asel defaults to .h10\n   .bsel defaults to .h32\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Two-way SIMD parallel arithmetic operation with secondary operation.</p>\n<p>Elements of each dual half-word source to the operation are selected from any of the four half-words\nin the two source operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> using the <code class=\"docutils literal notranslate\"><span 
class=\"pre\">asel</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">bsel</span></code> modifiers.</p>\n<p>The selected half-words are then operated on in parallel.</p>\n<p>The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.</p>\n<p>For instructions with a secondary SIMD merge operation:</p>\n<ul class=\"simple\">\n<li><p>For half-word positions indicated in mask, the selected half-word results are copied into\ndestination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. For all other positions, the corresponding half-word from source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>\nis copied to <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p>For instructions with a secondary accumulate operation:</p>\n<ul class=\"simple\">\n<li><p>For half-word positions indicated in mask, the selected half-word results are added to operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>, producing a result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract pairs of half-words and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_2( a, b, .asel, .atype );\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i&lt;2; i++) {\n    switch ( vop2 ) {\n       case vadd2:             t[i] = Va[i] + Vb[i];\n       case vsub2:             t[i] = Va[i] - Vb[i];\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) &gt;= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) &gt;&gt; 1;\n                           
    } else {\n                                   t[i] = ( Va[i] + Vb[i] ) &gt;&gt; 1;\n                               }\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i&lt;2; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i&lt;2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vadd2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vsub2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">varvg2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vabsdiff2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmin2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmax2</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vadd2.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer dual half-word SIMD addition/subtraction.\n\nvavrg2\n\nInteger dual half-word SIMD average.\n\nvabsdiff2\n\nInteger dual half-word SIMD absolute value of difference.\n\nvmin2, vmax2\n\nInteger dual half-word SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n\n   .asel defaults to .h10\n\n   .bsel defaults to .h32\n\nDescription\n\nTwo-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each dual half-word source to the operation are selected from any of the four half-words\n\nin the two source operands a and b using the asel and bsel modifiers.\n\nThe selected half-words are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor half-word positions indicated in mask, the selected half-word results are copied into\n\ndestination d. 
For all other positions, the corresponding half-word from source operand c\n\nis copied to d.\n\nFor instructions with a secondary accumulate operation:\n\nFor half-word positions indicated in mask, the selected half-word results are added to operand\n\nc, producing a result in d.\n\nSemantics\n\n// extract pairs of half-words and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_2( a, b, .asel, .atype );\n\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n\n    switch ( vop2 ) {\n\n       case vadd2:             t[i] = Va[i] + Vb[i];\n\n       case vsub2:             t[i] = Va[i] - Vb[i];\n\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n                               } else {\n\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n                               }\n\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n\n    }\n\n    if (.sat) {\n\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n\n    }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n    d = c;\n\n    for (i=0; i<2; i++) {  d += mask[i] ? t[i] : 0;  }\n\n} else {\n\n    d = 0;\n\n    for (i=0; i<2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd2, vsub2, varvg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.\n\nExamples\n\nvadd2.s32.s32.u32.sat  r1, r2, r3, r1;\n\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\n\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2"
            };

        // Documentation record for the PTX `vavrg4` opcode. The text is the shared
        // "SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4"
        // section (see the <h1> and the url anchor below).
        //   html    - "For more information" external link followed by the section markup.
        //   tooltip - plain-text rendering of the section body, terminated by " ...".
        //   url     - anchor of the section in the NVIDIA PTX ISA reference.
        // NOTE(review): the "Target ISA Notes" text spells the mnemonic "varvg4" (html and
        // tooltip) while every other occurrence is "vavrg4"; the `.mask` list also has no
        // comma after `.b10`. Presumably both mirror the upstream docs - this file lives
        // under generated/, so fix at the source/generator rather than by hand. TODO confirm.
        case "vavrg4":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4\" target=\"_blank\" rel=\"noopener noreferrer\">vavrg4 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4</h1><section id=\"simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4\">\n\n\n<p>Integer quad byte SIMD addition/subtraction.</p>\n<p><strong>vavrg4</strong></p>\n<p>Integer quad byte SIMD average.</p>\n<p><strong>vabsdiff4</strong></p>\n<p>Integer quad byte SIMD absolute value of difference.</p>\n<p><strong>vmin4, vmax4</strong></p>\n<p>Integer quad byte SIMD minimum/maximum.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// SIMD instruction with secondary SIMD merge operation\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\nvop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .b0,\n           .b1, .b10\n           .b2, .b20, .b21, .b210,\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n    defaults to .b3210\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n   .asel defaults to .b3210\n   .bsel defaults to .b7654\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Four-way SIMD parallel arithmetic operation with secondary operation.</p>\n<p>Elements of each quad byte source to the operation are selected from any of the eight bytes in the\ntwo source operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span 
class=\"pre\">b</span></code> using the <code class=\"docutils literal notranslate\"><span class=\"pre\">asel</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">bsel</span></code> modifiers.</p>\n<p>The selected bytes are then operated on in parallel.</p>\n<p>The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.</p>\n<p>For instructions with a secondary SIMD merge operation:</p>\n<ul class=\"simple\">\n<li><p>For byte positions indicated in mask, the selected byte results are copied into destination\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. For all other positions, the corresponding byte from source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is copied to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p>For instructions with a secondary accumulate operation:</p>\n<ul class=\"simple\">\n<li><p>For byte positions indicated in mask, the selected byte results are added to operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>,\nproducing a result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract quads of bytes and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_4( a, b, .asel, .atype );\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\nVc = extractAndSignExt_4( c );\nfor (i=0; i&lt;4; i++) {\n    switch ( vop4 ) {\n        case vadd4:            t[i] = Va[i] + Vb[i];\n        case vsub4:            t[i] = Va[i] - Vb[i];\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) &gt;= 0 ) {\n                                   t[i] = ( Va[i] 
+ Vb[i] + 1 ) &gt;&gt; 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) &gt;&gt; 1;\n                               }\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i&lt;4; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i&lt;4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vadd4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vsub4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">varvg4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vabsdiff4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmin4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmax4</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vadd4.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer quad byte SIMD addition/subtraction.\n\nvavrg4\n\nInteger quad byte SIMD average.\n\nvabsdiff4\n\nInteger quad byte SIMD absolute value of difference.\n\nvmin4, vmax4\n\nInteger quad byte SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\nvop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask  = { .b0,\n\n           .b1, .b10\n\n           .b2, .b20, .b21, .b210,\n\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n\n    defaults to .b3210\n\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n\n   .asel defaults to .b3210\n\n   .bsel defaults to .b7654\n\nDescription\n\nFour-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each quad byte source to the operation are selected from any of the eight bytes in the\n\ntwo source operands a and b using the asel and bsel modifiers.\n\nThe selected bytes are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor byte positions indicated in mask, the selected byte results are copied into destination\n\nd. 
For all other positions, the corresponding byte from source operand c is copied to\n\nd.\n\nFor instructions with a secondary accumulate operation:\n\nFor byte positions indicated in mask, the selected byte results are added to operand c,\n\nproducing a result in d.\n\nSemantics\n\n// extract quads of bytes and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_4( a, b, .asel, .atype );\n\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_4( c );\n\nfor (i=0; i<4; i++) {\n\n    switch ( vop4 ) {\n\n        case vadd4:            t[i] = Va[i] + Vb[i];\n\n        case vsub4:            t[i] = Va[i] - Vb[i];\n\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n                               } else {\n\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n                               }\n\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n\n    }\n\n    if (.sat) {\n\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n\n    }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n    d = c;\n\n    for (i=0; i<4; i++) {  d += mask[i] ? t[i] : 0;  }\n\n} else {\n\n    d = 0;\n\n    for (i=0; i<4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd4, vsub4, varvg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.\n\nExamples\n\nvadd4.s32.s32.u32.sat  r1, r2, r3, r1;\n\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\n\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4"
            };

        // Documentation record for the PTX `.version` module directive (keyed without
        // the leading dot).
        //   html    - "For more information" external link followed by the
        //             "PTX Module Directives: .version" section markup.
        //   tooltip - plain-text rendering of the section body, terminated by " ...".
        //   url     - anchor of the section in the NVIDIA PTX ISA reference.
        case "version":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#ptx-module-directives-version\" target=\"_blank\" rel=\"noopener noreferrer\">version <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>PTX Module Directives: .version</h1><section id=\"ptx-module-directives-version\">\n\n\n<p>PTX ISA version number.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.version  major.minor    // major, minor are integers\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Specifies the PTX language version number.</p>\n<p>The <em>major</em> number is incremented when there are incompatible changes to the PTX language, such as\nchanges to the syntax or semantics. The version major number is used by the PTX compiler to ensure\ncorrect execution of legacy PTX code.</p>\n<p>The <em>minor</em> number is incremented when new features are added to PTX.</p>\n<p><strong>Semantics</strong></p>\n<p>Indicates that this module must be compiled with tools that support an equal or greater version\nnumber.</p>\n<p>Each PTX module must begin with a <code class=\"docutils literal notranslate\"><span class=\"pre\">.version</span></code> directive, and no other <code class=\"docutils literal notranslate\"><span class=\"pre\">.version</span></code> directive is\nallowed anywhere else within the module.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.version 3.1\n.version 3.0\n.version 2.3\n</pre></div>\n</div>\n</section>",
                "tooltip": "PTX ISA version number.\n\nSyntax\n\n.version  major.minor    // major, minor are integers\n\nDescription\n\nSpecifies the PTX language version number.\n\nThe major number is incremented when there are incompatible changes to the PTX language, such as\n\nchanges to the syntax or semantics. The version major number is used by the PTX compiler to ensure\n\ncorrect execution of legacy PTX code.\n\nThe minor number is incremented when new features are added to PTX.\n\nSemantics\n\nIndicates that this module must be compiled with tools that support an equal or greater version\n\nnumber.\n\nEach PTX module must begin with a .version directive, and no other .version directive is\n\nallowed anywhere else within the module.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.version 3.1\n\n.version 3.0\n\n.version 2.3\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#ptx-module-directives-version"
            };

        // Documentation record for the PTX `.visible` linking directive (keyed without
        // the leading dot).
        //   html    - "For more information" external link followed by the
        //             "Linking Directives: .visible" section markup.
        //   tooltip - plain-text rendering of the section body, terminated by " ...".
        //   url     - anchor of the section in the NVIDIA PTX ISA reference.
        // NOTE(review): the Description ends with "declared .visible outside the current."
        // in both html and tooltip, which reads as a truncated sentence. Presumably this
        // mirrors the upstream docs; correct at the source/generator, not here. TODO confirm.
        case "visible":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#linking-directives-visible\" target=\"_blank\" rel=\"noopener noreferrer\">visible <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Linking Directives: .visible</h1><section id=\"linking-directives-visible\">\n\n\n<p>Visible (externally) symbol declaration.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.visible identifier\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declares identifier to be globally visible. Unlike C, where identifiers are globally visible unless\ndeclared static, PTX identifiers are visible only within the current module unless declared\n<code class=\"docutils literal notranslate\"><span class=\"pre\">.visible</span></code> outside the current.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.visible .global .b32 foo;  // foo will be externally visible\n</pre></div>\n</div>\n</section>",
                "tooltip": "Visible (externally) symbol declaration.\n\nSyntax\n\n.visible identifier\n\nDescription\n\nDeclares identifier to be globally visible. Unlike C, where identifiers are globally visible unless\n\ndeclared static, PTX identifiers are visible only within the current module unless declared\n\n.visible outside the current.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.visible .global .b32 foo;  // foo will be externally visible\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#linking-directives-visible"
            };

        // Documentation record for the PTX `vmad` opcode.
        //   html    - "For more information" external link followed by the
        //             "Scalar Video Instructions: vmad" section markup.
        //   tooltip - plain-text rendering of the section body, terminated by " ...".
        //   url     - anchor of the section in the NVIDIA PTX ISA reference.
        // NOTE(review): in the Semantics pseudo-code the line
        // "c128[127:0] = (signedFinal) sext32( c ) : zext ( c );" has a ':' branch but no '?'.
        // Presumably this mirrors the upstream docs; correct at the source/generator rather
        // than by hand. TODO confirm.
        case "vmad":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vmad\" target=\"_blank\" rel=\"noopener noreferrer\">vmad <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Scalar Video Instructions: vmad</h1><section id=\"scalar-video-instructions-vmad\">\n\n\n<p>Integer byte/half-word/word multiply-accumulate.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// 32-bit scalar operation\nvmad.dtype.atype.btype{.sat}{.scale}     d, {-}a{.asel}, {-}b{.bsel},\n                                         {-}c;\nvmad.dtype.atype.btype.po{.sat}{.scale}  d, a{.asel}, b{.bsel}, c;\n\n.dtype = .atype = .btype = { .u32, .s32 };\n.asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.scale = { .shr7, .shr15 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Calculate <code class=\"docutils literal notranslate\"><span class=\"pre\">(a*b)</span> <span class=\"pre\">+</span> <span class=\"pre\">c</span></code>, with optional operand negates, <em>plus one</em> mode, and scaling.</p>\n<p>The source operands support optional negation with some restrictions. Although PTX syntax allows\nseparate negation of the <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> operands, internally this is represented as negation of the\nproduct <code class=\"docutils literal notranslate\"><span class=\"pre\">(a*b)</span></code>. That is, <code class=\"docutils literal notranslate\"><span class=\"pre\">(a*b)</span></code> is negated if and only if exactly one of <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is\nnegated. 
PTX allows negation of either <code class=\"docutils literal notranslate\"><span class=\"pre\">(a*b)</span></code> or <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>.</p>\n<p>The plus one mode (<code class=\"docutils literal notranslate\"><span class=\"pre\">.po</span></code>) computes <code class=\"docutils literal notranslate\"><span class=\"pre\">(a*b)</span> <span class=\"pre\">+</span> <span class=\"pre\">c</span> <span class=\"pre\">+</span> <span class=\"pre\">1</span></code>, which is used in computing averages. Source\noperands may not be negated in <code class=\"docutils literal notranslate\"><span class=\"pre\">.po</span></code> mode.</p>\n<p>The intermediate result of <code class=\"docutils literal notranslate\"><span class=\"pre\">(a*b)</span></code> is unsigned if atype and btype are unsigned and the product\n<code class=\"docutils literal notranslate\"><span class=\"pre\">(a*b)</span></code> is not negated; otherwise, the intermediate result is signed. 
Input <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> has the same\nsign as the intermediate result.</p>\n<p>The final result is unsigned if the intermediate result is unsigned and <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is not negated.</p>\n<p>Depending on the sign of the <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> operands, and the operand negates, the following\ncombinations of operands are supported for VMAD:</p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span> (u32 * u32) + u32  // intermediate unsigned; final unsigned\n-(u32 * u32) + s32  // intermediate   signed; final   signed\n (u32 * u32) - u32  // intermediate unsigned; final   signed\n (u32 * s32) + s32  // intermediate   signed; final   signed\n-(u32 * s32) + s32  // intermediate   signed; final   signed\n (u32 * s32) - s32  // intermediate   signed; final   signed\n (s32 * u32) + s32  // intermediate   signed; final   signed\n-(s32 * u32) + s32  // intermediate   signed; final   signed\n (s32 * u32) - s32  // intermediate   signed; final   signed\n (s32 * s32) + s32  // intermediate   signed; final   signed\n-(s32 * s32) + s32  // intermediate   signed; final   signed\n (s32 * s32) - s32  // intermediate   signed; final   signed\n</pre></div>\n</div>\n<p>The intermediate result is optionally scaled via right-shift; this result is sign-extended if the\nfinal result is signed, and zero-extended otherwise.</p>\n<p>The final result is optionally saturated to the appropriate 32-bit range based on the type (signed\nor unsigned) of the final result.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = 
partSelectSignExtend( a, atype, asel );\ntb = partSelectSignExtend( b, btype, bsel );\nsignedFinal = isSigned(atype) || isSigned(btype) ||\n                                 (a.negate ^ b.negate) || c.negate;\ntmp[127:0] = ta * tb;\n\nlsb = 0;\nif ( .po )                  {              lsb = 1; } else\nif ( a.negate ^ b.negate )  { tmp = ~tmp;  lsb = 1; } else\nif ( c.negate )             { c   = ~c;    lsb = 1; }\n\nc128[127:0] = (signedFinal) sext32( c ) : zext ( c );\ntmp = tmp + c128 + lsb;\nswitch( scale ) {\n   case .shr7:   result = (tmp &gt;&gt;  7) &amp; 0xffffffffffffffff;\n   case .shr15:  result = (tmp &gt;&gt; 15) &amp; 0xffffffffffffffff;\n}\nif ( .sat ) {\n     if (signedFinal) result = CLAMP(result, S32_MAX, S32_MIN);\n     else             result = CLAMP(result, U32_MAX, U32_MIN);\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vmad</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vmad.s32.s32.u32.sat    r0, r1, r2, -r3;\nvmad.u32.u32.u32.shr15  r0, r1.h0, r2.h0, r3;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer byte/half-word/word multiply-accumulate.\n\nSyntax\n\n// 32-bit scalar operation\n\nvmad.dtype.atype.btype{.sat}{.scale}     d, {-}a{.asel}, {-}b{.bsel},\n\n                                         {-}c;\n\nvmad.dtype.atype.btype.po{.sat}{.scale}  d, a{.asel}, b{.bsel}, c;\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.scale = { .shr7, .shr15 };\n\nDescription\n\nCalculate (a*b) + c, with optional operand negates, plus one mode, and scaling.\n\nThe source operands support optional negation with some restrictions. Although PTX syntax allows\n\nseparate negation of the a and b operands, internally this is represented as negation of the\n\nproduct (a*b). That is, (a*b) is negated if and only if exactly one of a or b is\n\nnegated. PTX allows negation of either (a*b) or c.\n\nThe plus one mode (.po) computes (a*b) + c + 1, which is used in computing averages. Source\n\noperands may not be negated in .po mode.\n\nThe intermediate result of (a*b) is unsigned if atype and btype are unsigned and the product\n\n(a*b) is not negated; otherwise, the intermediate result is signed. 
Input c has the same\n\nsign as the intermediate result.\n\nThe final result is unsigned if the intermediate result is unsigned and c is not negated.\n\nDepending on the sign of the a and b operands, and the operand negates, the following\n\ncombinations of operands are supported for VMAD:\n\n (u32 * u32) + u32  // intermediate unsigned; final unsigned\n\n-(u32 * u32) + s32  // intermediate   signed; final   signed\n\n (u32 * u32) - u32  // intermediate unsigned; final   signed\n\n (u32 * s32) + s32  // intermediate   signed; final   signed\n\n-(u32 * s32) + s32  // intermediate   signed; final   signed\n\n (u32 * s32) - s32  // intermediate   signed; final   signed\n\n (s32 * u32) + s32  // intermediate   signed; final   signed\n\n-(s32 * u32) + s32  // intermediate   signed; final   signed\n\n (s32 * u32) - s32  // intermediate   signed; final   signed\n\n (s32 * s32) + s32  // intermediate   signed; final   signed\n\n-(s32 * s32) + s32  // intermediate   signed; final   signed\n\n (s32 * s32) - s32  // intermediate   signed; final   signed\n\nThe intermediate result is optionally scaled via right-shift; this result is sign-extended if the\n\nfinal result is signed, and zero-extended otherwise.\n\nThe final result is optionally saturated to the appropriate 32-bit range based on the type (signed\n\nor unsigned) of the final result.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a, atype, asel );\n\ntb = partSelectSignExtend( b, btype, bsel );\n\nsignedFinal = isSigned(atype) || isSigned(btype) ||\n\n                                 (a.negate ^ b.negate) || c.negate;\n\ntmp[127:0] = ta * tb;\n\nlsb = 0;\n\nif ( .po )                  {              lsb = 1; } else\n\nif ( a.negate ^ b.negate )  { tmp = ~tmp;  lsb = 1; } else\n\nif ( c.negate )             { c   = ~c;    lsb = 1; }\n\nc128[127:0] = (signedFinal) sext32( c ) : zext ( c );\n\ntmp = tmp + c128 + lsb;\n\nswitch( 
scale ) {\n\n   case .shr7:   result = (tmp >>  7) & 0xffffffffffffffff;\n\n   case .shr15:  result = (tmp >> 15) & 0xffffffffffffffff;\n\n}\n\nif ( .sat ) {\n\n     if (signedFinal) result = CLAMP(result, S32_MAX, S32_MIN);\n\n     else             result = CLAMP(result, U32_MAX, U32_MIN);\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvmad requires sm_20 or higher.\n\nExamples\n\nvmad.s32.s32.u32.sat    r0, r1, r2, -r3;\n\nvmad.u32.u32.u32.shr15  r0, r1.h0, r2.h0, r3;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vmad"
            };

        case "vmax":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax\" target=\"_blank\" rel=\"noopener noreferrer\">vmax <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax</h1><section id=\"scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax\">\n\n\n<p>Integer byte/half-word/word addition/subtraction.</p>\n<p><strong>vabsdiff</strong></p>\n<p>Integer byte/half-word/word absolute value of difference.</p>\n<p><strong>vmin, vmax</strong></p>\n<p>Integer byte/half-word/word minimum/maximum.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// 32-bit scalar operation, with optional secondary operation\nvop.dtype.atype.btype{.sat}       d, a{.asel}, b{.bsel};\nvop.dtype.atype.btype{.sat}.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\nvop.dtype.atype.btype{.sat}  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vadd, vsub, vabsdiff, vmin, vmax };\n.dtype = .atype = .btype = { .u32, .s32 };\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.op2   = { .add, .min, .max };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Perform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = partSelectSignExtend( a, atype, asel );\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n    case vadd:     tmp = ta + tb;\n    case vsub:     tmp = ta - tb;\n    case vabsdiff: tmp = | ta - tb |;\n    case vmin:     tmp = 
MIN( ta, tb );\n    case vmax:     tmp = MAX( ta, tb );\n}\n// saturate, taking into account destination type and merge operations\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vadd</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vsub</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vabsdiff</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmin</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmax</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vadd.s32.u32.s32.sat      r1, r2.b0, r3.h0;\nvsub.s32.s32.u32.sat      r1, r2.h1, r3.h1;\nvabsdiff.s32.s32.s32.sat  r1.h0, r2.b0, r3.b2, c;\nvmin.s32.s32.s32.sat.add  r1, r2, r3, c;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer byte/half-word/word addition/subtraction.\n\nvabsdiff\n\nInteger byte/half-word/word absolute value of difference.\n\nvmin, vmax\n\nInteger byte/half-word/word minimum/maximum.\n\nSyntax\n\n// 32-bit scalar operation, with optional secondary operation\n\nvop.dtype.atype.btype{.sat}       d, a{.asel}, b{.bsel};\n\nvop.dtype.atype.btype{.sat}.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\n\nvop.dtype.atype.btype{.sat}  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vadd, vsub, vabsdiff, vmin, vmax };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.op2   = { .add, .min, .max };\n\nDescription\n\nPerform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a, atype, asel );\n\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n\n    case vadd:     tmp = ta + tb;\n\n    case vsub:     tmp = ta - tb;\n\n    case vabsdiff: tmp = | ta - tb |;\n\n    case vmin:     tmp = MIN( ta, tb );\n\n    case vmax:     tmp = MAX( ta, tb );\n\n}\n\n// saturate, taking into account destination type and merge operations\n\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\n\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\n\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvadd, vsub, vabsdiff, vmin, vmax require sm_20 or higher.\n\nExamples\n\nvadd.s32.u32.s32.sat      r1, r2.b0, r3.h0;\n\nvsub.s32.s32.u32.sat      r1, r2.h1, r3.h1;\n\nvabsdiff.s32.s32.s32.sat  r1.h0, r2.b0, r3.b2, c;\n\nvmin.s32.s32.s32.sat.add  r1, r2, r3, c;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax"
            };

        case "vmax2":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2\" target=\"_blank\" rel=\"noopener noreferrer\">vmax2 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2</h1><section id=\"simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2\">\n\n\n<p>Integer dual half-word SIMD addition/subtraction.</p>\n<p><strong>vavrg2</strong></p>\n<p>Integer dual half-word SIMD average.</p>\n<p><strong>vabsdiff2</strong></p>\n<p>Integer dual half-word SIMD absolute value of difference.</p>\n<p><strong>vmin2, vmax2</strong></p>\n<p>Integer dual half-word SIMD minimum/maximum.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// SIMD instruction with secondary SIMD merge operation\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n   .asel defaults to .h10\n   .bsel defaults to .h32\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Two-way SIMD parallel arithmetic operation with secondary operation.</p>\n<p>Elements of each dual half-word source to the operation are selected from any of the four half-words\nin the two source operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> using the <code class=\"docutils literal notranslate\"><span 
class=\"pre\">asel</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">bsel</span></code> modifiers.</p>\n<p>The selected half-words are then operated on in parallel.</p>\n<p>The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.</p>\n<p>For instructions with a secondary SIMD merge operation:</p>\n<ul class=\"simple\">\n<li><p>For half-word positions indicated in mask, the selected half-word results are copied into\ndestination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. For all other positions, the corresponding half-word from source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>\nis copied to <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p>For instructions with a secondary accumulate operation:</p>\n<ul class=\"simple\">\n<li><p>For half-word positions indicated in mask, the selected half-word results are added to operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>, producing a result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract pairs of half-words and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_2( a, b, .asel, .atype );\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i&lt;2; i++) {\n    switch ( vop2 ) {\n       case vadd2:             t[i] = Va[i] + Vb[i];\n       case vsub2:             t[i] = Va[i] - Vb[i];\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) &gt;= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) &gt;&gt; 1;\n                           
    } else {\n                                   t[i] = ( Va[i] + Vb[i] ) &gt;&gt; 1;\n                               }\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i&lt;2; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i&lt;2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vadd2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vsub2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">varvg2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vabsdiff2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmin2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmax2</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vadd2.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer dual half-word SIMD addition/subtraction.\n\nvavrg2\n\nInteger dual half-word SIMD average.\n\nvabsdiff2\n\nInteger dual half-word SIMD absolute value of difference.\n\nvmin2, vmax2\n\nInteger dual half-word SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n\n   .asel defaults to .h10\n\n   .bsel defaults to .h32\n\nDescription\n\nTwo-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each dual half-word source to the operation are selected from any of the four half-words\n\nin the two source operands a and b using the asel and bsel modifiers.\n\nThe selected half-words are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor half-word positions indicated in mask, the selected half-word results are copied into\n\ndestination d. 
For all other positions, the corresponding half-word from source operand c\n\nis copied to d.\n\nFor instructions with a secondary accumulate operation:\n\nFor half-word positions indicated in mask, the selected half-word results are added to operand\n\nc, producing a result in d.\n\nSemantics\n\n// extract pairs of half-words and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_2( a, b, .asel, .atype );\n\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n\n    switch ( vop2 ) {\n\n       case vadd2:             t[i] = Va[i] + Vb[i];\n\n       case vsub2:             t[i] = Va[i] - Vb[i];\n\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n                               } else {\n\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n                               }\n\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n\n    }\n\n    if (.sat) {\n\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n\n    }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n    d = c;\n\n    for (i=0; i<2; i++) {  d += mask[i] ? t[i] : 0;  }\n\n} else {\n\n    d = 0;\n\n    for (i=0; i<2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd2, vsub2, varvg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.\n\nExamples\n\nvadd2.s32.s32.u32.sat  r1, r2, r3, r1;\n\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\n\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2"
            };

        case "vmax4":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4\" target=\"_blank\" rel=\"noopener noreferrer\">vmax4 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4</h1><section id=\"simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4\">\n\n\n<p>Integer quad byte SIMD addition/subtraction.</p>\n<p><strong>vavrg4</strong></p>\n<p>Integer quad byte SIMD average.</p>\n<p><strong>vabsdiff4</strong></p>\n<p>Integer quad byte SIMD absolute value of difference.</p>\n<p><strong>vmin4, vmax4</strong></p>\n<p>Integer quad byte SIMD minimum/maximum.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// SIMD instruction with secondary SIMD merge operation\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\nvop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .b0,\n           .b1, .b10\n           .b2, .b20, .b21, .b210,\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n    defaults to .b3210\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n   .asel defaults to .b3210\n   .bsel defaults to .b7654\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Four-way SIMD parallel arithmetic operation with secondary operation.</p>\n<p>Elements of each quad byte source to the operation are selected from any of the eight bytes in the\ntwo source operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span 
class=\"pre\">b</span></code> using the <code class=\"docutils literal notranslate\"><span class=\"pre\">asel</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">bsel</span></code> modifiers.</p>\n<p>The selected bytes are then operated on in parallel.</p>\n<p>The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.</p>\n<p>For instructions with a secondary SIMD merge operation:</p>\n<ul class=\"simple\">\n<li><p>For byte positions indicated in mask, the selected byte results are copied into destination\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. For all other positions, the corresponding byte from source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is copied to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p>For instructions with a secondary accumulate operation:</p>\n<ul class=\"simple\">\n<li><p>For byte positions indicated in mask, the selected byte results are added to operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>,\nproducing a result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract quads of bytes and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_4( a, b, .asel, .atype );\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\nVc = extractAndSignExt_4( c );\nfor (i=0; i&lt;4; i++) {\n    switch ( vop4 ) {\n        case vadd4:            t[i] = Va[i] + Vb[i];\n        case vsub4:            t[i] = Va[i] - Vb[i];\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) &gt;= 0 ) {\n                                   t[i] = ( Va[i] 
+ Vb[i] + 1 ) &gt;&gt; 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) &gt;&gt; 1;\n                               }\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i&lt;4; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i&lt;4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vadd4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vsub4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">varvg4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vabsdiff4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmin4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmax4</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vadd4.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer quad byte SIMD addition/subtraction.\n\nvavrg4\n\nInteger quad byte SIMD average.\n\nvabsdiff4\n\nInteger quad byte SIMD absolute value of difference.\n\nvmin4, vmax4\n\nInteger quad byte SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\nvop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask  = { .b0,\n\n           .b1, .b10\n\n           .b2, .b20, .b21, .b210,\n\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n\n    defaults to .b3210\n\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n\n   .asel defaults to .b3210\n\n   .bsel defaults to .b7654\n\nDescription\n\nFour-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each quad byte source to the operation are selected from any of the eight bytes in the\n\ntwo source operands a and b using the asel and bsel modifiers.\n\nThe selected bytes are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor byte positions indicated in mask, the selected byte results are copied into destination\n\nd. 
For all other positions, the corresponding byte from source operand c is copied to\n\nd.\n\nFor instructions with a secondary accumulate operation:\n\nFor byte positions indicated in mask, the selected byte results are added to operand c,\n\nproducing a result in d.\n\nSemantics\n\n// extract quads of bytes and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_4( a, b, .asel, .atype );\n\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_4( c );\n\nfor (i=0; i<4; i++) {\n\n    switch ( vop4 ) {\n\n        case vadd4:            t[i] = Va[i] + Vb[i];\n\n        case vsub4:            t[i] = Va[i] - Vb[i];\n\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n                               } else {\n\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n                               }\n\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n\n    }\n\n    if (.sat) {\n\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n\n    }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n    d = c;\n\n    for (i=0; i<4; i++) {  d += mask[i] ? t[i] : 0;  }\n\n} else {\n\n    d = 0;\n\n    for (i=0; i<4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd4, vsub4, varvg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.\n\nExamples\n\nvadd4.s32.s32.u32.sat  r1, r2, r3, r1;\n\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\n\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4"
            };

        case "vmin":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax\" target=\"_blank\" rel=\"noopener noreferrer\">vmin <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax</h1><section id=\"scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax\">\n\n\n<p>Integer byte/half-word/word addition/subtraction.</p>\n<p><strong>vabsdiff</strong></p>\n<p>Integer byte/half-word/word absolute value of difference.</p>\n<p><strong>vmin, vmax</strong></p>\n<p>Integer byte/half-word/word minimum/maximum.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// 32-bit scalar operation, with optional secondary operation\nvop.dtype.atype.btype{.sat}       d, a{.asel}, b{.bsel};\nvop.dtype.atype.btype{.sat}.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\nvop.dtype.atype.btype{.sat}  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vadd, vsub, vabsdiff, vmin, vmax };\n.dtype = .atype = .btype = { .u32, .s32 };\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.op2   = { .add, .min, .max };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Perform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = partSelectSignExtend( a, atype, asel );\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n    case vadd:     tmp = ta + tb;\n    case vsub:     tmp = ta - tb;\n    case vabsdiff: tmp = | ta - tb |;\n    case vmin:     tmp = 
MIN( ta, tb );\n    case vmax:     tmp = MAX( ta, tb );\n}\n// saturate, taking into account destination type and merge operations\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vadd</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vsub</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vabsdiff</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmin</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmax</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vadd.s32.u32.s32.sat      r1, r2.b0, r3.h0;\nvsub.s32.s32.u32.sat      r1, r2.h1, r3.h1;\nvabsdiff.s32.s32.s32.sat  r1.h0, r2.b0, r3.b2, c;\nvmin.s32.s32.s32.sat.add  r1, r2, r3, c;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer byte/half-word/word addition/subtraction.\n\nvabsdiff\n\nInteger byte/half-word/word absolute value of difference.\n\nvmin, vmax\n\nInteger byte/half-word/word minimum/maximum.\n\nSyntax\n\n// 32-bit scalar operation, with optional secondary operation\n\nvop.dtype.atype.btype{.sat}       d, a{.asel}, b{.bsel};\n\nvop.dtype.atype.btype{.sat}.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\n\nvop.dtype.atype.btype{.sat}  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vadd, vsub, vabsdiff, vmin, vmax };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.op2   = { .add, .min, .max };\n\nDescription\n\nPerform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a, atype, asel );\n\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n\n    case vadd:     tmp = ta + tb;\n\n    case vsub:     tmp = ta - tb;\n\n    case vabsdiff: tmp = | ta - tb |;\n\n    case vmin:     tmp = MIN( ta, tb );\n\n    case vmax:     tmp = MAX( ta, tb );\n\n}\n\n// saturate, taking into account destination type and merge operations\n\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\n\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\n\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvadd, vsub, vabsdiff, vmin, vmax require sm_20 or higher.\n\nExamples\n\nvadd.s32.u32.s32.sat      r1, r2.b0, r3.h0;\n\nvsub.s32.s32.u32.sat      r1, r2.h1, r3.h1;\n\nvabsdiff.s32.s32.s32.sat  r1.h0, r2.b0, r3.b2, c;\n\nvmin.s32.s32.s32.sat.add  r1, r2, r3, c;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax"
            };

        case "vmin2":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2\" target=\"_blank\" rel=\"noopener noreferrer\">vmin2 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2</h1><section id=\"simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2\">\n\n\n<p>Integer dual half-word SIMD addition/subtraction.</p>\n<p><strong>vavrg2</strong></p>\n<p>Integer dual half-word SIMD average.</p>\n<p><strong>vabsdiff2</strong></p>\n<p>Integer dual half-word SIMD absolute value of difference.</p>\n<p><strong>vmin2, vmax2</strong></p>\n<p>Integer dual half-word SIMD minimum/maximum.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// SIMD instruction with secondary SIMD merge operation\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n   .asel defaults to .h10\n   .bsel defaults to .h32\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Two-way SIMD parallel arithmetic operation with secondary operation.</p>\n<p>Elements of each dual half-word source to the operation are selected from any of the four half-words\nin the two source operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> using the <code class=\"docutils literal notranslate\"><span 
class=\"pre\">asel</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">bsel</span></code> modifiers.</p>\n<p>The selected half-words are then operated on in parallel.</p>\n<p>The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.</p>\n<p>For instructions with a secondary SIMD merge operation:</p>\n<ul class=\"simple\">\n<li><p>For half-word positions indicated in mask, the selected half-word results are copied into\ndestination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. For all other positions, the corresponding half-word from source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>\nis copied to <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p>For instructions with a secondary accumulate operation:</p>\n<ul class=\"simple\">\n<li><p>For half-word positions indicated in mask, the selected half-word results are added to operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>, producing a result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract pairs of half-words and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_2( a, b, .asel, .atype );\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i&lt;2; i++) {\n    switch ( vop2 ) {\n       case vadd2:             t[i] = Va[i] + Vb[i];\n       case vsub2:             t[i] = Va[i] - Vb[i];\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) &gt;= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) &gt;&gt; 1;\n                           
    } else {\n                                   t[i] = ( Va[i] + Vb[i] ) &gt;&gt; 1;\n                               }\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i&lt;2; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i&lt;2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vadd2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vsub2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">varvg2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vabsdiff2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmin2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmax2</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vadd2.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer dual half-word SIMD addition/subtraction.\n\nvavrg2\n\nInteger dual half-word SIMD average.\n\nvabsdiff2\n\nInteger dual half-word SIMD absolute value of difference.\n\nvmin2, vmax2\n\nInteger dual half-word SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n\n   .asel defaults to .h10\n\n   .bsel defaults to .h32\n\nDescription\n\nTwo-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each dual half-word source to the operation are selected from any of the four half-words\n\nin the two source operands a and b using the asel and bsel modifiers.\n\nThe selected half-words are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor half-word positions indicated in mask, the selected half-word results are copied into\n\ndestination d. 
For all other positions, the corresponding half-word from source operand c\n\nis copied to d.\n\nFor instructions with a secondary accumulate operation:\n\nFor half-word positions indicated in mask, the selected half-word results are added to operand\n\nc, producing a result in d.\n\nSemantics\n\n// extract pairs of half-words and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_2( a, b, .asel, .atype );\n\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n\n    switch ( vop2 ) {\n\n       case vadd2:             t[i] = Va[i] + Vb[i];\n\n       case vsub2:             t[i] = Va[i] - Vb[i];\n\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n                               } else {\n\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n                               }\n\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n\n    }\n\n    if (.sat) {\n\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n\n    }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n    d = c;\n\n    for (i=0; i<2; i++) {  d += mask[i] ? t[i] : 0;  }\n\n} else {\n\n    d = 0;\n\n    for (i=0; i<2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd2, vsub2, varvg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.\n\nExamples\n\nvadd2.s32.s32.u32.sat  r1, r2, r3, r1;\n\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\n\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2"
            };

        // Documentation record returned for the `vmin4` opcode. The anchor, heading and
        // body text are the combined PTX section covering vadd4, vsub4, vavrg4,
        // vabsdiff4, vmin4 and vmax4, so the content is shared rather than specific
        // to vmin4; only the link label names vmin4.
        //   html    - link to the online PTX reference followed by the section markup
        //   tooltip - the same section as plain text, ending with a trailing ellipsis
        //   url     - address of the section on docs.nvidia.com
        // NOTE(review): the Target ISA Notes sentence in both html and tooltip spells
        // the average instruction as varvg4, while every other mention uses vavrg4.
        // It looks like a typo carried over from the scraped documentation. The file
        // path places this under generated/, so presumably the fix belongs in the
        // generator or upstream rather than in this file - confirm.
        case "vmin4":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4\" target=\"_blank\" rel=\"noopener noreferrer\">vmin4 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4</h1><section id=\"simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4\">\n\n\n<p>Integer quad byte SIMD addition/subtraction.</p>\n<p><strong>vavrg4</strong></p>\n<p>Integer quad byte SIMD average.</p>\n<p><strong>vabsdiff4</strong></p>\n<p>Integer quad byte SIMD absolute value of difference.</p>\n<p><strong>vmin4, vmax4</strong></p>\n<p>Integer quad byte SIMD minimum/maximum.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// SIMD instruction with secondary SIMD merge operation\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\nvop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .b0,\n           .b1, .b10\n           .b2, .b20, .b21, .b210,\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n    defaults to .b3210\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n   .asel defaults to .b3210\n   .bsel defaults to .b7654\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Four-way SIMD parallel arithmetic operation with secondary operation.</p>\n<p>Elements of each quad byte source to the operation are selected from any of the eight bytes in the\ntwo source operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span 
class=\"pre\">b</span></code> using the <code class=\"docutils literal notranslate\"><span class=\"pre\">asel</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">bsel</span></code> modifiers.</p>\n<p>The selected bytes are then operated on in parallel.</p>\n<p>The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.</p>\n<p>For instructions with a secondary SIMD merge operation:</p>\n<ul class=\"simple\">\n<li><p>For byte positions indicated in mask, the selected byte results are copied into destination\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. For all other positions, the corresponding byte from source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is copied to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p>For instructions with a secondary accumulate operation:</p>\n<ul class=\"simple\">\n<li><p>For byte positions indicated in mask, the selected byte results are added to operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>,\nproducing a result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract quads of bytes and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_4( a, b, .asel, .atype );\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\nVc = extractAndSignExt_4( c );\nfor (i=0; i&lt;4; i++) {\n    switch ( vop4 ) {\n        case vadd4:            t[i] = Va[i] + Vb[i];\n        case vsub4:            t[i] = Va[i] - Vb[i];\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) &gt;= 0 ) {\n                                   t[i] = ( Va[i] 
+ Vb[i] + 1 ) &gt;&gt; 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) &gt;&gt; 1;\n                               }\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i&lt;4; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i&lt;4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vadd4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vsub4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">varvg4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vabsdiff4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmin4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmax4</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vadd4.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer quad byte SIMD addition/subtraction.\n\nvavrg4\n\nInteger quad byte SIMD average.\n\nvabsdiff4\n\nInteger quad byte SIMD absolute value of difference.\n\nvmin4, vmax4\n\nInteger quad byte SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\nvop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask  = { .b0,\n\n           .b1, .b10\n\n           .b2, .b20, .b21, .b210,\n\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n\n    defaults to .b3210\n\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n\n   .asel defaults to .b3210\n\n   .bsel defaults to .b7654\n\nDescription\n\nFour-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each quad byte source to the operation are selected from any of the eight bytes in the\n\ntwo source operands a and b using the asel and bsel modifiers.\n\nThe selected bytes are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor byte positions indicated in mask, the selected byte results are copied into destination\n\nd. 
For all other positions, the corresponding byte from source operand c is copied to\n\nd.\n\nFor instructions with a secondary accumulate operation:\n\nFor byte positions indicated in mask, the selected byte results are added to operand c,\n\nproducing a result in d.\n\nSemantics\n\n// extract quads of bytes and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_4( a, b, .asel, .atype );\n\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_4( c );\n\nfor (i=0; i<4; i++) {\n\n    switch ( vop4 ) {\n\n        case vadd4:            t[i] = Va[i] + Vb[i];\n\n        case vsub4:            t[i] = Va[i] - Vb[i];\n\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n                               } else {\n\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n                               }\n\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n\n    }\n\n    if (.sat) {\n\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n\n    }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n    d = c;\n\n    for (i=0; i<4; i++) {  d += mask[i] ? t[i] : 0;  }\n\n} else {\n\n    d = 0;\n\n    for (i=0; i<4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd4, vsub4, varvg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.\n\nExamples\n\nvadd4.s32.s32.u32.sat  r1, r2, r3, r1;\n\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\n\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4"
            };

        // Documentation record returned for the `vote` opcode. The linked section, the
        // link label and all of the body text describe vote.sync.
        //   html    - link to the online PTX reference followed by the section markup
        //   tooltip - the same section as plain text, ending with a trailing ellipsis
        //   url     - address of the vote.sync section on docs.nvidia.com
        // NOTE(review): in the tooltip each reduction-mode name runs straight into its
        // definition (.allTrue, .anyTrue, .uniTrue). The html holds them as dt/dd pairs
        // with no text in between, so presumably the html-to-text step drops the
        // separator - confirm in the generator rather than patching this file.
        case "vote":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync\" target=\"_blank\" rel=\"noopener noreferrer\">vote.sync <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Parallel Synchronization and Communication Instructions: vote.sync</h1><section id=\"parallel-synchronization-and-communication-instructions-vote-sync\">\n\n\n<p>Vote across thread group.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vote.sync.mode.pred  d, {!}a, membermask;\nvote.sync.ballot.b32 d, {!}a, membermask;  // 'ballot' form, returns bitmask\n\n.mode = { .all, .any, .uni };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vote.sync</span></code> will cause executing thread to wait until all non-exited threads corresponding to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> have executed <code class=\"docutils literal notranslate\"><span class=\"pre\">vote.sync</span></code> with the same qualifiers and same <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> value\nbefore resuming execution.</p>\n<p>Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> specifies a 32-bit integer which is a mask indicating threads participating\nin this instruction where the bit position corresponds to thread\u2019s <code class=\"docutils literal notranslate\"><span class=\"pre\">laneid</span></code>. 
Operand <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> is a\npredicate register.</p>\n<p>In the <em>mode</em> form, <code class=\"docutils literal notranslate\"><span class=\"pre\">vote.sync</span></code> performs a reduction of the source predicate across all non-exited\nthreads in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>. The destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is a predicate register and its value is\nthe same across all threads in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>.</p>\n<p>The reduction modes are:</p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.all</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if source predicate is <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> for all non-exited threads in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>. Negate the\nsource predicate to compute <code class=\"docutils literal notranslate\"><span class=\"pre\">.none</span></code>.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.any</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if source predicate is <code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> for some thread in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>. 
Negate the source\npredicate to compute <code class=\"docutils literal notranslate\"><span class=\"pre\">.not_all</span></code>.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">.uni</span></code></dt><dd><p><code class=\"docutils literal notranslate\"><span class=\"pre\">True</span></code> if source predicate has the same value in all non-exited threads in\n<code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>. Negating the source predicate also computes <code class=\"docutils literal notranslate\"><span class=\"pre\">.uni</span></code>.</p>\n</dd>\n</dl>\n<p>In the <em>ballot</em> form, the destination operand <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> is a <code class=\"docutils literal notranslate\"><span class=\"pre\">.b32</span></code> register. In this form,\n<code class=\"docutils literal notranslate\"><span class=\"pre\">vote.sync.ballot.b32</span></code> simply copies the predicate from each thread in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> into the\ncorresponding bit position of destination register <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>, where the bit position corresponds to the\nthread\u2019s lane id.</p>\n<p>A thread not specified in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> will contribute a 0 for its entry in\n<code class=\"docutils literal notranslate\"><span class=\"pre\">vote.sync.ballot.b32</span></code>.</p>\n<p>The behavior of <code class=\"docutils literal notranslate\"><span class=\"pre\">vote.sync</span></code> is undefined if the executing thread is not in the <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code>.</p>\n<div class=\"admonition note\">\n<p class=\"admonition-title\">Note</p>\n<p>For .target <code class=\"docutils literal notranslate\"><span 
class=\"pre\">sm_6x</span></code> or below, all threads in <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> must execute the same <code class=\"docutils literal notranslate\"><span class=\"pre\">vote.sync</span></code>\ninstruction in convergence, and only threads belonging to some <code class=\"docutils literal notranslate\"><span class=\"pre\">membermask</span></code> can be active when\nthe <code class=\"docutils literal notranslate\"><span class=\"pre\">vote.sync</span></code> instruction is executed. Otherwise, the behavior is undefined.</p>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 6.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vote.sync.all.pred    p,q,0xffffffff;\nvote.sync.ballot.b32  r1,p,0xffffffff;  // get 'ballot' across warp\n</pre></div>\n</div>\n</section>",
                "tooltip": "Vote across thread group.\n\nSyntax\n\nvote.sync.mode.pred  d, {!}a, membermask;\n\nvote.sync.ballot.b32 d, {!}a, membermask;  // 'ballot' form, returns bitmask\n\n.mode = { .all, .any, .uni };\n\nDescription\n\nvote.sync will cause executing thread to wait until all non-exited threads corresponding to\n\nmembermask have executed vote.sync with the same qualifiers and same membermask value\n\nbefore resuming execution.\n\nOperand membermask specifies a 32-bit integer which is a mask indicating threads participating\n\nin this instruction where the bit position corresponds to thread\u2019s laneid. Operand a is a\n\npredicate register.\n\nIn the mode form, vote.sync performs a reduction of the source predicate across all non-exited\n\nthreads in membermask. The destination operand d is a predicate register and its value is\n\nthe same across all threads in membermask.\n\nThe reduction modes are:\n\n.allTrue if source predicate is True for all non-exited threads in membermask. Negate the\n\nsource predicate to compute .none.\n\n.anyTrue if source predicate is True for some thread in membermask. Negate the source\n\npredicate to compute .not_all.\n\n.uniTrue if source predicate has the same value in all non-exited threads in\n\nmembermask. Negating the source predicate also computes .uni.\n\nIn the ballot form, the destination operand d is a .b32 register. 
In this form,\n\nvote.sync.ballot.b32 simply copies the predicate from each thread in membermask into the\n\ncorresponding bit position of destination register d, where the bit position corresponds to the\n\nthread\u2019s lane id.\n\nA thread not specified in membermask will contribute a 0 for its entry in\n\nvote.sync.ballot.b32.\n\nThe behavior of vote.sync is undefined if the executing thread is not in the membermask.\n\nNote\n\nFor .target sm_6x or below, all threads in membermask must execute the same vote.sync\n\ninstruction in convergence, and only threads belonging to some membermask can be active when\n\nthe vote.sync instruction is executed. Otherwise, the behavior is undefined.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 6.0.\n\nTarget ISA Notes\n\nRequires sm_30 or higher.\n\nExamples\n\nvote.sync.all.pred    p,q,0xffffffff;\n\nvote.sync.ballot.b32  r1,p,0xffffffff;  // get 'ballot' across warp\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync"
            };

        // Documentation record returned for the `vset` opcode, taken from the PTX
        // section titled Scalar Video Instructions: vset.
        //   html    - link to the online PTX reference followed by the section markup
        //   tooltip - the same section as plain text, ending with a trailing ellipsis
        //   url     - address of the vset section on docs.nvidia.com
        case "vset":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vset\" target=\"_blank\" rel=\"noopener noreferrer\">vset <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Scalar Video Instructions: vset</h1><section id=\"scalar-video-instructions-vset\">\n\n\n<p>Integer byte/half-word/word comparison.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// 32-bit scalar operation, with optional secondary operation\nvset.atype.btype.cmp       d, a{.asel}, b{.bsel};\nvset.atype.btype.cmp.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\nvset.atype.btype.cmp  d.dsel, a{.asel}, b{.bsel}, c;\n\n.atype = .btype = { .u32, .s32 };\n.cmp   = { .eq, .ne, .lt, .le, .gt, .ge };\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.op2   = { .add, .min, .max };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compare input values using specified comparison, with optional secondary arithmetic operation or\nsubword data merge.</p>\n<p>The intermediate result of the comparison is always unsigned, and therefore destination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and\noperand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> are also unsigned.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = partSelectSignExtend( a, atype, asel );\ntb = partSelectSignExtend( b, btype, bsel );\ntmp = compare( ta, tb, cmp ) ? 
1 : 0;\nd = optSecondaryOp( op2, tmp, c );    // optional secondary operation\nd = optMerge( dsel, tmp, c );         // optional merge with c operand\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vset</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vset.s32.u32.lt    r1, r2, r3;\nvset.u32.u32.ne    r1, r2, r3.h1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer byte/half-word/word comparison.\n\nSyntax\n\n// 32-bit scalar operation, with optional secondary operation\n\nvset.atype.btype.cmp       d, a{.asel}, b{.bsel};\n\nvset.atype.btype.cmp.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\n\nvset.atype.btype.cmp  d.dsel, a{.asel}, b{.bsel}, c;\n\n.atype = .btype = { .u32, .s32 };\n\n.cmp   = { .eq, .ne, .lt, .le, .gt, .ge };\n\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.op2   = { .add, .min, .max };\n\nDescription\n\nCompare input values using specified comparison, with optional secondary arithmetic operation or\n\nsubword data merge.\n\nThe intermediate result of the comparison is always unsigned, and therefore destination d and\n\noperand c are also unsigned.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a, atype, asel );\n\ntb = partSelectSignExtend( b, btype, bsel );\n\ntmp = compare( ta, tb, cmp ) ? 1 : 0;\n\nd = optSecondaryOp( op2, tmp, c );    // optional secondary operation\n\nd = optMerge( dsel, tmp, c );         // optional merge with c operand\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvset requires sm_20 or higher.\n\nExamples\n\nvset.s32.u32.lt    r1, r2, r3;\n\nvset.u32.u32.ne    r1, r2, r3.h1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vset"
            };

        case "vset2":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset2\" target=\"_blank\" rel=\"noopener noreferrer\">vset2 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>SIMD Video Instructions: vset2</h1><section id=\"simd-video-instructions-vset2\">\n\n\n<p>Integer dual half-word SIMD comparison.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// SIMD instruction with secondary SIMD merge operation\nvset2.atype.btype.cmp  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvset2.atype.btype.cmp.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n.atype = .btype = { .u32, .s32 };\n.cmp   = { .eq, .ne, .lt, .le, .gt, .ge };\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n   .asel defaults to .h10\n   .bsel defaults to .h32\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Two-way SIMD parallel comparison with secondary operation.</p>\n<p>Elements of each dual half-word source to the operation are selected from any of the four half-words\nin the two source operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> using the <code class=\"docutils literal notranslate\"><span class=\"pre\">asel</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">bsel</span></code> modifiers.</p>\n<p>The selected half-words are then compared in parallel.</p>\n<p>The intermediate result of the comparison is always unsigned, and therefore the half-words of\ndestination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code> and operand <code class=\"docutils literal 
notranslate\"><span class=\"pre\">c</span></code> are also unsigned.</p>\n<p>For instructions with a secondary SIMD merge operation:</p>\n<ul class=\"simple\">\n<li><p>For half-word positions indicated in mask, the selected half-word results are copied into\ndestination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. For all other positions, the corresponding half-word from source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>\nis copied to <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p>For instructions with a secondary accumulate operation:</p>\n<ul class=\"simple\">\n<li><p>For half-word positions indicated in mask, the selected half-word results are added to operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>, producing <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract pairs of half-words and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_2( a, b, .asel, .atype );\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\nVc = extractAndSignExt_2( c );\nfor (i=0; i&lt;2; i++) {\n    t[i] = compare( Va[i], Vb[i], .cmp ) ? 1 : 0;\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i&lt;2; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i&lt;2; i++)  {  d |= mask[i] ? 
t[i] : Vc[i];  }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vset2</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vset2.s32.u32.lt      r1, r2, r3, r0;\nvset2.u32.u32.ne.add  r1, r2, r3, r0;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer dual half-word SIMD comparison.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvset2.atype.btype.cmp  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvset2.atype.btype.cmp.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n.atype = .btype = { .u32, .s32 };\n\n.cmp   = { .eq, .ne, .lt, .le, .gt, .ge };\n\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n\n   .asel defaults to .h10\n\n   .bsel defaults to .h32\n\nDescription\n\nTwo-way SIMD parallel comparison with secondary operation.\n\nElements of each dual half-word source to the operation are selected from any of the four half-words\n\nin the two source operands a and b using the asel and bsel modifiers.\n\nThe selected half-words are then compared in parallel.\n\nThe intermediate result of the comparison is always unsigned, and therefore the half-words of\n\ndestination d and operand c are also unsigned.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor half-word positions indicated in mask, the selected half-word results are copied into\n\ndestination d. For all other positions, the corresponding half-word from source operand b\n\nis copied to d.\n\nFor instructions with a secondary accumulate operation:\n\nFor half-word positions indicated in mask, the selected half-word results are added to operand\n\nc, producing a result in d.\n\nSemantics\n\n// extract pairs of half-words and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_2( a, b, .asel, .atype );\n\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n\n    t[i] = compare( Va[i], Vb[i], .cmp ) ? 1 : 0;\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n    d = c;\n\n    for (i=0; i<2; i++) {  d += mask[i] ? 
t[i] : 0;  }\n\n} else {\n\n    d = 0;\n\n    for (i=0; i<2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvset2 requires sm_30 or higher.\n\nExamples\n\nvset2.s32.u32.lt      r1, r2, r3, r0;\n\nvset2.u32.u32.ne.add  r1, r2, r3, r0;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset2"
            };

        case "vset4":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset4\" target=\"_blank\" rel=\"noopener noreferrer\">vset4 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>SIMD Video Instructions: vset4</h1><section id=\"simd-video-instructions-vset4\">\n\n\n<p>Integer quad byte SIMD comparison.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// SIMD instruction with secondary SIMD merge operation\nvset4.atype.btype.cmp  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvset4.atype.btype.cmp.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n.atype = .btype = { .u32, .s32 };\n.cmp   = { .eq, .ne, .lt, .le, .gt, .ge };\n.mask  = { .b0,\n           .b1, .b10\n           .b2, .b20, .b21, .b210,\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n    defaults to .b3210\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n   .asel defaults to .b3210\n   .bsel defaults to .b7654\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Four-way SIMD parallel comparison with secondary operation.</p>\n<p>Elements of each quad byte source to the operation are selected from any of the eight bytes in the\ntwo source operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> using the <code class=\"docutils literal notranslate\"><span class=\"pre\">asel</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">bsel</span></code> modifiers.</p>\n<p>The selected bytes are then compared in parallel.</p>\n<p>The intermediate result of the comparison is always unsigned, and therefore the bytes of destination\n<code class=\"docutils literal 
notranslate\"><span class=\"pre\">d</span></code> and operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> are also unsigned.</p>\n<p>For instructions with a secondary SIMD merge operation:</p>\n<ul class=\"simple\">\n<li><p>For byte positions indicated in mask, the selected byte results are copied into destination\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. For all other positions, the corresponding byte from source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> is copied to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p>For instructions with a secondary accumulate operation:</p>\n<ul class=\"simple\">\n<li><p>For byte positions indicated in mask, the selected byte results are added to operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>,\nproducing a result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract quads of bytes and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_4( a, b, .asel, .atype );\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\nVc = extractAndSignExt_4( c );\nfor (i=0; i&lt;4; i++) {\n    t[i] = compare( Va[i], Vb[i], cmp ) ? 1 : 0;\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i&lt;4; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i&lt;4; i++)  {  d |= mask[i] ? 
t[i] : Vc[i];  }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vset4</span></code> requires <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vset4.s32.u32.lt      r1, r2, r3, r0;\nvset4.u32.u32.ne.max  r1, r2, r3, r0;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer quad byte SIMD comparison.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvset4.atype.btype.cmp  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvset4.atype.btype.cmp.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n.atype = .btype = { .u32, .s32 };\n\n.cmp   = { .eq, .ne, .lt, .le, .gt, .ge };\n\n.mask  = { .b0,\n\n           .b1, .b10\n\n           .b2, .b20, .b21, .b210,\n\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n\n    defaults to .b3210\n\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n\n   .asel defaults to .b3210\n\n   .bsel defaults to .b7654\n\nDescription\n\nFour-way SIMD parallel comparison with secondary operation.\n\nElements of each quad byte source to the operation are selected from any of the eight bytes in the\n\ntwo source operands a and b using the asel and bsel modifiers.\n\nThe selected bytes are then compared in parallel.\n\nThe intermediate result of the comparison is always unsigned, and therefore the bytes of destination\n\nd and operand c are also unsigned.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor byte positions indicated in mask, the selected byte results are copied into destination\n\nd. For all other positions, the corresponding byte from source operand b is copied to\n\nd.\n\nFor instructions with a secondary accumulate operation:\n\nFor byte positions indicated in mask, the selected byte results are added to operand c,\n\nproducing a result in d.\n\nSemantics\n\n// extract quads of bytes and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_4( a, b, .asel, .atype );\n\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_4( c );\n\nfor (i=0; i<4; i++) {\n\n    t[i] = compare( Va[i], Vb[i], cmp ) ? 
1 : 0;\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n    d = c;\n\n    for (i=0; i<4; i++) {  d += mask[i] ? t[i] : 0;  }\n\n} else {\n\n    d = 0;\n\n    for (i=0; i<4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvset4 requires sm_30 or higher.\n\nExamples\n\nvset4.s32.u32.lt      r1, r2, r3, r0;\n\nvset4.u32.u32.ne.max  r1, r2, r3, r0;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vset4"
            };

        case "vshl":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr\" target=\"_blank\" rel=\"noopener noreferrer\">vshl <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Scalar Video Instructions: vshl, vshr</h1><section id=\"scalar-video-instructions-vshl-vshr\">\n\n\n<p>Integer byte/half-word/word left/right shift.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// 32-bit scalar operation, with optional secondary operation\nvop.dtype.atype.u32{.sat}.mode       d, a{.asel}, b{.bsel};\nvop.dtype.atype.u32{.sat}.mode.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\nvop.dtype.atype.u32{.sat}.mode  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vshl, vshr };\n.dtype = .atype = { .u32, .s32 };\n.mode  = { .clamp, .wrap };\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.op2   = { .add, .min, .max };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">vshl</span></code></dt><dd><p>Shift <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> left by unsigned amount in <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> with optional saturate, and optional secondary\narithmetic operation or subword data merge. Left shift fills with zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">vshr</span></code></dt><dd><p>Shift <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> right by unsigned amount in <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> with optional saturate, and optional secondary\narithmetic operation or subword data merge. 
Signed shift fills with the sign bit, unsigned shift\nfills with zero.</p>\n</dd>\n</dl>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = partSelectSignExtend( a,atype, asel );\ntb = partSelectSignExtend( b, .u32, bsel );\nif ( mode == .clamp  &amp;&amp; tb &gt; 32 )  tb = 32;\nif ( mode == .wrap )                       tb = tb &amp; 0x1f;\nswitch ( vop ){\n   case vshl:  tmp = ta &lt;&lt; tb;\n   case vshr:  tmp = ta &gt;&gt; tb;\n}\n// saturate, taking into account destination type and merge operations\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vshl</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vshr</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vshl.s32.u32.u32.clamp  r1, r2, r3;\nvshr.u32.u32.u32.wrap   r1, r2, r3.h1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer byte/half-word/word left/right shift.\n\nSyntax\n\n// 32-bit scalar operation, with optional secondary operation\n\nvop.dtype.atype.u32{.sat}.mode       d, a{.asel}, b{.bsel};\n\nvop.dtype.atype.u32{.sat}.mode.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\n\nvop.dtype.atype.u32{.sat}.mode  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vshl, vshr };\n\n.dtype = .atype = { .u32, .s32 };\n\n.mode  = { .clamp, .wrap };\n\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.op2   = { .add, .min, .max };\n\nDescription\n\nvshlShift a left by unsigned amount in b with optional saturate, and optional secondary\n\narithmetic operation or subword data merge. Left shift fills with zero.\n\nvshrShift a right by unsigned amount in b with optional saturate, and optional secondary\n\narithmetic operation or subword data merge. Signed shift fills with the sign bit, unsigned shift\n\nfills with zero.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a,atype, asel );\n\ntb = partSelectSignExtend( b, .u32, bsel );\n\nif ( mode == .clamp  && tb > 32 )  tb = 32;\n\nif ( mode == .wrap )                       tb = tb & 0x1f;\n\nswitch ( vop ){\n\n   case vshl:  tmp = ta << tb;\n\n   case vshr:  tmp = ta >> tb;\n\n}\n\n// saturate, taking into account destination type and merge operations\n\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\n\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\n\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvshl, vshr require sm_20 or higher.\n\nExamples\n\nvshl.s32.u32.u32.clamp  r1, r2, r3;\n\nvshr.u32.u32.u32.wrap   r1, r2, r3.h1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr"
            };

        case "vshr":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr\" target=\"_blank\" rel=\"noopener noreferrer\">vshr <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Scalar Video Instructions: vshl, vshr</h1><section id=\"scalar-video-instructions-vshl-vshr\">\n\n\n<p>Integer byte/half-word/word left/right shift.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// 32-bit scalar operation, with optional secondary operation\nvop.dtype.atype.u32{.sat}.mode       d, a{.asel}, b{.bsel};\nvop.dtype.atype.u32{.sat}.mode.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\nvop.dtype.atype.u32{.sat}.mode  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vshl, vshr };\n.dtype = .atype = { .u32, .s32 };\n.mode  = { .clamp, .wrap };\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.op2   = { .add, .min, .max };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<dl class=\"simple\">\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">vshl</span></code></dt><dd><p>Shift <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> left by unsigned amount in <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> with optional saturate, and optional secondary\narithmetic operation or subword data merge. Left shift fills with zero.</p>\n</dd>\n<dt><code class=\"docutils literal notranslate\"><span class=\"pre\">vshr</span></code></dt><dd><p>Shift <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> right by unsigned amount in <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> with optional saturate, and optional secondary\narithmetic operation or subword data merge. 
Signed shift fills with the sign bit, unsigned shift\nfills with zero.</p>\n</dd>\n</dl>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = partSelectSignExtend( a,atype, asel );\ntb = partSelectSignExtend( b, .u32, bsel );\nif ( mode == .clamp  &amp;&amp; tb &gt; 32 )  tb = 32;\nif ( mode == .wrap )                       tb = tb &amp; 0x1f;\nswitch ( vop ){\n   case vshl:  tmp = ta &lt;&lt; tb;\n   case vshr:  tmp = ta &gt;&gt; tb;\n}\n// saturate, taking into account destination type and merge operations\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vshl</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vshr</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vshl.s32.u32.u32.clamp  r1, r2, r3;\nvshr.u32.u32.u32.wrap   r1, r2, r3.h1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer byte/half-word/word left/right shift.\n\nSyntax\n\n// 32-bit scalar operation, with optional secondary operation\n\nvop.dtype.atype.u32{.sat}.mode       d, a{.asel}, b{.bsel};\n\nvop.dtype.atype.u32{.sat}.mode.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\n\nvop.dtype.atype.u32{.sat}.mode  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vshl, vshr };\n\n.dtype = .atype = { .u32, .s32 };\n\n.mode  = { .clamp, .wrap };\n\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.op2   = { .add, .min, .max };\n\nDescription\n\nvshlShift a left by unsigned amount in b with optional saturate, and optional secondary\n\narithmetic operation or subword data merge. Left shift fills with zero.\n\nvshrShift a right by unsigned amount in b with optional saturate, and optional secondary\n\narithmetic operation or subword data merge. Signed shift fills with the sign bit, unsigned shift\n\nfills with zero.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a,atype, asel );\n\ntb = partSelectSignExtend( b, .u32, bsel );\n\nif ( mode == .clamp  && tb > 32 )  tb = 32;\n\nif ( mode == .wrap )                       tb = tb & 0x1f;\n\nswitch ( vop ){\n\n   case vshl:  tmp = ta << tb;\n\n   case vshr:  tmp = ta >> tb;\n\n}\n\n// saturate, taking into account destination type and merge operations\n\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\n\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\n\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvshl, vshr require sm_20 or higher.\n\nExamples\n\nvshl.s32.u32.u32.clamp  r1, r2, r3;\n\nvshr.u32.u32.u32.wrap   r1, r2, r3.h1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vshl-vshr"
            };

        case "vsub":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax\" target=\"_blank\" rel=\"noopener noreferrer\">vsub <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Scalar Video Instructions: vadd, vsub, vabsdiff, vmin, vmax</h1><section id=\"scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax\">\n\n\n<p>Integer byte/half-word/word addition/subtraction.</p>\n<p><strong>vabsdiff</strong></p>\n<p>Integer byte/half-word/word absolute value of difference.</p>\n<p><strong>vmin, vmax</strong></p>\n<p>Integer byte/half-word/word minimum/maximum.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// 32-bit scalar operation, with optional secondary operation\nvop.dtype.atype.btype{.sat}       d, a{.asel}, b{.bsel};\nvop.dtype.atype.btype{.sat}.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\nvop.dtype.atype.btype{.sat}  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vadd, vsub, vabsdiff, vmin, vmax };\n.dtype = .atype = .btype = { .u32, .s32 };\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n.op2   = { .add, .min, .max };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Perform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract byte/half-word/word and sign- or zero-extend\n// based on source operand type\nta = partSelectSignExtend( a, atype, asel );\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n    case vadd:     tmp = ta + tb;\n    case vsub:     tmp = ta - tb;\n    case vabsdiff: tmp = | ta - tb |;\n    case vmin:     tmp = 
MIN( ta, tb );\n    case vmax:     tmp = MAX( ta, tb );\n}\n// saturate, taking into account destination type and merge operations\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 2.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vadd</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vsub</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vabsdiff</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmin</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmax</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_20</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vadd.s32.u32.s32.sat      r1, r2.b0, r3.h0;\nvsub.s32.s32.u32.sat      r1, r2.h1, r3.h1;\nvabsdiff.s32.s32.s32.sat  r1.h0, r2.b0, r3.b2, c;\nvmin.s32.s32.s32.sat.add  r1, r2, r3, c;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer byte/half-word/word addition/subtraction.\n\nvabsdiff\n\nInteger byte/half-word/word absolute value of difference.\n\nvmin, vmax\n\nInteger byte/half-word/word minimum/maximum.\n\nSyntax\n\n// 32-bit scalar operation, with optional secondary operation\n\nvop.dtype.atype.btype{.sat}       d, a{.asel}, b{.bsel};\n\nvop.dtype.atype.btype{.sat}.op2   d, a{.asel}, b{.bsel}, c;\n\n// 32-bit scalar operation, with optional data merge\n\nvop.dtype.atype.btype{.sat}  d.dsel, a{.asel}, b{.bsel}, c;\n\n vop   = { vadd, vsub, vabsdiff, vmin, vmax };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.dsel  = .asel  = .bsel  = { .b0, .b1, .b2, .b3, .h0, .h1 };\n\n.op2   = { .add, .min, .max };\n\nDescription\n\nPerform scalar arithmetic operation with optional saturate, and optional secondary arithmetic operation or subword data merge.\n\nSemantics\n\n// extract byte/half-word/word and sign- or zero-extend\n\n// based on source operand type\n\nta = partSelectSignExtend( a, atype, asel );\n\ntb = partSelectSignExtend( b, btype, bsel );\n\nswitch ( vop ) {\n\n    case vadd:     tmp = ta + tb;\n\n    case vsub:     tmp = ta - tb;\n\n    case vabsdiff: tmp = | ta - tb |;\n\n    case vmin:     tmp = MIN( ta, tb );\n\n    case vmax:     tmp = MAX( ta, tb );\n\n}\n\n// saturate, taking into account destination type and merge operations\n\ntmp = optSaturate( tmp, sat, isSigned(dtype), dsel );\n\nd = optSecondaryOp( op2, tmp, c );  // optional secondary operation\n\nd = optMerge( dsel, tmp, c );       // optional merge with c operand\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 2.0.\n\nTarget ISA Notes\n\nvadd, vsub, vabsdiff, vmin, vmax require sm_20 or higher.\n\nExamples\n\nvadd.s32.u32.s32.sat      r1, r2.b0, r3.h0;\n\nvsub.s32.s32.u32.sat      r1, r2.h1, r3.h1;\n\nvabsdiff.s32.s32.s32.sat  r1.h0, r2.b0, r3.b2, c;\n\nvmin.s32.s32.s32.sat.add  r1, r2, r3, c;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#scalar-video-instructions-vadd-vsub-vabsdiff-vmin-vmax"
            };

        case "vsub2":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2\" target=\"_blank\" rel=\"noopener noreferrer\">vsub2 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>SIMD Video Instructions: vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2</h1><section id=\"simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2\">\n\n\n<p>Integer dual half-word SIMD addition/subtraction.</p>\n<p><strong>vavrg2</strong></p>\n<p>Integer dual half-word SIMD average.</p>\n<p><strong>vabsdiff2</strong></p>\n<p>Integer dual half-word SIMD absolute value of difference.</p>\n<p><strong>vmin2, vmax2</strong></p>\n<p>Integer dual half-word SIMD minimum/maximum.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// SIMD instruction with secondary SIMD merge operation\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n   .asel defaults to .h10\n   .bsel defaults to .h32\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Two-way SIMD parallel arithmetic operation with secondary operation.</p>\n<p>Elements of each dual half-word source to the operation are selected from any of the four half-words\nin the two source operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code> using the <code class=\"docutils literal notranslate\"><span 
class=\"pre\">asel</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">bsel</span></code> modifiers.</p>\n<p>The selected half-words are then operated on in parallel.</p>\n<p>The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.</p>\n<p>For instructions with a secondary SIMD merge operation:</p>\n<ul class=\"simple\">\n<li><p>For half-word positions indicated in mask, the selected half-word results are copied into\ndestination <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. For all other positions, the corresponding half-word from source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>\nis copied to <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p>For instructions with a secondary accumulate operation:</p>\n<ul class=\"simple\">\n<li><p>For half-word positions indicated in mask, the selected half-word results are added to operand\n<code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>, producing a result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract pairs of half-words and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_2( a, b, .asel, .atype );\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i&lt;2; i++) {\n    switch ( vop2 ) {\n       case vadd2:             t[i] = Va[i] + Vb[i];\n       case vsub2:             t[i] = Va[i] - Vb[i];\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) &gt;= 0 ) {\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) &gt;&gt; 1;\n                           
    } else {\n                                   t[i] = ( Va[i] + Vb[i] ) &gt;&gt; 1;\n                               }\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i&lt;2; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i&lt;2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vadd2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vsub2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">varvg2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vabsdiff2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmin2</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmax2</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vadd2.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer dual half-word SIMD addition/subtraction.\n\nvavrg2\n\nInteger dual half-word SIMD average.\n\nvabsdiff2\n\nInteger dual half-word SIMD absolute value of difference.\n\nvmin2, vmax2\n\nInteger dual half-word SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop2.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop2.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\n vop2  = { vadd2, vsub2, vavrg2, vabsdiff2, vmin2, vmax2 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask  = { .h0, .h1, .h10 };  // defaults to .h10\n\n.asel  = .bsel  = { .hxy, where x,y are from { 0, 1, 2, 3 } };\n\n   .asel defaults to .h10\n\n   .bsel defaults to .h32\n\nDescription\n\nTwo-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each dual half-word source to the operation are selected from any of the four half-words\n\nin the two source operands a and b using the asel and bsel modifiers.\n\nThe selected half-words are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor half-word positions indicated in mask, the selected half-word results are copied into\n\ndestination d. 
For all other positions, the corresponding half-word from source operand c\n\nis copied to d.\n\nFor instructions with a secondary accumulate operation:\n\nFor half-word positions indicated in mask, the selected half-word results are added to operand\n\nc, producing a result in d.\n\nSemantics\n\n// extract pairs of half-words and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_2( a, b, .asel, .atype );\n\nVb = extractAndSignExt_2( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_2( c );\n\nfor (i=0; i<2; i++) {\n\n    switch ( vop2 ) {\n\n       case vadd2:             t[i] = Va[i] + Vb[i];\n\n       case vsub2:             t[i] = Va[i] - Vb[i];\n\n       case vavrg2:            if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n                               } else {\n\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n                               }\n\n       case vabsdiff2:         t[i] = | Va[i] - Vb[i] |;\n\n       case vmin2:             t[i] = MIN( Va[i], Vb[i] );\n\n       case vmax2:             t[i] = MAX( Va[i], Vb[i] );\n\n    }\n\n    if (.sat) {\n\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S16_MAX, S16_MIN );\n\n        else                   t[i] = CLAMP( t[i], U16_MAX, U16_MIN );\n\n    }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n    d = c;\n\n    for (i=0; i<2; i++) {  d += mask[i] ? t[i] : 0;  }\n\n} else {\n\n    d = 0;\n\n    for (i=0; i<2; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd2, vsub2, varvg2, vabsdiff2, vmin2, vmax2 require sm_30 or higher.\n\nExamples\n\nvadd2.s32.s32.u32.sat  r1, r2, r3, r1;\n\nvsub2.s32.s32.s32.sat  r1.h0, r2.h10, r3.h32, r1;\n\nvmin2.s32.u32.u32.add  r1.h10, r2.h00, r3.h22, r1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd2-vsub2-vavrg2-vabsdiff2-vmin2-vmax2"
            };

        case "vsub4":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4\" target=\"_blank\" rel=\"noopener noreferrer\">vsub4 <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>SIMD Video Instructions: vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4</h1><section id=\"simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4\">\n\n\n<p>Integer quad byte SIMD addition/subtraction.</p>\n<p><strong>vavrg4</strong></p>\n<p>Integer quad byte SIMD average.</p>\n<p><strong>vabsdiff4</strong></p>\n<p>Integer quad byte SIMD absolute value of difference.</p>\n<p><strong>vmin4, vmax4</strong></p>\n<p>Integer quad byte SIMD minimum/maximum.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// SIMD instruction with secondary SIMD merge operation\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\nvop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n.mask  = { .b0,\n           .b1, .b10\n           .b2, .b20, .b21, .b210,\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n    defaults to .b3210\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n   .asel defaults to .b3210\n   .bsel defaults to .b7654\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Four-way SIMD parallel arithmetic operation with secondary operation.</p>\n<p>Elements of each quad byte source to the operation are selected from any of the eight bytes in the\ntwo source operands <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span 
class=\"pre\">b</span></code> using the <code class=\"docutils literal notranslate\"><span class=\"pre\">asel</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">bsel</span></code> modifiers.</p>\n<p>The selected bytes are then operated on in parallel.</p>\n<p>The results are optionally clamped to the appropriate range determined by the destination type\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.</p>\n<p>For instructions with a secondary SIMD merge operation:</p>\n<ul class=\"simple\">\n<li><p>For byte positions indicated in mask, the selected byte results are copied into destination\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>. For all other positions, the corresponding byte from source operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code> is copied to\n<code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p>For instructions with a secondary accumulate operation:</p>\n<ul class=\"simple\">\n<li><p>For byte positions indicated in mask, the selected byte results are added to operand <code class=\"docutils literal notranslate\"><span class=\"pre\">c</span></code>,\nproducing a result in <code class=\"docutils literal notranslate\"><span class=\"pre\">d</span></code>.</p></li>\n</ul>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>// extract quads of bytes and sign- or zero-extend\n// based on operand type\nVa = extractAndSignExt_4( a, b, .asel, .atype );\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\nVc = extractAndSignExt_4( c );\nfor (i=0; i&lt;4; i++) {\n    switch ( vop4 ) {\n        case vadd4:            t[i] = Va[i] + Vb[i];\n        case vsub4:            t[i] = Va[i] - Vb[i];\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) &gt;= 0 ) {\n                                   t[i] = ( Va[i] 
+ Vb[i] + 1 ) &gt;&gt; 1;\n                               } else {\n                                   t[i] = ( Va[i] + Vb[i] ) &gt;&gt; 1;\n                               }\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n    }\n    if (.sat) {\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n    }\n}\n// secondary accumulate or SIMD merge\nmask = extractMaskBits( .mask );\nif (.add) {\n    d = c;\n    for (i=0; i&lt;4; i++) {  d += mask[i] ? t[i] : 0;  }\n} else {\n    d = 0;\n    for (i=0; i&lt;4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n}\n</pre></div>\n</div>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p><code class=\"docutils literal notranslate\"><span class=\"pre\">vadd4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vsub4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">varvg4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vabsdiff4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmin4</span></code>, <code class=\"docutils literal notranslate\"><span class=\"pre\">vmax4</span></code> require <code class=\"docutils literal notranslate\"><span class=\"pre\">sm_30</span></code> or higher.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>vadd4.s32.s32.u32.sat  r1, r2, r3, r1;\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Integer quad byte SIMD addition/subtraction.\n\nvavrg4\n\nInteger quad byte SIMD average.\n\nvabsdiff4\n\nInteger quad byte SIMD absolute value of difference.\n\nvmin4, vmax4\n\nInteger quad byte SIMD minimum/maximum.\n\nSyntax\n\n// SIMD instruction with secondary SIMD merge operation\n\nvop4.dtype.atype.btype{.sat}  d{.mask}, a{.asel}, b{.bsel}, c;\n\n// SIMD instruction with secondary accumulate operation\n\nvop4.dtype.atype.btype.add  d{.mask}, a{.asel}, b{.bsel}, c;\n\nvop4  = { vadd4, vsub4, vavrg4, vabsdiff4, vmin4, vmax4 };\n\n.dtype = .atype = .btype = { .u32, .s32 };\n\n.mask  = { .b0,\n\n           .b1, .b10\n\n           .b2, .b20, .b21, .b210,\n\n           .b3, .b30, .b31, .b310, .b32, .b320, .b321, .b3210 };\n\n    defaults to .b3210\n\n.asel = .bsel = .bxyzw, where x,y,z,w are from { 0, ..., 7 };\n\n   .asel defaults to .b3210\n\n   .bsel defaults to .b7654\n\nDescription\n\nFour-way SIMD parallel arithmetic operation with secondary operation.\n\nElements of each quad byte source to the operation are selected from any of the eight bytes in the\n\ntwo source operands a and b using the asel and bsel modifiers.\n\nThe selected bytes are then operated on in parallel.\n\nThe results are optionally clamped to the appropriate range determined by the destination type\n\n(signed or unsigned). Saturation cannot be used with the secondary accumulate operation.\n\nFor instructions with a secondary SIMD merge operation:\n\nFor byte positions indicated in mask, the selected byte results are copied into destination\n\nd. 
For all other positions, the corresponding byte from source operand c is copied to\n\nd.\n\nFor instructions with a secondary accumulate operation:\n\nFor byte positions indicated in mask, the selected byte results are added to operand c,\n\nproducing a result in d.\n\nSemantics\n\n// extract quads of bytes and sign- or zero-extend\n\n// based on operand type\n\nVa = extractAndSignExt_4( a, b, .asel, .atype );\n\nVb = extractAndSignExt_4( a, b, .bsel, .btype );\n\nVc = extractAndSignExt_4( c );\n\nfor (i=0; i<4; i++) {\n\n    switch ( vop4 ) {\n\n        case vadd4:            t[i] = Va[i] + Vb[i];\n\n        case vsub4:            t[i] = Va[i] - Vb[i];\n\n        case vavrg4:           if ( ( Va[i] + Vb[i] ) >= 0 ) {\n\n                                   t[i] = ( Va[i] + Vb[i] + 1 ) >> 1;\n\n                               } else {\n\n                                   t[i] = ( Va[i] + Vb[i] ) >> 1;\n\n                               }\n\n        case vabsdiff4:        t[i] = | Va[i] - Vb[i] |;\n\n        case vmin4:            t[i] = MIN( Va[i], Vb[i] );\n\n        case vmax4:            t[i] = MAX( Va[i], Vb[i] );\n\n    }\n\n    if (.sat) {\n\n        if ( .dtype == .s32 )  t[i] = CLAMP( t[i], S8_MAX, S8_MIN );\n\n        else                   t[i] = CLAMP( t[i], U8_MAX, U8_MIN );\n\n    }\n\n}\n\n// secondary accumulate or SIMD merge\n\nmask = extractMaskBits( .mask );\n\nif (.add) {\n\n    d = c;\n\n    for (i=0; i<4; i++) {  d += mask[i] ? t[i] : 0;  }\n\n} else {\n\n    d = 0;\n\n    for (i=0; i<4; i++)  {  d |= mask[i] ? t[i] : Vc[i];  }\n\n}\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.0.\n\nTarget ISA Notes\n\nvadd4, vsub4, varvg4, vabsdiff4, vmin4, vmax4 require sm_30 or higher.\n\nExamples\n\nvadd4.s32.s32.u32.sat  r1, r2, r3, r1;\n\nvsub4.s32.s32.s32.sat  r1.b0, r2.b3210, r3.b7654, r1;\n\nvmin4.s32.u32.u32.add  r1.b00, r2.b0000, r3.b2222, r1;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#simd-video-instructions-vadd4-vsub4-vavrg4-vabsdiff4-vmin4-vmax4"
            };

        case "warpid":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-warpid\" target=\"_blank\" rel=\"noopener noreferrer\">warpid <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Special Registers: %warpid</h1><section id=\"special-registers-warpid\">\n\n\n<p>Warp identifier.</p>\n<p><strong>Syntax (predefined)</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.sreg .u32 %warpid;\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>A predefined, read-only special register that returns the thread\u2019s warp identifier. The warp\nidentifier provides a unique warp number within a CTA but not across CTAs within a grid. The warp\nidentifier will be the same for all threads within a single warp.</p>\n<p>Note that <code class=\"docutils literal notranslate\"><span class=\"pre\">%warpid</span></code> is volatile and returns the location of a thread at the moment when read, but\nits value may change during execution, e.g., due to rescheduling of threads following\npreemption. 
For this reason, <code class=\"docutils literal notranslate\"><span class=\"pre\">%ctaid</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">%tid</span></code> should be used to compute a virtual warp index\nif such a value is needed in kernel code; <code class=\"docutils literal notranslate\"><span class=\"pre\">%warpid</span></code> is intended mainly to enable profiling and\ndiagnostic code to sample and log information such as work place mapping and load distribution.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.3.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>mov.u32  %r, %warpid;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Warp identifier.\n\nSyntax (predefined)\n\n.sreg .u32 %warpid;\n\nDescription\n\nA predefined, read-only special register that returns the thread\u2019s warp identifier. The warp\n\nidentifier provides a unique warp number within a CTA but not across CTAs within a grid. The warp\n\nidentifier will be the same for all threads within a single warp.\n\nNote that %warpid is volatile and returns the location of a thread at the moment when read, but\n\nits value may change during execution, e.g., due to rescheduling of threads following\n\npreemption. For this reason, %ctaid and %tid should be used to compute a virtual warp index\n\nif such a value is needed in kernel code; %warpid is intended mainly to enable profiling and\n\ndiagnostic code to sample and log information such as work place mapping and load distribution.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.3.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nmov.u32  %r, %warpid;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers-warpid"
            };

        case "weak":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#linking-directives-weak\" target=\"_blank\" rel=\"noopener noreferrer\">weak <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Linking Directives: .weak</h1><section id=\"linking-directives-weak\">\n\n\n<p>Visible (externally) symbol declaration.</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.weak identifier\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Declares identifier to be globally visible but <em>weak</em>. Weak symbols are similar to globally visible\nsymbols, except during linking, weak symbols are only chosen after globally visible symbols during\nsymbol resolution. Unlike globally visible symbols, multiple object files may declare the same weak\nsymbol, and references to a symbol get resolved against a weak symbol only if no global symbols have\nthe same name.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 3.1.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>.weak .func (.reg .b32 val) foo;  // foo will be externally visible\n</pre></div>\n</div>\n</section>",
                "tooltip": "Visible (externally) symbol declaration.\n\nSyntax\n\n.weak identifier\n\nDescription\n\nDeclares identifier to be globally visible but weak. Weak symbols are similar to globally visible\n\nsymbols, except during linking, weak symbols are only chosen after globally visible symbols during\n\nsymbol resolution. Unlike globally visible symbols, multiple object files may declare the same weak\n\nsymbol, and references to a symbol get resolved against a weak symbol only if no global symbols have\n\nthe same name.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 3.1.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\n.weak .func (.reg .b32 val) foo;  // foo will be externally visible\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#linking-directives-weak"
            };

        case "xor":
            return {
                "html": "For more information, visit <a href=\"https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-xor\" target=\"_blank\" rel=\"noopener noreferrer\">xor <sup><small class=\"fas fa-external-link-alt opens-new-window\" title=\"Opens in a new window\"></small></sup></a>.<h1>Logic and Shift Instructions: xor</h1><section id=\"logic-and-shift-instructions-xor\">\n\n\n<p>Bitwise exclusive-OR (inequality).</p>\n<p><strong>Syntax</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>xor.type d, a, b;\n\n.type = { .pred, .b16, .b32, .b64 };\n</pre></div>\n</div>\n<p><strong>Description</strong></p>\n<p>Compute the bit-wise exclusive-or operation for the bits in <code class=\"docutils literal notranslate\"><span class=\"pre\">a</span></code> and <code class=\"docutils literal notranslate\"><span class=\"pre\">b</span></code>.</p>\n<p><strong>Semantics</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>d = a ^ b;\n</pre></div>\n</div>\n<p><strong>Notes</strong></p>\n<p>The size of the operands must match, but not necessarily the type.</p>\n<p>Allowed types include predicate registers.</p>\n<p><strong>PTX ISA Notes</strong></p>\n<p>Introduced in PTX ISA version 1.0.</p>\n<p><strong>Target ISA Notes</strong></p>\n<p>Supported on all target architectures.</p>\n<p><strong>Examples</strong></p>\n<div class=\"highlight-text notranslate\"><div class=\"highlight\"><pre><span></span>xor.b32  d,q,r;\nxor.b16  d,x,0x0001;\n</pre></div>\n</div>\n</section>",
                "tooltip": "Bitwise exclusive-OR (inequality).\n\nSyntax\n\nxor.type d, a, b;\n\n.type = { .pred, .b16, .b32, .b64 };\n\nDescription\n\nCompute the bit-wise exclusive-or operation for the bits in a and b.\n\nSemantics\n\nd = a ^ b;\n\nNotes\n\nThe size of the operands must match, but not necessarily the type.\n\nAllowed types include predicate registers.\n\nPTX ISA Notes\n\nIntroduced in PTX ISA version 1.0.\n\nTarget ISA Notes\n\nSupported on all target architectures.\n\nExamples\n\nxor.b32  d,q,r;\n\nxor.b16  d,x,0x0001;\n\n ...",
                "url": "https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-xor"
            };


    }
}