The Standard Duff's Device Trick:
Duff's Version C Code
void duff( int *to, int *from, int count ) { register n = (count + 7) / 8; /* count > 0 assumed */ switch (count % 8) { case 0: do { *to++ = *from++; case 7: *to++ = *from++; case 6: *to++ = *from++; case 5: *to++ = *from++; case 4: *to++ = *from++; case 3: *to++ = *from++; case 2: *to++ = *from++; case 1: *to++ = *from++; } while (--n > 0); }
Duff's Version Assembly
The assembly output from the very good TMS320C6701 C/C++ Compiler shows that the above C code is not 'high enough level' to allow the compiler to perform proper optimisation on a system that has explicit parallel operations. Note the comments are auto-generated by the compiler, and NOP takes a parameter, and the cpu is software pipelined:
;****************************************************************************** ;* TMS320C6x ANSI C Codegen Version 4.00 * ;* Date/Time created: Thu Jul 20 21:22:47 2006 * ;****************************************************************************** ;****************************************************************************** ;* GLOBAL FILE PARAMETERS * ;* * ;* Architecture : TMS320C670x * ;* Optimization : Enabled at level 2 * ;* Optimizing for : Speed * ;* Based on options: -o2, no -ms * ;* Endian : Little * ;* Interrupt Thrshld : Infinite * ;* Memory Model : Large * ;* Calls to RTS : Far * ;* Pipelining : Enabled * ;* Speculative Load : Enabled (Threshold = 400 ) * ;* Memory Aliases : Presume not aliases (optimistic) * ;* Debug Info : Debug * ;* * ;****************************************************************************** FP .set A15 DP .set B14 SP .set B15 .global $bss ; opt6x -q -t -DI -DI -v6700 -i100 -h2 -n2 -s -O2 c:\windows\temp\TI8_2 c:\windows\temp\TI8_4 -w obj .file "duff.cpp" .sect ".text" .global ___duff__FPiT1i .sym ___duff__FPiT1i,___duff__FPiT1i, 32, 2, 0 .func 3 ;---------------------------------------------------------------------- ; 3 | void duff( int *to, int *from, int count ) ;---------------------------------------------------------------------- ;****************************************************************************** ;* FUNCTION NAME: _duff(int *, int *, int) * ;* * ;* Regs Modified : A0,A3,B0,B4,B5,B6 * ;* Regs Used : A0,A3,A4,A6,B0,B3,B4,B5,B6 * ;* Local Frame Size : 0 Args + 0 Auto + 0 Save = 0 byte * ;****************************************************************************** ___duff__FPiT1i: ;** --------------------------------------------------------------------------* .sym _to,4, 20, 17, 32 .sym _from,20, 20, 17, 32 .sym _count,6, 4, 17, 32 .sym _n,20, 4, 4, 32 .sym _from,3, 20, 4, 32 .sym _to,4, 20, 4, 32 .sym _from,0, 20, 4, 32 .sym _count,6, 4, 4, 32 ;** ----------------------- from = from; ;** 5 ----------------------- n = (count+7)/8; ;** 7 ----------------------- switch ( count%8 ) {...}; .line 2 MV .L1X B4,A3 ; .line 3 ;---------------------------------------------------------------------- ; 5 | register n = (count + 7) / 8; /* count > 0 assumed */ ;---------------------------------------------------------------------- ADD .L1 7,A6,A0 ; |5| SHR .S2X A0,2,B4 ; |5| SHRU .S2 B4,29,B4 ; |5| ADD .L2X B4,A0,B4 ; |5| SHR .S2 B4,3,B4 ; |5| .line 5 ;---------------------------------------------------------------------- ; 7 | switch (count % 8) ;---------------------------------------------------------------------- SHR .S1 A6,2,A0 ; |7| SHRU .S1 A0,29,A0 ; |7| || MVK .S2 7,B5 ; |7| ADD .L1 A0,A6,A0 ; |7| || NOT .L2 B5,B5 ; |7| AND .L2X B5,A0,B5 ; |7| SUB .L2X A6,B5,B6 ; |7| CMPGTU .L2 B6,7,B0 ; [ B0] B .S1 L10 ; |7| MVKL .S2 SW1,B5 ; |7| MVKH .S2 SW1,B5 ; |7| [!B0] LDW .D2T2 *+B5[B6],B5 ; |7| NOP 2 ; BRANCH OCCURS ; |7| ;** --------------------------------------------------------------------------* NOP 4 B .S2 B5 ; |7| NOP 5 .sect ".switch" SW1: .word L1 ; 0 .word L9 ; 1 .word L8 ; 2 .word L7 ; 3 .word L6 ; 4 .word L5 ; 5 .word L4 ; 6 .word L3 ; 7 .sect ".text" ; BRANCH OCCURS ; |7| ;** --------------------------------------------------------------------------* L1: ;** -----------------------g1: ;** --------------------------------------------------------------------------* L2: ;** -----------------------g3: ;** 9 ----------------------- *to = *from++; .line 7 ;---------------------------------------------------------------------- ; 9 | case 0: do { *to = *from++; ;---------------------------------------------------------------------- LDW .D1T1 *A3++,A0 ; |9| NOP 4 STW .D1T1 A0,*A4 ; |9| ;** --------------------------------------------------------------------------* L3: ;** -----------------------g4: ;** 10 ----------------------- *to = *from++; .line 8 ;---------------------------------------------------------------------- ; 10 | case 7: *to = *from++; ;---------------------------------------------------------------------- LDW .D1T1 *A3++,A0 ; |10| NOP 4 STW .D1T1 A0,*A4 ; |10| ;** --------------------------------------------------------------------------* L4: ;** -----------------------g5: ;** 11 ----------------------- *to = *from++; .line 9 ;---------------------------------------------------------------------- ; 11 | case 6: *to = *from++; ;---------------------------------------------------------------------- LDW .D1T1 *A3++,A0 ; |11| NOP 4 STW .D1T1 A0,*A4 ; |11| ;** --------------------------------------------------------------------------* L5: ;** -----------------------g6: ;** 12 ----------------------- *to = *from++; .line 10 ;---------------------------------------------------------------------- ; 12 | case 5: *to = *from++; ;---------------------------------------------------------------------- LDW .D1T1 *A3++,A0 ; |12| NOP 4 STW .D1T1 A0,*A4 ; |12| ;** --------------------------------------------------------------------------* L6: ;** -----------------------g7: ;** 13 ----------------------- *to = *from++; .line 11 ;---------------------------------------------------------------------- ; 13 | case 4: *to = *from++; ;---------------------------------------------------------------------- LDW .D1T1 *A3++,A0 ; |13| NOP 4 STW .D1T1 A0,*A4 ; |13| ;** --------------------------------------------------------------------------* L7: ;** -----------------------g8: ;** 14 ----------------------- *to = *from++; .line 12 ;---------------------------------------------------------------------- ; 14 | case 3: *to = *from++; ;---------------------------------------------------------------------- LDW .D1T1 *A3++,A0 ; |14| NOP 4 STW .D1T1 A0,*A4 ; |14| ;** --------------------------------------------------------------------------* L8: ;** -----------------------g9: ;** 15 ----------------------- *to = *from++; .line 13 ;---------------------------------------------------------------------- ; 15 | case 2: *to = *from++; ;---------------------------------------------------------------------- LDW .D1T1 *A3++,A0 ; |15| NOP 4 STW .D1T1 A0,*A4 ; |15| ;** --------------------------------------------------------------------------* L9: ;** -----------------------g10: ;** 16 ----------------------- *to = *from++; ;** 17 ----------------------- if ( (--n) > 0 ) goto g3; .line 14 ;---------------------------------------------------------------------- ; 16 | case 1: *to = *from++; ;---------------------------------------------------------------------- LDW .D1T1 *A3++,A0 ; |16| NOP 4 STW .D1T1 A0,*A4 ; |16| .line 15 ;---------------------------------------------------------------------- ; 17 | } while (--n > 0); ;---------------------------------------------------------------------- CMPGT .L2 B4,1,B0 ; [ B0] B .S1 L2 ; |17| SUB .L2 B4,1,B4 ; NOP 4 ; BRANCH OCCURS ; |17| ;** --------------------------------------------------------------------------* ;** -----------------------g11: ;** ----------------------- return; ;** --------------------------------------------------------------------------* L10: .line 17 B .S2 B3 ; |19| NOP 5 ; BRANCH OCCURS ; |19| .endfunc 19,000000000h,0
Non-Duff's Version, with Array Indexing
Whereas the plain old loop without pointer arithmetic:
void nonduff( int *to, int *from, int count ) { for( int i=0; i<count; i++ ) { to[i] = from[i]; } }
Non-Duff's Version, with Array Indexing Assembly
optimizes the best. See the 'PIPED LOOP KERNEL' section. All 6 opcodes in the loop are executed in parallel, in one clock cycle.
;****************************************************************************** ;* TMS320C6x ANSI C Codegen Version 4.00 * ;* Date/Time created: Thu Jul 20 21:33:14 2006 * ;****************************************************************************** ;****************************************************************************** ;* GLOBAL FILE PARAMETERS * ;* * ;* Architecture : TMS320C670x * ;* Optimization : Enabled at level 2 * ;* Optimizing for : Speed * ;* Based on options: -o2, no -ms * ;* Endian : Little * ;* Interrupt Thrshld : Infinite * ;* Memory Model : Large * ;* Calls to RTS : Far * ;* Pipelining : Enabled * ;* Speculative Load : Enabled (Threshold = 400 ) * ;* Memory Aliases : Presume not aliases (optimistic) * ;* Debug Info : Debug * ;* * ;****************************************************************************** FP .set A15 DP .set B14 SP .set B15 .global $bss ; opt6x -q -t -DI -DI -v6700 -i100 -h2 -n2 -s -O2 c:\windows\temp\TI8_2 c:\windows\temp\TI8_4 -w obj .file "nonduff.cpp" .sect ".text" .global ___nonduff__FPiT1i .sym ___nonduff__FPiT1i,___nonduff__FPiT1i, 32, 2, 0 .func 3 ;---------------------------------------------------------------------- ; 3 | void duff( int *to, int *from, int count ) ;---------------------------------------------------------------------- ;****************************************************************************** ;* FUNCTION NAME: _nonduff(int *, int *, int) * ;* * ;* Regs Modified : A0,A1,A3,B0,B4,B5 * ;* Regs Used : A0,A1,A3,A4,A6,B0,B3,B4,B5 * ;* Local Frame Size : 0 Args + 0 Auto + 0 Save = 0 byte * ;****************************************************************************** ___nonduff__FPiT1i: ;** --------------------------------------------------------------------------* .sym _to,4, 20, 17, 32 .sym _from,20, 20, 17, 32 .sym _count,6, 4, 17, 32 .sym _to,4, 20, 4, 32 .sym _from,0, 20, 4, 32 .sym _count,21, 4, 4, 32 .sym L$1,21, 4, 4, 32 .sym U$11,4, 20, 4, 32 .sym U$9,0, 20, 4, 32 ;** 5 ----------------------- if ( count <= 0 ) goto g4; .line 2 MV .L1X B4,A0 ; .line 3 ;---------------------------------------------------------------------- ; 5 | for( int i=0; i<count; i++ ) ;---------------------------------------------------------------------- CMPGT .L2X A6,0,B0 ; [!B0] B .S1 L4 ; |5| MV .L2X A6,B5 ; NOP 4 ; BRANCH OCCURS ; |5| ;** --------------------------------------------------------------------------* ;** ----------------------- U$11 = to; ;** ----------------------- U$9 = from; ;** 7 ----------------------- L$1 = count; ;** ----------------------- #pragma MUST_ITERATE(1, 2147483647, 1) ;** -----------------------g3: ;** 7 ----------------------- *U$11++ = *U$9++; ;** 8 ----------------------- if ( --L$1 ) goto g3; .line 5 ;---------------------------------------------------------------------- ; 7 | to[i] = from[i]; ;---------------------------------------------------------------------- MVK .S1 0x1,A1 ; init prolog collapse predicate NOP 1 ;*----------------------------------------------------------------------------* ;* SOFTWARE PIPELINE INFORMATION ;* ;* Known Minimum Trip Count : 1 ;* Known Max Trip Count Factor : 1 ;* Loop Carried Dependency Bound(^) : 0 ;* Unpartitioned Resource Bound : 1 ;* Partitioned Resource Bound(*) : 1 ;* Resource Partition: ;* A-side B-side ;* .L units 0 0 ;* .S units 1* 0 ;* .D units 1* 1* ;* .M units 0 0 ;* .X cross paths 0 1* ;* .T address paths 1* 1* ;* Long read paths 0 1* ;* Long write paths 0 0 ;* Logical ops (.LS) 0 1 (.L or .S unit) ;* Addition ops (.LSD) 0 1 (.L or .S or .D unit) ;* Bound(.L .S .LS) 1* 1* ;* Bound(.L .S .D .LS .LSD) 1* 1* ;* ;* Searching for software pipeline schedule at ... ;* ii = 1 Schedule found with 7 iterations in parallel ;* done ;* ;* Collapsed epilog stages : 6 ;* Prolog not entirely removed ;* Collapsed prolog stages : 1 ;* ;* Minimum required memory pad : 24 bytes ;* ;* Minimum safe trip count : 1 ;*----------------------------------------------------------------------------* ;* SINGLE SCHEDULED ITERATION ;* ;* C23: ;* LDW .D1T1 *A0++,A3 ; |7| ;* || [ B0] SUB .S2 B0,1,B0 ; |8| ;* [ B0] B .S1 C23 ; |8| ;* NOP 3 ;* MV .L2X A3,B5 ; Define a twin register ;* STW .D2T2 B5,*B4++ ; |7| ;* ; BRANCH OCCURS ; |8| ;*----------------------------------------------------------------------------* L1: ; PIPED LOOP PROLOG SUB .L2 B5,1,B0 ; || LDW .D1T1 *A0++,A3 ; (P) |7| || B .S1 L2 ; (P) |8| [ B0] SUB .L2 B0,1,B0 ; (P) @@|8| || [ B0] B .S1 L2 ; (P) @|8| || LDW .D1T1 *A0++,A3 ; (P) @|7| [ B0] SUB .L2 B0,1,B0 ; (P) @@@|8| || LDW .D1T1 *A0++,A3 ; (P) @@|7| || [ B0] B .S1 L2 ; (P) @@|8| [ B0] SUB .L2 B0,1,B0 ; (P) @@@@|8| || [ B0] B .S1 L2 ; (P) @@@|8| || LDW .D1T1 *A0++,A3 ; (P) @@@|7| MV .L2X A4,B4 || [ B0] SUB .S2 B0,1,B0 ; (P) @@@@@|8| || LDW .D1T1 *A0++,A3 ; (P) @@@@|7| || [ B0] B .S1 L2 ; (P) @@@@|8| ;** --------------------------------------------------------------------------* L2: ; PIPED LOOP KERNEL [ A1] SUB .L1 A1,1,A1 ; || [!A1] STW .D2T2 B5,*B4++ ; |7| || MV .L2X A3,B5 ; @Define a twin register || [ B0] B .S1 L2 ; @@@@@|8| || [ B0] SUB .S2 B0,1,B0 ; @@@@@@|8| || LDW .D1T1 *A0++,A3 ; @@@@@@|7| ;** --------------------------------------------------------------------------* L3: ; PIPED LOOP EPILOG ;** --------------------------------------------------------------------------* ;** -----------------------g4: ;** ----------------------- return; ;** --------------------------------------------------------------------------* L4: NOP 2 .line 8 B .S2 B3 ; |10| NOP 5 ; BRANCH OCCURS ; |10| .endfunc 10,000000000h,0
