Error: Failed to load processor AutoNav
No macro or processor named 'AutoNav' found

The Standard Duff's Device Trick:

Duff's Version C Code

void duff( int *to, int *from, int count )
{
     register n = (count + 7) / 8;      /* count > 0 assumed */

     switch (count % 8)
     {
     case 0:        do {  *to++ = *from++;
     case 7:              *to++ = *from++;
     case 6:              *to++ = *from++;
     case 5:              *to++ = *from++;
     case 4:              *to++ = *from++;
     case 3:              *to++ = *from++;
     case 2:              *to++ = *from++;
     case 1:              *to++ = *from++;
     } while (--n > 0);
  
}

Duff's Version Assembly

The assembly output from the very good TMS320C6701 C/C++ Compiler shows that the above C code is not 'high enough level' to allow the compiler to perform proper optimisation on a system that has explicit parallel operations. Note the comments are auto-generated by the compiler, and NOP takes a parameter, and the cpu is software pipelined:

;******************************************************************************
;* TMS320C6x ANSI C Codegen                                      Version 4.00 *
;* Date/Time created: Thu Jul 20 21:22:47 2006                                *
;******************************************************************************

;******************************************************************************
;* GLOBAL FILE PARAMETERS                                                     *
;*                                                                            *
;*   Architecture      : TMS320C670x                                          *
;*   Optimization      : Enabled at level 2                                   *
;*   Optimizing for    : Speed                                                *
;*                       Based on options: -o2, no -ms                        *
;*   Endian            : Little                                               *
;*   Interrupt Thrshld : Infinite                                             *
;*   Memory Model      : Large                                                *
;*   Calls to RTS      : Far                                                  *
;*   Pipelining        : Enabled                                              *
;*   Speculative Load  : Enabled (Threshold = 400                            ) *
;*   Memory Aliases    : Presume not aliases (optimistic)                     *
;*   Debug Info        : Debug                                                *
;*                                                                            *
;******************************************************************************

FP .set  A15
DP .set  B14
SP .set  B15
   .global  $bss

;  opt6x -q -t -DI -DI -v6700 -i100 -h2 -n2 -s -O2 c:\windows\temp\TI8_2 c:\windows\temp\TI8_4 -w obj 
   .file "duff.cpp"
   .sect ".text"
   .global  ___duff__FPiT1i
   .sym  ___duff__FPiT1i,___duff__FPiT1i, 32, 2, 0
   .func 3
;----------------------------------------------------------------------
;   3 | void duff( int *to, int *from, int count )                             
;----------------------------------------------------------------------

;******************************************************************************
;* FUNCTION NAME: _duff(int *, int *, int)                                    *
;*                                                                            *
;*   Regs Modified     : A0,A3,B0,B4,B5,B6                                    *
;*   Regs Used         : A0,A3,A4,A6,B0,B3,B4,B5,B6                           *
;*   Local Frame Size  : 0 Args + 0 Auto + 0 Save = 0 byte                    *
;******************************************************************************
___duff__FPiT1i:
;** --------------------------------------------------------------------------*
   .sym  _to,4, 20, 17, 32
   .sym  _from,20, 20, 17, 32
   .sym  _count,6, 4, 17, 32
   .sym  _n,20, 4, 4, 32
   .sym  _from,3, 20, 4, 32
   .sym  _to,4, 20, 4, 32
   .sym  _from,0, 20, 4, 32
   .sym  _count,6, 4, 4, 32
;**   -----------------------    from = from;
;** 5 -----------------------    n = (count+7)/8;
;** 7 -----------------------    switch ( count%8 ) {...};
   .line 2
           MV      .L1X    B4,A3             ; 
   .line 3
;----------------------------------------------------------------------
;   5 | register n = (count + 7) / 8;      /* count > 0 assumed */             
;----------------------------------------------------------------------
           ADD     .L1     7,A6,A0           ; |5| 
           SHR     .S2X    A0,2,B4           ; |5| 
           SHRU    .S2     B4,29,B4          ; |5| 
           ADD     .L2X    B4,A0,B4          ; |5| 
           SHR     .S2     B4,3,B4           ; |5| 
   .line 5
;----------------------------------------------------------------------
;   7 | switch (count % 8)                                                     
;----------------------------------------------------------------------
           SHR     .S1     A6,2,A0           ; |7| 

           SHRU    .S1     A0,29,A0          ; |7| 
||         MVK     .S2     7,B5              ; |7| 

           ADD     .L1     A0,A6,A0          ; |7| 
||         NOT     .L2     B5,B5             ; |7| 

           AND     .L2X    B5,A0,B5          ; |7| 
           SUB     .L2X    A6,B5,B6          ; |7| 
           CMPGTU  .L2     B6,7,B0           ; 
   [ B0]   B       .S1     L10               ; |7| 
           MVKL    .S2     SW1,B5            ; |7| 
           MVKH    .S2     SW1,B5            ; |7| 
   [!B0]   LDW     .D2T2   *+B5[B6],B5       ; |7| 
           NOP             2
           ; BRANCH OCCURS                   ; |7| 
;** --------------------------------------------------------------------------*
           NOP             4
           B       .S2     B5                ; |7| 
           NOP             5

   .sect ".switch"
SW1:  .word L1 ; 0
   .word L9 ; 1
   .word L8 ; 2
   .word L7 ; 3
   .word L6 ; 4
   .word L5 ; 5
   .word L4 ; 6
   .word L3 ; 7
   .sect ".text"
           ; BRANCH OCCURS                   ; |7| 
;** --------------------------------------------------------------------------*
L1:    
;**   -----------------------g1:
;** --------------------------------------------------------------------------*
L2:    
;**   -----------------------g3:
;** 9 -----------------------    *to = *from++;
   .line 7
;----------------------------------------------------------------------
;   9 | case 0:        do {  *to = *from++;                                    
;----------------------------------------------------------------------
           LDW     .D1T1   *A3++,A0          ; |9| 
           NOP             4
           STW     .D1T1   A0,*A4            ; |9| 
;** --------------------------------------------------------------------------*
L3:    
;**   -----------------------g4:
;** 10   -----------------------    *to = *from++;
   .line 8
;----------------------------------------------------------------------
;  10 | case 7:              *to = *from++;                                    
;----------------------------------------------------------------------
           LDW     .D1T1   *A3++,A0          ; |10| 
           NOP             4
           STW     .D1T1   A0,*A4            ; |10| 
;** --------------------------------------------------------------------------*
L4:    
;**   -----------------------g5:
;** 11   -----------------------    *to = *from++;
   .line 9
;----------------------------------------------------------------------
;  11 | case 6:              *to = *from++;                                    
;----------------------------------------------------------------------
           LDW     .D1T1   *A3++,A0          ; |11| 
           NOP             4
           STW     .D1T1   A0,*A4            ; |11| 
;** --------------------------------------------------------------------------*
L5:    
;**   -----------------------g6:
;** 12   -----------------------    *to = *from++;
   .line 10
;----------------------------------------------------------------------
;  12 | case 5:              *to = *from++;                                    
;----------------------------------------------------------------------
           LDW     .D1T1   *A3++,A0          ; |12| 
           NOP             4
           STW     .D1T1   A0,*A4            ; |12| 
;** --------------------------------------------------------------------------*
L6:    
;**   -----------------------g7:
;** 13   -----------------------    *to = *from++;
   .line 11
;----------------------------------------------------------------------
;  13 | case 4:              *to = *from++;                                    
;----------------------------------------------------------------------
           LDW     .D1T1   *A3++,A0          ; |13| 
           NOP             4
           STW     .D1T1   A0,*A4            ; |13| 
;** --------------------------------------------------------------------------*
L7:    
;**   -----------------------g8:
;** 14   -----------------------    *to = *from++;
   .line 12
;----------------------------------------------------------------------
;  14 | case 3:              *to = *from++;                                    
;----------------------------------------------------------------------
           LDW     .D1T1   *A3++,A0          ; |14| 
           NOP             4
           STW     .D1T1   A0,*A4            ; |14| 
;** --------------------------------------------------------------------------*
L8:    
;**   -----------------------g9:
;** 15   -----------------------    *to = *from++;
   .line 13
;----------------------------------------------------------------------
;  15 | case 2:              *to = *from++;                                    
;----------------------------------------------------------------------
           LDW     .D1T1   *A3++,A0          ; |15| 
           NOP             4
           STW     .D1T1   A0,*A4            ; |15| 
;** --------------------------------------------------------------------------*
L9:    
;**   -----------------------g10:
;** 16   -----------------------    *to = *from++;
;** 17   -----------------------    if ( (--n) > 0 ) goto g3;
   .line 14
;----------------------------------------------------------------------
;  16 | case 1:              *to = *from++;                                    
;----------------------------------------------------------------------
           LDW     .D1T1   *A3++,A0          ; |16| 
           NOP             4
           STW     .D1T1   A0,*A4            ; |16| 
   .line 15
;----------------------------------------------------------------------
;  17 | } while (--n > 0);                                                     
;----------------------------------------------------------------------
           CMPGT   .L2     B4,1,B0           ; 
   [ B0]   B       .S1     L2                ; |17| 
           SUB     .L2     B4,1,B4           ; 
           NOP             4
           ; BRANCH OCCURS                   ; |17| 
;** --------------------------------------------------------------------------*
;**   -----------------------g11:
;**   -----------------------    return;
;** --------------------------------------------------------------------------*
L10:    
   .line 17
           B       .S2     B3                ; |19| 
           NOP             5
           ; BRANCH OCCURS                   ; |19| 
   .endfunc 19,000000000h,0

Non-Duff's Version, with Array Indexing

Whereas the plain old loop without pointer arithmetic:

void nonduff( int *to, int *from, int count )
{
   for( int i=0; i<count; i++ )
   {
      to[i] = from[i];
   }
  
}

Non-Duff's Version, with Array Indexing Assembly

optimizes the best. See the 'PIPED LOOP KERNEL' section. All 6 opcodes in the loop are executed in parallel, in one clock cycle.

;******************************************************************************
;* TMS320C6x ANSI C Codegen                                      Version 4.00 *
;* Date/Time created: Thu Jul 20 21:33:14 2006                                *
;******************************************************************************

;******************************************************************************
;* GLOBAL FILE PARAMETERS                                                     *
;*                                                                            *
;*   Architecture      : TMS320C670x                                          *
;*   Optimization      : Enabled at level 2                                   *
;*   Optimizing for    : Speed                                                *
;*                       Based on options: -o2, no -ms                        *
;*   Endian            : Little                                               *
;*   Interrupt Thrshld : Infinite                                             *
;*   Memory Model      : Large                                                *
;*   Calls to RTS      : Far                                                  *
;*   Pipelining        : Enabled                                              *
;*   Speculative Load  : Enabled (Threshold = 400                            ) *
;*   Memory Aliases    : Presume not aliases (optimistic)                     *
;*   Debug Info        : Debug                                                *
;*                                                                            *
;******************************************************************************

FP .set  A15
DP .set  B14
SP .set  B15
   .global  $bss

;  opt6x -q -t -DI -DI -v6700 -i100 -h2 -n2 -s -O2 c:\windows\temp\TI8_2 c:\windows\temp\TI8_4 -w obj 
   .file "nonduff.cpp"
   .sect ".text"
   .global  ___nonduff__FPiT1i
   .sym  ___nonduff__FPiT1i,___nonduff__FPiT1i, 32, 2, 0
   .func 3
;----------------------------------------------------------------------
;   3 | void duff( int *to, int *from, int count )                             
;----------------------------------------------------------------------

;******************************************************************************
;* FUNCTION NAME: _nonduff(int *, int *, int)                                    *
;*                                                                            *
;*   Regs Modified     : A0,A1,A3,B0,B4,B5                                    *
;*   Regs Used         : A0,A1,A3,A4,A6,B0,B3,B4,B5                           *
;*   Local Frame Size  : 0 Args + 0 Auto + 0 Save = 0 byte                    *
;******************************************************************************
___nonduff__FPiT1i:
;** --------------------------------------------------------------------------*
   .sym  _to,4, 20, 17, 32
   .sym  _from,20, 20, 17, 32
   .sym  _count,6, 4, 17, 32
   .sym  _to,4, 20, 4, 32
   .sym  _from,0, 20, 4, 32
   .sym  _count,21, 4, 4, 32
   .sym  L$1,21, 4, 4, 32
   .sym  U$11,4, 20, 4, 32
   .sym  U$9,0, 20, 4, 32
;** 5 -----------------------    if ( count <= 0 ) goto g4;
   .line 2
           MV      .L1X    B4,A0             ; 
   .line 3
;----------------------------------------------------------------------
;   5 | for( int i=0; i<count; i++ )                                           
;----------------------------------------------------------------------
           CMPGT   .L2X    A6,0,B0           ; 
   [!B0]   B       .S1     L4                ; |5| 
           MV      .L2X    A6,B5             ; 
           NOP             4
           ; BRANCH OCCURS                   ; |5| 
;** --------------------------------------------------------------------------*
;**   -----------------------    U$11 = to;
;**   -----------------------    U$9 = from;
;** 7 -----------------------    L$1 = count;
;**   -----------------------    #pragma MUST_ITERATE(1, 2147483647, 1)
;**   -----------------------g3:
;** 7 -----------------------    *U$11++ = *U$9++;
;** 8 -----------------------    if ( --L$1 ) goto g3;
   .line 5
;----------------------------------------------------------------------
;   7 | to[i] = from[i];                                                       
;----------------------------------------------------------------------
           MVK     .S1     0x1,A1            ; init prolog collapse predicate
           NOP             1
;*----------------------------------------------------------------------------*
;*   SOFTWARE PIPELINE INFORMATION
;*
;*      Known Minimum Trip Count         : 1
;*      Known Max Trip Count Factor      : 1
;*      Loop Carried Dependency Bound(^) : 0
;*      Unpartitioned Resource Bound     : 1
;*      Partitioned Resource Bound(*)    : 1
;*      Resource Partition:
;*                                A-side   B-side
;*      .L units                     0        0     
;*      .S units                     1*       0     
;*      .D units                     1*       1*    
;*      .M units                     0        0     
;*      .X cross paths               0        1*    
;*      .T address paths             1*       1*    
;*      Long read paths              0        1*    
;*      Long write paths             0        0     
;*      Logical  ops (.LS)           0        1     (.L or .S unit)
;*      Addition ops (.LSD)          0        1     (.L or .S or .D unit)
;*      Bound(.L .S .LS)             1*       1*    
;*      Bound(.L .S .D .LS .LSD)     1*       1*    
;*
;*      Searching for software pipeline schedule at ...
;*         ii = 1  Schedule found with 7 iterations in parallel
;*      done
;*
;*      Collapsed epilog stages     : 6
;*      Prolog not entirely removed
;*      Collapsed prolog stages     : 1
;*
;*      Minimum required memory pad : 24 bytes
;*
;*      Minimum safe trip count     : 1
;*----------------------------------------------------------------------------*
;*   SINGLE SCHEDULED ITERATION
;*
;*   C23:
;*              LDW     .D1T1   *A0++,A3          ; |7| 
;*   || [ B0]   SUB     .S2     B0,1,B0           ; |8| 
;*      [ B0]   B       .S1     C23               ; |8| 
;*              NOP             3
;*              MV      .L2X    A3,B5             ; Define a twin register
;*              STW     .D2T2   B5,*B4++          ; |7| 
;*              ; BRANCH OCCURS                   ; |8| 
;*----------------------------------------------------------------------------*
L1:    ; PIPED LOOP PROLOG

           SUB     .L2     B5,1,B0           ; 
||         LDW     .D1T1   *A0++,A3          ; (P) |7| 
||         B       .S1     L2                ; (P) |8| 

   [ B0]   SUB     .L2     B0,1,B0           ; (P) @@|8| 
|| [ B0]   B       .S1     L2                ; (P) @|8| 
||         LDW     .D1T1   *A0++,A3          ; (P) @|7| 

   [ B0]   SUB     .L2     B0,1,B0           ; (P) @@@|8| 
||         LDW     .D1T1   *A0++,A3          ; (P) @@|7| 
|| [ B0]   B       .S1     L2                ; (P) @@|8| 

   [ B0]   SUB     .L2     B0,1,B0           ; (P) @@@@|8| 
|| [ B0]   B       .S1     L2                ; (P) @@@|8| 
||         LDW     .D1T1   *A0++,A3          ; (P) @@@|7| 

           MV      .L2X    A4,B4
|| [ B0]   SUB     .S2     B0,1,B0           ; (P) @@@@@|8| 
||         LDW     .D1T1   *A0++,A3          ; (P) @@@@|7| 
|| [ B0]   B       .S1     L2                ; (P) @@@@|8| 

;** --------------------------------------------------------------------------*
L2:    ; PIPED LOOP KERNEL

   [ A1]   SUB     .L1     A1,1,A1           ; 
|| [!A1]   STW     .D2T2   B5,*B4++          ; |7| 
||         MV      .L2X    A3,B5             ; @Define a twin register
|| [ B0]   B       .S1     L2                ; @@@@@|8| 
|| [ B0]   SUB     .S2     B0,1,B0           ; @@@@@@|8| 
||         LDW     .D1T1   *A0++,A3          ; @@@@@@|7| 

;** --------------------------------------------------------------------------*
L3:    ; PIPED LOOP EPILOG
;** --------------------------------------------------------------------------*
;**   -----------------------g4:
;**   -----------------------    return;
;** --------------------------------------------------------------------------*
L4:    
           NOP             2
   .line 8
           B       .S2     B3                ; |10| 
           NOP             5
           ; BRANCH OCCURS                   ; |10| 
   .endfunc 10,000000000h,0