/*
 * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */

/*
 * Description
 *
 *   library function for memcpy where length bytes are copied from
 *   ptr_in to ptr_out. ptr_out is returned unchanged.
 *   Allows any combination of alignment on input and output pointers
 *   and length from 0 to 2^32-1
 *
 * Restrictions
 *   The arrays should not overlap; the program will produce undefined output
 *   if they do.
 *   For blocks of less than 24 bytes a byte-by-byte copy is performed. When
 *   the input, output and length are all multiples of 8 bytes, a dword copy
 *   is performed for lengths up to 96 bytes.
 * History
 *
 *   DJH  5/15/09 Initial version 1.0
 *   DJH  6/ 1/09 Version 1.1 modified ABI to include R16-R19
 *   DJH  7/12/09 Version 1.2 optimized codesize down to 760 was 840
 *   DJH 10/14/09 Version 1.3 added special loop for aligned case, was
 *                            overreading, bloated codesize back up to 892
 *   DJH  4/20/10 Version 1.4 fixed Ldword_loop_epilog loop to prevent loads
 *                            occurring if only 1 left outstanding, fixes bug
 *                            # 3888, corrected for all alignments. Peeled off
 *                            1 32byte chunk from kernel loop and extended 8byte
 *                            loop at end to solve all combinations and prevent
 *                            over read.  Fixed Ldword_loop_prolog to prevent
 *                            overread for blocks less than 48bytes. Reduced
 *                            codesize to 752 bytes
 *   DJH  4/21/10 version 1.5 1.4 fix broke code for input block ends not
 *                            aligned to dword boundaries, underwriting by 1
 *                            byte, added detection for this and fixed. A
 *                            little bloat.
 *   DJH  4/23/10 version 1.6 corrected stack error, R20 was not being restored
 *                            always, fixed the error of R20 being modified
 *                            before it was being saved
 * Natural C model
 * ===============
 * void * memcpy(char * ptr_out, char * ptr_in, int length) {
 *   int i;
 *   if(length) for(i=0; i < length; i++) { ptr_out[i] = ptr_in[i]; }
 *   return(ptr_out);
 * }
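 *
 * Usage sketch (illustrative only, relying on standard memcpy semantics;
 * the buffer names are assumptions, not taken from this file):
 *   char dst[16];
 *   char *ret = memcpy(dst, "hexagon", 8);  // copies 8 bytes, ret == dst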
 *
 * Optimized memcpy function
 * =========================
 * void * memcpy(char * ptr_out, char * ptr_in, int len) {
 *   int i, prolog, kernel, epilog, epilogdws, mask;
 *   u8 offset;
 *   s64 data0, dataF8, data70;
 *
 *   s64 * ptr8_in;
 *   s64 * ptr8_out;
 *   s32 * ptr4;
 *   s16 * ptr2;
 *
 *   offset = ((int) ptr_in) & 7;
 *   ptr8_in = (s64 *) &ptr_in[-offset];   //read in the aligned pointers
 *
 *   data70 = *ptr8_in++;
 *   dataF8 = *ptr8_in++;
 *
 *   data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *
 *   prolog = 32 - ((int) ptr_out);
 *   mask  = 0x7fffffff >> HEXAGON_R_cl0_R(len);
 *   prolog = prolog & mask;
 *   kernel = len - prolog;
 *   epilog = kernel & 0x1F;
 *   kernel = kernel>>5;
 *
 *   if (prolog & 1) { ptr_out[0] = (u8) data0; data0 >>= 8; ptr_out += 1;}
 *   ptr2 = (s16 *) &ptr_out[0];
 *   if (prolog & 2) { ptr2[0] = (u16) data0;  data0 >>= 16; ptr_out += 2;}
 *   ptr4 = (s32 *) &ptr_out[0];
 *   if (prolog & 4) { ptr4[0] = (u32) data0;  data0 >>= 32; ptr_out += 4;}
 *
 *   offset = offset + (prolog & 7);
 *   if (offset >= 8) {
 *     data70 = dataF8;
 *     dataF8 = *ptr8_in++;
 *   }
 *   offset = offset & 0x7;
 *
 *   prolog = prolog >> 3;
 *   if (prolog) for (i=0; i < prolog; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = dataF8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   if(kernel) { kernel -= 1; epilog += 32; }
 *   if(kernel) for(i=0; i < kernel; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       dataF8 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   epilogdws = epilog >> 3;
 *   if (epilogdws) for (i=0; i < epilogdws; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = dataF8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *
 *   ptr4 = (s32 *) &ptr_out[0];
 *   if (epilog & 4) { ptr4[0] = (u32) data0; data0 >>= 32; ptr_out += 4;}
 *   ptr2 = (s16 *) &ptr_out[0];
 *   if (epilog & 2) { ptr2[0] = (u16) data0; data0 >>= 16; ptr_out += 2;}
 *   if (epilog & 1) { *ptr_out++ = (u8) data0; }
 *
 *   return(ptr_out - len);
 * }
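 *
 * A worked example of the prolog/kernel/epilog split, using assumed values
 * that are not taken from the original source:
 *   ptr_out = 0x1004, ptr_in = 0x2003, len = 200
 *   offset = 0x2003 & 7 = 3
 *   prolog = (32 - 0x1004) & (0x7fffffff >> cl0(200)) = 28
 *            (28 bytes bring ptr_out up to the 32 byte boundary 0x1020;
 *             for short copies the mask caps the prolog instead)
 *   kernel = (200 - 28) >> 5 = 5 chunks of 32 bytes, epilog = 172 & 0x1F = 12
 *   the "if(kernel)" peel then folds one chunk into the epilog, giving
 *   kernel = 4, epilog = 44, and 28 + 4*32 + 44 = 200 bytes total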
 *
 * Codesize : 784 bytes
 */
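
/*
 * The non-aligned copy paths below lean on the valignb instruction
 * (HEXAGON_P_valignb_PPp in the model above) to build each 8 byte store
 * from two adjacent source dwords.  A minimal plain-C sketch of that step,
 * assuming little-endian byte order and an offset already reduced to 0..7
 * (the helper name is ours, not part of this file):
 *
 *   static inline u64 valignb_model(u64 hi, u64 lo, unsigned int off)
 *   {
 *       if (off == 0)
 *           return lo;                      // already aligned
 *       return (lo >> (8 * off)) |          // tail bytes of the lower dword
 *              (hi << (64 - 8 * off));      // head bytes of the upper dword
 *   }
 */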


#define ptr_out     R0  /*  destination pointer  */
#define ptr_in      R1  /*  source pointer  */
#define len     R2  /*  length of copy in bytes  */

#define data70      R13:12  /*  lo 8 bytes of non-aligned transfer  */
#define dataF8      R11:10  /*  hi 8 bytes of non-aligned transfer  */
#define ldata0      R7:6    /*  even 8 bytes chunks  */
#define ldata1      R25:24  /*  odd 8 bytes chunks  */
#define data1       R7  /*  lower 8 bytes of ldata1  */
#define data0       R6  /*  lower 8 bytes of ldata0  */

#define ifbyte      p0  /*  if transfer has bytes in epilog/prolog  */
#define ifhword     p0  /*  if transfer has shorts in epilog/prolog  */
#define ifword      p0  /*  if transfer has words in epilog/prolog  */
#define noprolog    p0  /*  no prolog, xfer starts at 32byte  */
#define nokernel    p1  /*  no 32byte multiple block in the transfer  */
#define noepilog    p0  /*  no epilog, xfer ends on 32byte boundary  */
#define align       p2  /*  alignment of input rel to 8byte boundary  */
#define kernel1     p0  /*  kernel count == 1  */

#define dalign      R25 /*  rel alignment of input to output data  */
#define star3       R16 /*  number bytes in prolog - dwords  */
#define rest        R8  /*  length - prolog bytes  */
#define back        R7  /*  nr bytes > dword boundary in src block  */
#define epilog      R3  /*  bytes in epilog  */
#define inc     R15:14  /*  inc kernel by -1 and prefetch ptr by 32  */
#define kernel      R4  /*  number of 32byte chunks in kernel  */
#define ptr_in_p_128    R5  /*  pointer for prefetch of input data  */
#define mask        R8  /*  mask used to determine prolog size  */
#define shift       R8  /*  used to work a shifter to extract bytes  */
#define shift2      R5  /*  in epilog to work shifter to extract bytes  */
#define prolog      R15 /*  bytes in  prolog  */
#define epilogdws   R15 /*  number dwords in epilog  */
#define shiftb      R14 /*  used to extract bytes  */
#define offset      R9  /*  same as align in reg  */
#define ptr_out_p_32    R17 /*  pointer to output dczero  */
#define align888    R14 /*  if simple dword loop can be used  */
#define len8        R9  /*  number of dwords in length  */
#define over        R20 /*  nr of bytes > last inp buf dword boundary */

#define ptr_in_p_128kernel  R5:4    /*  packed fetch pointer & kernel cnt */

    .section .text
    .p2align 4
        .global memcpy
        .type memcpy, @function
memcpy:
{
    p2 = cmp.eq(len, #0);       /*  =0 */
    align888 = or(ptr_in, ptr_out); /*  %8 < 97 */
    p0 = cmp.gtu(len, #23);     /*  %1, <24 */
    p1 = cmp.eq(ptr_in, ptr_out);   /*  attempt to overwrite self */
}
{
    p1 = or(p2, p1);
    p3 = cmp.gtu(len, #95);     /*  %8 < 97 */
    align888 = or(align888, len);   /*  %8 < 97 */
    len8 = lsr(len, #3);        /*  %8 < 97 */
}
{
    dcfetch(ptr_in);        /*  zero/ptrin=ptrout causes fetch */
    p2 = bitsclr(align888, #7); /*  %8 < 97  */
    if(p1) jumpr r31;       /*  =0  */
}
{
    p2 = and(p2,!p3);           /*  %8 < 97  */
    if (p2.new) len = add(len, #-8);    /*  %8 < 97  */
    if (p2.new) jump:NT .Ldwordaligned;     /*  %8 < 97  */
}
{
    if(!p0) jump .Lbytes23orless;   /*  %1, <24  */
    mask.l = #LO(0x7fffffff);
    /*  all bytes before line multiples of data  */
    prolog = sub(#0, ptr_out);
}
{
    /*  save fp/lr on stack, allocate 24 bytes for the register saves  */
    allocframe(#24);
    mask.h = #HI(0x7fffffff);
    ptr_in_p_128 = add(ptr_in, #32);
    back = cl0(len);
}
{
    memd(sp+#0) = R17:16;       /*  save r16,r17 on stack  */
    r31.l = #LO(.Lmemcpy_return);   /*  set up final return pointer  */
    prolog &= lsr(mask, back);
    offset = and(ptr_in, #7);
}
{
    memd(sp+#8) = R25:24;       /*  save r25,r24 on stack  */
    dalign = sub(ptr_out, ptr_in);
    r31.h = #HI(.Lmemcpy_return);   /*  set up final return pointer  */
}
{
    /*  see if the input buffer end is aligned  */
    over = add(len, ptr_in);
    back = add(len, offset);
    memd(sp+#16) = R21:20;      /*  save r20,r21 on stack  */
}
{
    noprolog = bitsclr(prolog, #7);
    prolog = and(prolog, #31);
    dcfetch(ptr_in_p_128);
    ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
    kernel = sub(len, prolog);
    shift = asl(prolog, #3);
    star3 = and(prolog, #7);
    ptr_in = and(ptr_in, #-8);
}
{
    prolog = lsr(prolog, #3);
    epilog = and(kernel, #31);
    ptr_out_p_32 = add(ptr_out, prolog);
    over = and(over, #7);
}
{
    p3 = cmp.gtu(back, #8);
    kernel = lsr(kernel, #5);
    dcfetch(ptr_in_p_128);
    ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
    p1 = cmp.eq(prolog, #0);
    if(!p1.new) prolog = add(prolog, #1);
    dcfetch(ptr_in_p_128);  /*  reserve the line 64 bytes on  */
    ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
    nokernel = cmp.eq(kernel,#0);
    dcfetch(ptr_in_p_128);  /*  reserve the line 64 bytes on  */
    ptr_in_p_128 = add(ptr_in_p_128, #32);
    shiftb = and(shift, #8);
}
{
    dcfetch(ptr_in_p_128);      /*  reserve the line 64 bytes on  */
    ptr_in_p_128 = add(ptr_in_p_128, #32);
    if(nokernel) jump .Lskip64;
    p2 = cmp.eq(kernel, #1);    /*  skip over if kernel == 0  */
}
{
    dczeroa(ptr_out_p_32);
    /*  don't advance pointer  */
    if(!p2) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
    dalign = and(dalign, #31);
    dczeroa(ptr_out_p_32);
}
.Lskip64:
{
    data70 = memd(ptr_in++#16);
    if(p3) dataF8 = memd(ptr_in+#8);
    if(noprolog) jump .Lnoprolog32;
    align = offset;
}
/*  up to initial 7 bytes  */
{
    ldata0 = valignb(dataF8, data70, align);
    ifbyte = tstbit(shift,#3);
    offset = add(offset, star3);
}
{
    if(ifbyte) memb(ptr_out++#1) = data0;
    ldata0 = lsr(ldata0, shiftb);
    shiftb = and(shift, #16);
    ifhword = tstbit(shift,#4);
}
{
    if(ifhword) memh(ptr_out++#2) = data0;
    ldata0 = lsr(ldata0, shiftb);
    ifword = tstbit(shift,#5);
    p2 = cmp.gtu(offset, #7);
}
{
    if(ifword) memw(ptr_out++#4) = data0;
    if(p2) data70 = dataF8;
    if(p2) dataF8 = memd(ptr_in++#8);   /*  another 8 bytes  */
    align = offset;
}
.Lnoprolog32:
{
    p3 = sp1loop0(.Ldword_loop_prolog, prolog)
    rest = sub(len, star3); /*  what's left after the loop  */
    p0 = cmp.gt(over, #0);
}
    if(p0) rest = add(rest, #16);
.Ldword_loop_prolog:
{
    if(p3) memd(ptr_out++#8) = ldata0;
    ldata0 = valignb(dataF8, data70, align);
    p0 = cmp.gt(rest, #16);
}
{
    data70 = dataF8;
    if(p0) dataF8 = memd(ptr_in++#8);
    rest = add(rest, #-8);
}:endloop0
.Lkernel:
{
    /*  kernel is at least 32 bytes  */
    p3 = cmp.gtu(kernel, #0);
    /*  last iteration: remove edge effects  */
    if(p3.new) kernel = add(kernel, #-1);
    /*  dealt with in last dword loop  */
    if(p3.new) epilog = add(epilog, #32);
}
{
    nokernel = cmp.eq(kernel, #0);      /*  after adjustment, recheck */
    if(nokernel.new) jump:NT .Lepilog;  /*  likely not taken  */
    inc = combine(#32, #-1);
    p3 = cmp.gtu(dalign, #24);
}
{
    if(p3) jump .Lodd_alignment;
}
{
    loop0(.Loword_loop_25to31, kernel);
    kernel1 = cmp.gtu(kernel, #1);
    rest = kernel;
}
    .falign
.Loword_loop_25to31:
{
    dcfetch(ptr_in_p_128);  /*  prefetch 4 lines ahead  */
    if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
    dczeroa(ptr_out_p_32);  /*  reserve the next 32 bytes in cache  */
    p3 = cmp.eq(kernel, rest);
}
{
    /*  kernel -= 1  */
    ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
    /*  kill write on first iteration  */
    if(!p3) memd(ptr_out++#8) = ldata1;
    ldata1 = valignb(dataF8, data70, align);
    data70 = memd(ptr_in++#8);
}
{
    memd(ptr_out++#8) = ldata0;
    ldata0 = valignb(data70, dataF8, align);
    dataF8 = memd(ptr_in++#8);
}
{
    memd(ptr_out++#8) = ldata1;
    ldata1 = valignb(dataF8, data70, align);
    data70 = memd(ptr_in++#8);
}
{
    memd(ptr_out++#8) = ldata0;
    ldata0 = valignb(data70, dataF8, align);
    dataF8 = memd(ptr_in++#8);
    kernel1 = cmp.gtu(kernel, #1);
}:endloop0
{
    memd(ptr_out++#8) = ldata1;
    jump .Lepilog;
}
.Lodd_alignment:
{
    loop0(.Loword_loop_00to24, kernel);
    kernel1 = cmp.gtu(kernel, #1);
    rest = add(kernel, #-1);
}
    .falign
.Loword_loop_00to24:
{
    dcfetch(ptr_in_p_128);  /*  prefetch 4 lines ahead  */
    ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
    if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
    dczeroa(ptr_out_p_32);  /*  reserve the next 32 bytes in cache  */
}
{
    memd(ptr_out++#8) = ldata0;
    ldata0 = valignb(dataF8, data70, align);
    data70 = memd(ptr_in++#8);
}
{
    memd(ptr_out++#8) = ldata0;
    ldata0 = valignb(data70, dataF8, align);
    dataF8 = memd(ptr_in++#8);
}
{
    memd(ptr_out++#8) = ldata0;
    ldata0 = valignb(dataF8, data70, align);
    data70 = memd(ptr_in++#8);
}
{
    memd(ptr_out++#8) = ldata0;
    ldata0 = valignb(data70, dataF8, align);
    dataF8 = memd(ptr_in++#8);
    kernel1 = cmp.gtu(kernel, #1);
}:endloop0
.Lepilog:
{
    noepilog = cmp.eq(epilog,#0);
    epilogdws = lsr(epilog, #3);
    kernel = and(epilog, #7);
}
{
    if(noepilog) jumpr r31;
    if(noepilog) ptr_out = sub(ptr_out, len);
    p3 = cmp.eq(epilogdws, #0);
    shift2 = asl(epilog, #3);
}
{
    shiftb = and(shift2, #32);
    ifword = tstbit(epilog,#2);
    if(p3) jump .Lepilog60;
    if(!p3) epilog = add(epilog, #-16);
}
{
    loop0(.Ldword_loop_epilog, epilogdws);
    /*  stop criterion is the low bits unless they are 0, then it is 8  */
    p3 = cmp.eq(kernel, #0);
    if(p3.new) kernel= #8;
    p1 = cmp.gt(over, #0);
}
    /*  if not aligned to end of buffer execute 1 more iteration  */
    if(p1) kernel= #0;
.Ldword_loop_epilog:
{
    memd(ptr_out++#8) = ldata0;
    ldata0 = valignb(dataF8, data70, align);
    p3 = cmp.gt(epilog, kernel);
}
{
    data70 = dataF8;
    if(p3) dataF8 = memd(ptr_in++#8);
    epilog = add(epilog, #-8);
}:endloop0
/* copy last 7 bytes */
.Lepilog60:
{
    if(ifword) memw(ptr_out++#4) = data0;
    ldata0 = lsr(ldata0, shiftb);
    ifhword = tstbit(epilog,#1);
    shiftb = and(shift2, #16);
}
{
    if(ifhword) memh(ptr_out++#2) = data0;
    ldata0 = lsr(ldata0, shiftb);
    ifbyte = tstbit(epilog,#0);
    if(ifbyte.new) len = add(len, #-1);
}
{
    if(ifbyte) memb(ptr_out) = data0;
    ptr_out = sub(ptr_out, len);    /*  return dest pointer  */
        jumpr r31;
}
/*  do byte copy for small n  */
.Lbytes23orless:
{
    p3 = sp1loop0(.Lbyte_copy, len);
    len = add(len, #-1);
}
.Lbyte_copy:
{
    data0 = memb(ptr_in++#1);
    if(p3) memb(ptr_out++#1) = data0;
}:endloop0
{
    memb(ptr_out) = data0;
    ptr_out = sub(ptr_out, len);
    jumpr r31;
}
/*  do dword copies for aligned in, out and length  */
.Ldwordaligned:
{
    p3 = sp1loop0(.Ldword_copy, len8);
}
.Ldword_copy:
{
    if(p3) memd(ptr_out++#8) = ldata0;
    ldata0 = memd(ptr_in++#8);
}:endloop0
{
    memd(ptr_out) = ldata0;
    ptr_out = sub(ptr_out, len);
    jumpr r31;  /*  return to function caller  */
}
.Lmemcpy_return:
    r21:20 = memd(sp+#16);  /*  restore r20+r21  */
{
    r25:24 = memd(sp+#8);   /*  restore r24+r25  */
    r17:16 = memd(sp+#0);   /*  restore r16+r17  */
}
    deallocframe;   /*  restore r31/fp and release the stack frame  */
    jumpr r31