/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Quick'n'dirty IP checksum ...
 *
 * Copyright (C) 1998, 1999 Ralf Baechle
 * Copyright (C) 1999 Silicon Graphics, Inc.
 * Copyright (C) 2007  Maciej W. Rozycki
 * Copyright (C) 2014 Imagination Technologies Ltd.
 */
#include <linux/errno.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/export.h>
#include <asm/regdef.h>

#ifdef CONFIG_64BIT
/*
 * As we share the code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register definitions from
 * the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0  $8
#define t1  $9
#define t2  $10
#define t3  $11
#define t4  $12
#define t5  $13
#define t6  $14
#define t7  $15

#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOAD32 lwu
#define ADD    daddu
#define NBYTES 8

#else

#define LOAD   lw
#define LOAD32 lw
#define ADD    addu
#define NBYTES 4

#endif /* USE_DOUBLE */

#define UNIT(unit)  ((unit)*NBYTES)

#define ADDC(sum,reg)                       \
    .set    push;                       \
    .set    noat;                       \
    ADD sum, reg;                   \
    sltu    v1, sum, reg;                   \
    ADD sum, v1;                    \
    .set    pop

#define ADDC32(sum,reg)                     \
    .set    push;                       \
    .set    noat;                       \
    addu    sum, reg;                   \
    sltu    v1, sum, reg;                   \
    addu    sum, v1;                    \
    .set    pop

#define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)    \
    LOAD    _t0, (offset + UNIT(0))(src);           \
    LOAD    _t1, (offset + UNIT(1))(src);           \
    LOAD    _t2, (offset + UNIT(2))(src);           \
    LOAD    _t3, (offset + UNIT(3))(src);           \
    ADDC(_t0, _t1);                     \
    ADDC(_t2, _t3);                     \
    ADDC(sum, _t0);                     \
    ADDC(sum, _t2)

#ifdef USE_DOUBLE
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3) \
    CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
#else
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3) \
    CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);   \
    CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
#endif
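
/*
 * Either way, one CSUM_BIGCHUNK invocation consumes 0x20 bytes: four
 * 8-byte units when USE_DOUBLE is defined, otherwise CSUM_BIGCHUNK1 is
 * applied twice (at offset and offset + 0x10) over four 4-byte units
 * each.
 */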

/*
 * a0: source address
 * a1: length of the area to checksum
 * a2: partial checksum
 */
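
/*
 * Viewed from C this is roughly
 *      unsigned int csum_partial(const void *buf, int len, unsigned int sum)
 * (the kernel declares it with __wsum); the running 32-bit partial
 * checksum is returned in v0, neither folded to 16 bits nor inverted.
 */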

#define src a0
#define sum v0

    .text
    .set    noreorder
    .align  5
LEAF(csum_partial)
EXPORT_SYMBOL(csum_partial)
    move    sum, zero
    move    t7, zero

    sltiu   t8, a1, 0x8
    bnez    t8, .Lsmall_csumcpy     /* < 8 bytes to copy */
     move   t2, a1

    andi    t7, src, 0x1            /* odd buffer? */

.Lhword_align:
    beqz    t7, .Lword_align
     andi   t8, src, 0x2

    lbu t0, (src)
    LONG_SUBU   a1, a1, 0x1
#ifdef __MIPSEL__
    sll t0, t0, 8
#endif
    ADDC(sum, t0)
    PTR_ADDU    src, src, 0x1
    andi    t8, src, 0x2

.Lword_align:
    beqz    t8, .Ldword_align
     sltiu  t8, a1, 56

    lhu t0, (src)
    LONG_SUBU   a1, a1, 0x2
    ADDC(sum, t0)
    sltiu   t8, a1, 56
    PTR_ADDU    src, src, 0x2

.Ldword_align:
    bnez    t8, .Ldo_end_words
     move   t8, a1

    andi    t8, src, 0x4
    beqz    t8, .Lqword_align
     andi   t8, src, 0x8

    LOAD32  t0, 0x00(src)
    LONG_SUBU   a1, a1, 0x4
    ADDC(sum, t0)
    PTR_ADDU    src, src, 0x4
    andi    t8, src, 0x8

.Lqword_align:
    beqz    t8, .Loword_align
     andi   t8, src, 0x10

#ifdef USE_DOUBLE
    ld  t0, 0x00(src)
    LONG_SUBU   a1, a1, 0x8
    ADDC(sum, t0)
#else
    lw  t0, 0x00(src)
    lw  t1, 0x04(src)
    LONG_SUBU   a1, a1, 0x8
    ADDC(sum, t0)
    ADDC(sum, t1)
#endif
    PTR_ADDU    src, src, 0x8
    andi    t8, src, 0x10

.Loword_align:
    beqz    t8, .Lbegin_movement
     LONG_SRL   t8, a1, 0x7

#ifdef USE_DOUBLE
    ld  t0, 0x00(src)
    ld  t1, 0x08(src)
    ADDC(sum, t0)
    ADDC(sum, t1)
#else
    CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)
#endif
    LONG_SUBU   a1, a1, 0x10
    PTR_ADDU    src, src, 0x10
    LONG_SRL    t8, a1, 0x7

.Lbegin_movement:
    beqz    t8, 1f
     andi   t2, a1, 0x40

.Lmove_128bytes:
    CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
    CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
    CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
    CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
    LONG_SUBU   t8, t8, 0x01
    .set    reorder             /* DADDI_WAR */
    PTR_ADDU    src, src, 0x80
    bnez    t8, .Lmove_128bytes
    .set    noreorder

1:
    beqz    t2, 1f
     andi   t2, a1, 0x20

.Lmove_64bytes:
    CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
    CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
    PTR_ADDU    src, src, 0x40

1:
    beqz    t2, .Ldo_end_words
     andi   t8, a1, 0x1c

.Lmove_32bytes:
    CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
    andi    t8, a1, 0x1c
    PTR_ADDU    src, src, 0x20

.Ldo_end_words:
    beqz    t8, .Lsmall_csumcpy
     andi   t2, a1, 0x3
    LONG_SRL    t8, t8, 0x2

.Lend_words:
    LOAD32  t0, (src)
    LONG_SUBU   t8, t8, 0x1
    ADDC(sum, t0)
    .set    reorder             /* DADDI_WAR */
    PTR_ADDU    src, src, 0x4
    bnez    t8, .Lend_words
    .set    noreorder

/* unknown src alignment and < 8 bytes to go  */
.Lsmall_csumcpy:
    move    a1, t2

    andi    t0, a1, 4
    beqz    t0, 1f
     andi   t0, a1, 2

    /* Still a full word to go  */
    ulw t1, (src)
    PTR_ADDIU   src, 4
#ifdef USE_DOUBLE
    dsll    t1, t1, 32          /* clear lower 32bit */
#endif
    ADDC(sum, t1)

1:  move    t1, zero
    beqz    t0, 1f
     andi   t0, a1, 1

    /* Still a halfword to go  */
    ulhu    t1, (src)
    PTR_ADDIU   src, 2

1:  beqz    t0, 1f
     sll    t1, t1, 16

    lbu t2, (src)
     nop

#ifdef __MIPSEB__
    sll t2, t2, 8
#endif
    or  t1, t2

1:  ADDC(sum, t1)

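/*
 * On 64-bit builds the accumulator is first folded from 64 to 32 bits:
 * the low half is shifted up into the high half, the two halves are
 * added, and the carry of that addition is wrapped back in, leaving the
 * 32-bit ones'-complement sum.
 */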
    /* fold checksum */
#ifdef USE_DOUBLE
    dsll32  v1, sum, 0
    daddu   sum, v1
    sltu    v1, sum, v1
    dsra32  sum, sum, 0
    addu    sum, v1
#endif

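/*
 * If the buffer began on an odd address (recorded in t7 above), every
 * byte was accumulated one lane off, so the sum comes out byte-swapped
 * within each halfword.  On R2+ cores a conditional wsbh/movn pair
 * undoes this; otherwise the swap is done by hand with shifts and
 * masks below.
 */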
    /* odd buffer alignment? */
#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR5) || \
    defined(CONFIG_CPU_LOONGSON64)
    .set    push
    .set    arch=mips32r2
    wsbh    v1, sum
    movn    sum, v1, t7
    .set    pop
#else
    beqz    t7, 1f          /* odd buffer alignment? */
     lui    v1, 0x00ff
    addu    v1, 0x00ff
    and t0, sum, v1
    sll t0, t0, 8
    srl sum, sum, 8
    and sum, sum, v1
    or  sum, sum, t0
1:
#endif
    .set    reorder
    /* Add the passed partial csum.  */
    ADDC32(sum, a2)
    jr  ra
    .set    noreorder
    END(csum_partial)


/*
 * checksum and copy routines based on memcpy.S
 *
 *  csum_partial_copy_nocheck(src, dst, len)
 *  __csum_partial_copy_kernel(src, dst, len)
 *
 * See "Spec" in memcpy.S for details.  Unlike __copy_user, all
 * functions in this file use the standard calling convention.
 */
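
/*
 * Viewed from C these are roughly
 *      unsigned int csum_partial_copy_nocheck(const void *src, void *dst, int len)
 * with the user-copy variants taking the same three arguments; they
 * return the 32-bit partial checksum, or 0 if a fault is taken on the
 * user access (see .L_exc below).
 */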

#define src a0
#define dst a1
#define len a2
#define sum v0
#define odd t8

/*
 * All exception handlers simply return 0.
 */

/* Instruction type */
#define LD_INSN 1
#define ST_INSN 2
#define LEGACY_MODE 1
#define EVA_MODE    2
#define USEROP   1
#define KERNELOP 2

/*
 * Wrapper to add an entry in the exception table
 * in case the insn causes a memory exception.
 * Faults are routed to the common handler .L_exc.
 * Arguments:
 * insn    : Load/store instruction
 * type    : Instruction type
 * reg     : Register
 * addr    : Address
 */
#define EXC(insn, type, reg, addr)      \
    .if \mode == LEGACY_MODE;       \
9:      insn reg, addr;         \
        .section __ex_table,"a";    \
        PTR_WD  9b, .L_exc;     \
        .previous;          \
    /* This is enabled in EVA mode */   \
    .else;                  \
        /* If loading from user or storing to user */   \
        .if ((\from == USEROP) && (type == LD_INSN)) || \
            ((\to == USEROP) && (type == ST_INSN)); \
9:          __BUILD_EVA_INSN(insn##e, reg, addr);   \
            .section __ex_table,"a";        \
            PTR_WD  9b, .L_exc;         \
            .previous;              \
        .else;                      \
            /* EVA without exception */     \
            insn reg, addr;             \
        .endif;                     \
    .endif
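
/*
 * In LEGACY_MODE, for example, LOAD(t0, 0(src)) below expands to
 * roughly
 *
 *      9:  lw  t0, 0(src)          (ld when USE_DOUBLE)
 *          .section __ex_table,"a"
 *          PTR_WD  9b, .L_exc
 *          .previous
 *
 * so a faulting access diverts to .L_exc, which returns 0.
 */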

#undef LOAD

#ifdef USE_DOUBLE

#define LOADK   ld /* No exception */
#define LOAD(reg, addr)     EXC(ld, LD_INSN, reg, addr)
#define LOADBU(reg, addr)   EXC(lbu, LD_INSN, reg, addr)
#define LOADL(reg, addr)    EXC(ldl, LD_INSN, reg, addr)
#define LOADR(reg, addr)    EXC(ldr, LD_INSN, reg, addr)
#define STOREB(reg, addr)   EXC(sb, ST_INSN, reg, addr)
#define STOREL(reg, addr)   EXC(sdl, ST_INSN, reg, addr)
#define STORER(reg, addr)   EXC(sdr, ST_INSN, reg, addr)
#define STORE(reg, addr)    EXC(sd, ST_INSN, reg, addr)
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

#else

#define LOADK   lw /* No exception */
#define LOAD(reg, addr)     EXC(lw, LD_INSN, reg, addr)
#define LOADBU(reg, addr)   EXC(lbu, LD_INSN, reg, addr)
#define LOADL(reg, addr)    EXC(lwl, LD_INSN, reg, addr)
#define LOADR(reg, addr)    EXC(lwr, LD_INSN, reg, addr)
#define STOREB(reg, addr)   EXC(sb, ST_INSN, reg, addr)
#define STOREL(reg, addr)   EXC(swl, ST_INSN, reg, addr)
#define STORER(reg, addr)   EXC(swr, ST_INSN, reg, addr)
#define STORE(reg, addr)    EXC(sw, ST_INSN, reg, addr)
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#define SHIFT_DISCARD_REVERT SRLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#define SHIFT_DISCARD_REVERT SLLV
#endif
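
/*
 * LDFIRST/LDREST (and STFIRST/STREST) pair the unaligned MIPS
 * load/store instructions: lwl/lwr (ldl/ldr with USE_DOUBLE) together
 * access one possibly-unaligned word, and the endianness selection
 * above decides which of the pair covers the first bytes of the word
 * and which covers the rest.
 */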

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)

#define ADDRMASK (NBYTES-1)

#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
    .set    noat
#else
    .set    at=v1
#endif

    .macro __BUILD_CSUM_PARTIAL_COPY_USER mode, from, to

    li  sum, -1
    move    odd, zero
    /*
     * Note: dst & src may be unaligned, len may be 0
     * Temps
     */
    /*
     * The "issue break"s below are very approximate.
     * Issue delays for dcache fills will perturb the schedule, as will
     * load queue full replay traps, etc.
     *
     * If len < NBYTES use byte operations.
     */
    sltu    t2, len, NBYTES
    and t1, dst, ADDRMASK
    bnez    t2, .Lcopy_bytes_checklen\@
     and    t0, src, ADDRMASK
    andi    odd, dst, 0x1           /* odd buffer? */
    bnez    t1, .Ldst_unaligned\@
     nop
    bnez    t0, .Lsrc_unaligned_dst_aligned\@
    /*
     * use delay slot for fall-through
     * src and dst are aligned; need to compute rem
     */
.Lboth_aligned\@:
     SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
    beqz    t0, .Lcleanup_both_aligned\@ # len < 8*NBYTES
     nop
    SUB len, 8*NBYTES       # subtract here for bgez loop
    .align  4
1:
    LOAD(t0, UNIT(0)(src))
    LOAD(t1, UNIT(1)(src))
    LOAD(t2, UNIT(2)(src))
    LOAD(t3, UNIT(3)(src))
    LOAD(t4, UNIT(4)(src))
    LOAD(t5, UNIT(5)(src))
    LOAD(t6, UNIT(6)(src))
    LOAD(t7, UNIT(7)(src))
    SUB len, len, 8*NBYTES
    ADD src, src, 8*NBYTES
    STORE(t0, UNIT(0)(dst))
    ADDC(t0, t1)
    STORE(t1, UNIT(1)(dst))
    ADDC(sum, t0)
    STORE(t2, UNIT(2)(dst))
    ADDC(t2, t3)
    STORE(t3, UNIT(3)(dst))
    ADDC(sum, t2)
    STORE(t4, UNIT(4)(dst))
    ADDC(t4, t5)
    STORE(t5, UNIT(5)(dst))
    ADDC(sum, t4)
    STORE(t6, UNIT(6)(dst))
    ADDC(t6, t7)
    STORE(t7, UNIT(7)(dst))
    ADDC(sum, t6)
    .set    reorder             /* DADDI_WAR */
    ADD dst, dst, 8*NBYTES
    bgez    len, 1b
    .set    noreorder
    ADD len, 8*NBYTES       # revert len (see above)

    /*
     * len == the number of bytes left to copy < 8*NBYTES
     */
.Lcleanup_both_aligned\@:
#define rem t7
    beqz    len, .Ldone\@
     sltu   t0, len, 4*NBYTES
    bnez    t0, .Lless_than_4units\@
     and    rem, len, (NBYTES-1)    # rem = len % NBYTES
    /*
     * len >= 4*NBYTES
     */
    LOAD(t0, UNIT(0)(src))
    LOAD(t1, UNIT(1)(src))
    LOAD(t2, UNIT(2)(src))
    LOAD(t3, UNIT(3)(src))
    SUB len, len, 4*NBYTES
    ADD src, src, 4*NBYTES
    STORE(t0, UNIT(0)(dst))
    ADDC(t0, t1)
    STORE(t1, UNIT(1)(dst))
    ADDC(sum, t0)
    STORE(t2, UNIT(2)(dst))
    ADDC(t2, t3)
    STORE(t3, UNIT(3)(dst))
    ADDC(sum, t2)
    .set    reorder             /* DADDI_WAR */
    ADD dst, dst, 4*NBYTES
    beqz    len, .Ldone\@
    .set    noreorder
.Lless_than_4units\@:
    /*
     * rem = len % NBYTES
     */
    beq rem, len, .Lcopy_bytes\@
     nop
1:
    LOAD(t0, 0(src))
    ADD src, src, NBYTES
    SUB len, len, NBYTES
    STORE(t0, 0(dst))
    ADDC(sum, t0)
    .set    reorder             /* DADDI_WAR */
    ADD dst, dst, NBYTES
    bne rem, len, 1b
    .set    noreorder

    /*
     * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
     * A loop would do only a byte at a time with possible branch
     * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
     * because can't assume read-access to dst.  Instead, use
     * STREST dst, which doesn't require read access to dst.
     *
     * This code should perform better than a simple loop on modern,
     * wide-issue mips processors because the code has fewer branches and
     * more instruction-level parallelism.
     */
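    /*
     * Concretely: load one full word, shift away the 8*NBYTES - 8*len
     * bits beyond the remaining length, store the surviving bytes with
     * a single STREST ending at dst + len - 1, then shift back so the
     * kept bytes sit in their original lanes before being added to the
     * sum.
     */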
#define bits t2
    beqz    len, .Ldone\@
     ADD    t1, dst, len    # t1 is just past last byte of dst
    li  bits, 8*NBYTES
    SLL rem, len, 3 # rem = number of bits to keep
    LOAD(t0, 0(src))
    SUB bits, bits, rem # bits = number of bits to discard
    SHIFT_DISCARD t0, t0, bits
    STREST(t0, -1(t1))
    SHIFT_DISCARD_REVERT t0, t0, bits
    .set reorder
    ADDC(sum, t0)
    b   .Ldone\@
    .set noreorder
.Ldst_unaligned\@:
    /*
     * dst is unaligned
     * t0 = src & ADDRMASK
     * t1 = dst & ADDRMASK; T1 > 0
     * len >= NBYTES
     *
     * Copy enough bytes to align dst
     * Set match = (src and dst have same alignment)
     */
#define match rem
    LDFIRST(t3, FIRST(0)(src))
    ADD t2, zero, NBYTES
    LDREST(t3, REST(0)(src))
    SUB t2, t2, t1  # t2 = number of bytes copied
    xor match, t0, t1
    STFIRST(t3, FIRST(0)(dst))
    SLL t4, t1, 3       # t4 = number of bits to discard
    SHIFT_DISCARD t3, t3, t4
    /* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
    ADDC(sum, t3)
    beq len, t2, .Ldone\@
     SUB    len, len, t2
    ADD dst, dst, t2
    beqz    match, .Lboth_aligned\@
     ADD    src, src, t2

.Lsrc_unaligned_dst_aligned\@:
    SRL t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
    beqz    t0, .Lcleanup_src_unaligned\@
     and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
    LDFIRST(t0, FIRST(0)(src))
    LDFIRST(t1, FIRST(1)(src))
    SUB len, len, 4*NBYTES
    LDREST(t0, REST(0)(src))
    LDREST(t1, REST(1)(src))
    LDFIRST(t2, FIRST(2)(src))
    LDFIRST(t3, FIRST(3)(src))
    LDREST(t2, REST(2)(src))
    LDREST(t3, REST(3)(src))
    ADD src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
    nop             # improves slotting
#endif
    STORE(t0, UNIT(0)(dst))
    ADDC(t0, t1)
    STORE(t1, UNIT(1)(dst))
    ADDC(sum, t0)
    STORE(t2, UNIT(2)(dst))
    ADDC(t2, t3)
    STORE(t3, UNIT(3)(dst))
    ADDC(sum, t2)
    .set    reorder             /* DADDI_WAR */
    ADD dst, dst, 4*NBYTES
    bne len, rem, 1b
    .set    noreorder

.Lcleanup_src_unaligned\@:
    beqz    len, .Ldone\@
     and    rem, len, NBYTES-1  # rem = len % NBYTES
    beq rem, len, .Lcopy_bytes\@
     nop
1:
    LDFIRST(t0, FIRST(0)(src))
    LDREST(t0, REST(0)(src))
    ADD src, src, NBYTES
    SUB len, len, NBYTES
    STORE(t0, 0(dst))
    ADDC(sum, t0)
    .set    reorder             /* DADDI_WAR */
    ADD dst, dst, NBYTES
    bne len, rem, 1b
    .set    noreorder

.Lcopy_bytes_checklen\@:
    beqz    len, .Ldone\@
     nop
.Lcopy_bytes\@:
    /* 0 < len < NBYTES  */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define SHIFT_START 0
#define SHIFT_INC 8
#else
#define SHIFT_START 8*(NBYTES-1)
#define SHIFT_INC -8
#endif
    move    t2, zero    # partial word
    li  t3, SHIFT_START # shift
#define COPY_BYTE(N)            \
    LOADBU(t0, N(src));     \
    SUB len, len, 1;        \
    STOREB(t0, N(dst));     \
    SLLV    t0, t0, t3;     \
    addu    t3, SHIFT_INC;      \
    beqz    len, .Lcopy_bytes_done\@; \
     or t2, t0

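    /*
     * Each tail byte is both stored to dst and OR-ed into t2 at a
     * stepped shift position (SHIFT_START/SHIFT_INC above follow the
     * endianness), so t2 ends up holding the partial word exactly as a
     * word load would have seen it before it is added to the checksum
     * at .Lcopy_bytes_done.
     */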
    COPY_BYTE(0)
    COPY_BYTE(1)
#ifdef USE_DOUBLE
    COPY_BYTE(2)
    COPY_BYTE(3)
    COPY_BYTE(4)
    COPY_BYTE(5)
#endif
    LOADBU(t0, NBYTES-2(src))
    SUB len, len, 1
    STOREB(t0, NBYTES-2(dst))
    SLLV    t0, t0, t3
    or  t2, t0
.Lcopy_bytes_done\@:
    ADDC(sum, t2)
.Ldone\@:
    /* fold checksum */
    .set    push
    .set    noat
#ifdef USE_DOUBLE
    dsll32  v1, sum, 0
    daddu   sum, v1
    sltu    v1, sum, v1
    dsra32  sum, sum, 0
    addu    sum, v1
#endif

#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR5) || \
    defined(CONFIG_CPU_LOONGSON64)
    .set    push
    .set    arch=mips32r2
    wsbh    v1, sum
    movn    sum, v1, odd
    .set    pop
#else
    beqz    odd, 1f         /* odd buffer alignment? */
     lui    v1, 0x00ff
    addu    v1, 0x00ff
    and t0, sum, v1
    sll t0, t0, 8
    srl sum, sum, 8
    and sum, sum, v1
    or  sum, sum, t0
1:
#endif
    .set    pop
    .set reorder
    jr  ra
    .set noreorder
    .endm

    .set noreorder
.L_exc:
    jr  ra
     li v0, 0

FEXPORT(__csum_partial_copy_nocheck)
EXPORT_SYMBOL(__csum_partial_copy_nocheck)
#ifndef CONFIG_EVA
FEXPORT(__csum_partial_copy_to_user)
EXPORT_SYMBOL(__csum_partial_copy_to_user)
FEXPORT(__csum_partial_copy_from_user)
EXPORT_SYMBOL(__csum_partial_copy_from_user)
#endif
__BUILD_CSUM_PARTIAL_COPY_USER LEGACY_MODE USEROP USEROP

#ifdef CONFIG_EVA
LEAF(__csum_partial_copy_to_user)
__BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE KERNELOP USEROP
END(__csum_partial_copy_to_user)

LEAF(__csum_partial_copy_from_user)
__BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE USEROP KERNELOP
END(__csum_partial_copy_from_user)
#endif