// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
/* Copyright (C) 2016-2018 Netronome Systems, Inc. */

#define pr_fmt(fmt) "NFP net bpf: " fmt

#include <linux/bug.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/pkt_cls.h>
#include <linux/reciprocal_div.h>
#include <linux/unistd.h>

#include "main.h"
#include "../nfp_asm.h"
#include "../nfp_net_ctrl.h"

/* --- NFP prog --- */
/* The for-each macros that walk multiple entries at a time provide pos and
 * next<n> pointers.  It's safe to modify the next pointers (but not pos).
 */
#define nfp_for_each_insn_walk2(nfp_prog, pos, next)            \
    for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
         next = list_next_entry(pos, l);            \
         &(nfp_prog)->insns != &pos->l &&           \
         &(nfp_prog)->insns != &next->l;            \
         pos = nfp_meta_next(pos),              \
         next = nfp_meta_next(pos))

#define nfp_for_each_insn_walk3(nfp_prog, pos, next, next2)     \
    for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
         next = list_next_entry(pos, l),            \
         next2 = list_next_entry(next, l);          \
         &(nfp_prog)->insns != &pos->l &&           \
         &(nfp_prog)->insns != &next->l &&          \
         &(nfp_prog)->insns != &next2->l;           \
         pos = nfp_meta_next(pos),              \
         next = nfp_meta_next(pos),             \
         next2 = nfp_meta_next(next))
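
/* Editor's usage sketch (not part of the original source): a peephole
 * pass can use the pair-wise walker to inspect two consecutive
 * instructions at a time, e.g.:
 *
 *    struct nfp_insn_meta *meta1, *meta2;
 *
 *    nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
 *        if (is_mbpf_load(meta1) && is_mbpf_store(meta2))
 *            try_combine(meta1, meta2);
 *    }
 *
 * is_mbpf_load(), is_mbpf_store() and try_combine() stand in for
 * whatever predicates and action the pass actually needs.
 */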

static bool
nfp_meta_has_prev(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
    return meta->l.prev != &nfp_prog->insns;
}

static void nfp_prog_push(struct nfp_prog *nfp_prog, u64 insn)
{
    if (nfp_prog->__prog_alloc_len / sizeof(u64) == nfp_prog->prog_len) {
        pr_warn("instruction limit reached (%u NFP instructions)\n",
            nfp_prog->prog_len);
        nfp_prog->error = -ENOSPC;
        return;
    }

    nfp_prog->prog[nfp_prog->prog_len] = insn;
    nfp_prog->prog_len++;
}

static unsigned int nfp_prog_current_offset(struct nfp_prog *nfp_prog)
{
    return nfp_prog->prog_len;
}

static bool
nfp_prog_confirm_current_offset(struct nfp_prog *nfp_prog, unsigned int off)
{
    /* If there is a recorded error we may have dropped instructions;
     * that need not be due to a translator bug, and the translation
     * will fail anyway, so just return OK.
     */
    if (nfp_prog->error)
        return true;
    return !WARN_ON_ONCE(nfp_prog_current_offset(nfp_prog) != off);
}

/* --- Emitters --- */
static void
__emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
       u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, enum cmd_ctx_swap ctx,
       bool indir)
{
    u64 insn;

    insn =  FIELD_PREP(OP_CMD_A_SRC, areg) |
        FIELD_PREP(OP_CMD_CTX, ctx) |
        FIELD_PREP(OP_CMD_B_SRC, breg) |
        FIELD_PREP(OP_CMD_TOKEN, cmd_tgt_act[op].token) |
        FIELD_PREP(OP_CMD_XFER, xfer) |
        FIELD_PREP(OP_CMD_CNT, size) |
        FIELD_PREP(OP_CMD_SIG, ctx != CMD_CTX_NO_SWAP) |
        FIELD_PREP(OP_CMD_TGT_CMD, cmd_tgt_act[op].tgt_cmd) |
        FIELD_PREP(OP_CMD_INDIR, indir) |
        FIELD_PREP(OP_CMD_MODE, mode);

    nfp_prog_push(nfp_prog, insn);
}

static void
emit_cmd_any(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
         swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx, bool indir)
{
    struct nfp_insn_re_regs reg;
    int err;

    err = swreg_to_restricted(reg_none(), lreg, rreg, &reg, false);
    if (err) {
        nfp_prog->error = err;
        return;
    }
    if (reg.swap) {
        pr_err("cmd can't swap arguments\n");
        nfp_prog->error = -EFAULT;
        return;
    }
    if (reg.dst_lmextn || reg.src_lmextn) {
        pr_err("cmd can't use LMextn\n");
        nfp_prog->error = -EFAULT;
        return;
    }

    __emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, ctx,
           indir);
}

static void
emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
     swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
{
    emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, false);
}

static void
emit_cmd_indir(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
           swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
{
    emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, true);
}

static void
__emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, enum br_ev_pip ev_pip,
      enum br_ctx_signal_state css, u16 addr, u8 defer)
{
    u16 addr_lo, addr_hi;
    u64 insn;

    addr_lo = addr & (OP_BR_ADDR_LO >> __bf_shf(OP_BR_ADDR_LO));
    addr_hi = addr != addr_lo;

    insn = OP_BR_BASE |
        FIELD_PREP(OP_BR_MASK, mask) |
        FIELD_PREP(OP_BR_EV_PIP, ev_pip) |
        FIELD_PREP(OP_BR_CSS, css) |
        FIELD_PREP(OP_BR_DEFBR, defer) |
        FIELD_PREP(OP_BR_ADDR_LO, addr_lo) |
        FIELD_PREP(OP_BR_ADDR_HI, addr_hi);

    nfp_prog_push(nfp_prog, insn);
}
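
/* Editor's note (sketch, field widths are illustrative): __emit_br()
 * splits the target address into the instruction's LO field plus a
 * single "HI" flag.  If OP_BR_ADDR_LO held, say, the low 13 bits, a
 * target of 0x2345 would give addr_lo = 0x0345 and addr_hi = 1, while
 * any target that fits entirely in the LO field gives addr_hi = 0.
 */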

static void
emit_br_relo(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer,
         enum nfp_relo_type relo)
{
    if (mask == BR_UNC && defer > 2) {
        pr_err("BUG: branch defer out of bounds %d\n", defer);
        nfp_prog->error = -EFAULT;
        return;
    }

    __emit_br(nfp_prog, mask,
          mask != BR_UNC ? BR_EV_PIP_COND : BR_EV_PIP_UNCOND,
          BR_CSS_NONE, addr, defer);

    nfp_prog->prog[nfp_prog->prog_len - 1] |=
        FIELD_PREP(OP_RELO_TYPE, relo);
}

static void
emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer)
{
    emit_br_relo(nfp_prog, mask, addr, defer, RELO_BR_REL);
}

static void
__emit_br_bit(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 addr, u8 defer,
          bool set, bool src_lmextn)
{
    u16 addr_lo, addr_hi;
    u64 insn;

    addr_lo = addr & (OP_BR_BIT_ADDR_LO >> __bf_shf(OP_BR_BIT_ADDR_LO));
    addr_hi = addr != addr_lo;

    insn = OP_BR_BIT_BASE |
        FIELD_PREP(OP_BR_BIT_A_SRC, areg) |
        FIELD_PREP(OP_BR_BIT_B_SRC, breg) |
        FIELD_PREP(OP_BR_BIT_BV, set) |
        FIELD_PREP(OP_BR_BIT_DEFBR, defer) |
        FIELD_PREP(OP_BR_BIT_ADDR_LO, addr_lo) |
        FIELD_PREP(OP_BR_BIT_ADDR_HI, addr_hi) |
        FIELD_PREP(OP_BR_BIT_SRC_LMEXTN, src_lmextn);

    nfp_prog_push(nfp_prog, insn);
}

static void
emit_br_bit_relo(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr,
         u8 defer, bool set, enum nfp_relo_type relo)
{
    struct nfp_insn_re_regs reg;
    int err;

    /* NOTE: The bit to test is specified as a rotation amount, such that
     *   the bit to test ends up in the MSB of the result when doing a
     *   rotate right.  For bit X we need a right rotate of X + 1.
     */
    bit += 1;

    err = swreg_to_restricted(reg_none(), src, reg_imm(bit), &reg, false);
    if (err) {
        nfp_prog->error = err;
        return;
    }

    __emit_br_bit(nfp_prog, reg.areg, reg.breg, addr, defer, set,
              reg.src_lmextn);

    nfp_prog->prog[nfp_prog->prog_len - 1] |=
        FIELD_PREP(OP_RELO_TYPE, relo);
}
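
/* Editor's example (not from the original source): to test bit 3 the
 * encoding above uses a right rotate of 3 + 1 = 4, which moves bit 3
 * of the source into bit 31 (the MSB) of the rotated result, where the
 * branch-on-bit instruction samples it.
 */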

static void
emit_br_bset(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, u8 defer)
{
    emit_br_bit_relo(nfp_prog, src, bit, addr, defer, true, RELO_BR_REL);
}

static void
__emit_br_alu(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
          u8 defer, bool dst_lmextn, bool src_lmextn)
{
    u64 insn;

    insn = OP_BR_ALU_BASE |
        FIELD_PREP(OP_BR_ALU_A_SRC, areg) |
        FIELD_PREP(OP_BR_ALU_B_SRC, breg) |
        FIELD_PREP(OP_BR_ALU_DEFBR, defer) |
        FIELD_PREP(OP_BR_ALU_IMM_HI, imm_hi) |
        FIELD_PREP(OP_BR_ALU_SRC_LMEXTN, src_lmextn) |
        FIELD_PREP(OP_BR_ALU_DST_LMEXTN, dst_lmextn);

    nfp_prog_push(nfp_prog, insn);
}

static void emit_rtn(struct nfp_prog *nfp_prog, swreg base, u8 defer)
{
    struct nfp_insn_ur_regs reg;
    int err;

    err = swreg_to_unrestricted(reg_none(), base, reg_imm(0), &reg);
    if (err) {
        nfp_prog->error = err;
        return;
    }

    __emit_br_alu(nfp_prog, reg.areg, reg.breg, 0, defer, reg.dst_lmextn,
              reg.src_lmextn);
}

static void
__emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
         enum immed_width width, bool invert,
         enum immed_shift shift, bool wr_both,
         bool dst_lmextn, bool src_lmextn)
{
    u64 insn;

    insn = OP_IMMED_BASE |
        FIELD_PREP(OP_IMMED_A_SRC, areg) |
        FIELD_PREP(OP_IMMED_B_SRC, breg) |
        FIELD_PREP(OP_IMMED_IMM, imm_hi) |
        FIELD_PREP(OP_IMMED_WIDTH, width) |
        FIELD_PREP(OP_IMMED_INV, invert) |
        FIELD_PREP(OP_IMMED_SHIFT, shift) |
        FIELD_PREP(OP_IMMED_WR_AB, wr_both) |
        FIELD_PREP(OP_IMMED_SRC_LMEXTN, src_lmextn) |
        FIELD_PREP(OP_IMMED_DST_LMEXTN, dst_lmextn);

    nfp_prog_push(nfp_prog, insn);
}

static void
emit_immed(struct nfp_prog *nfp_prog, swreg dst, u16 imm,
       enum immed_width width, bool invert, enum immed_shift shift)
{
    struct nfp_insn_ur_regs reg;
    int err;

    if (swreg_type(dst) == NN_REG_IMM) {
        nfp_prog->error = -EFAULT;
        return;
    }

    err = swreg_to_unrestricted(dst, dst, reg_imm(imm & 0xff), &reg);
    if (err) {
        nfp_prog->error = err;
        return;
    }

    /* Use reg.dst when destination is No-Dest. */
    __emit_immed(nfp_prog,
             swreg_type(dst) == NN_REG_NONE ? reg.dst : reg.areg,
             reg.breg, imm >> 8, width, invert, shift,
             reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
}

static void
__emit_shf(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
       enum shf_sc sc, u8 shift,
       u16 areg, enum shf_op op, u16 breg, bool i8, bool sw, bool wr_both,
       bool dst_lmextn, bool src_lmextn)
{
    u64 insn;

    if (!FIELD_FIT(OP_SHF_SHIFT, shift)) {
        nfp_prog->error = -EFAULT;
        return;
    }

    /* The NFP shift instruction has a quirk: if the shift direction is
     * left, a shift amount of 1 to 31 is specified as 32 minus the
     * amount to shift.
     *
     * This is not needed for indirect shifts, whose shift amount is 0.
     * After the subtraction an amount of 0 would become 32, which would
     * eventually be encoded the same as 0 because only the low 5 bits
     * are encoded, but an amount of 32 would fail the FIELD_PREP check
     * done later against the shift mask (0x1f), as 32 is outside the
     * mask range.
     */
    if (sc == SHF_SC_L_SHF && shift)
        shift = 32 - shift;

    insn = OP_SHF_BASE |
        FIELD_PREP(OP_SHF_A_SRC, areg) |
        FIELD_PREP(OP_SHF_SC, sc) |
        FIELD_PREP(OP_SHF_B_SRC, breg) |
        FIELD_PREP(OP_SHF_I8, i8) |
        FIELD_PREP(OP_SHF_SW, sw) |
        FIELD_PREP(OP_SHF_DST, dst) |
        FIELD_PREP(OP_SHF_SHIFT, shift) |
        FIELD_PREP(OP_SHF_OP, op) |
        FIELD_PREP(OP_SHF_DST_AB, dst_ab) |
        FIELD_PREP(OP_SHF_WR_AB, wr_both) |
        FIELD_PREP(OP_SHF_SRC_LMEXTN, src_lmextn) |
        FIELD_PREP(OP_SHF_DST_LMEXTN, dst_lmextn);

    nfp_prog_push(nfp_prog, insn);
}
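
/* Editor's example (not from the original source): a left shift by 5
 * is emitted with an encoded shift field of 32 - 5 = 27, while an
 * indirect left shift passes shift == 0 and skips the subtraction
 * entirely.
 */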

static void
emit_shf(struct nfp_prog *nfp_prog, swreg dst,
     swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc, u8 shift)
{
    struct nfp_insn_re_regs reg;
    int err;

    err = swreg_to_restricted(dst, lreg, rreg, &reg, true);
    if (err) {
        nfp_prog->error = err;
        return;
    }

    __emit_shf(nfp_prog, reg.dst, reg.dst_ab, sc, shift,
           reg.areg, op, reg.breg, reg.i8, reg.swap, reg.wr_both,
           reg.dst_lmextn, reg.src_lmextn);
}

static void
emit_shf_indir(struct nfp_prog *nfp_prog, swreg dst,
           swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc)
{
    if (sc == SHF_SC_R_ROT) {
        pr_err("indirect shift is not allowed on rotation\n");
        nfp_prog->error = -EFAULT;
        return;
    }

    emit_shf(nfp_prog, dst, lreg, op, rreg, sc, 0);
}

static void
__emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
       u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both,
       bool dst_lmextn, bool src_lmextn)
{
    u64 insn;

    insn = OP_ALU_BASE |
        FIELD_PREP(OP_ALU_A_SRC, areg) |
        FIELD_PREP(OP_ALU_B_SRC, breg) |
        FIELD_PREP(OP_ALU_DST, dst) |
        FIELD_PREP(OP_ALU_SW, swap) |
        FIELD_PREP(OP_ALU_OP, op) |
        FIELD_PREP(OP_ALU_DST_AB, dst_ab) |
        FIELD_PREP(OP_ALU_WR_AB, wr_both) |
        FIELD_PREP(OP_ALU_SRC_LMEXTN, src_lmextn) |
        FIELD_PREP(OP_ALU_DST_LMEXTN, dst_lmextn);

    nfp_prog_push(nfp_prog, insn);
}

static void
emit_alu(struct nfp_prog *nfp_prog, swreg dst,
     swreg lreg, enum alu_op op, swreg rreg)
{
    struct nfp_insn_ur_regs reg;
    int err;

    err = swreg_to_unrestricted(dst, lreg, rreg, &reg);
    if (err) {
        nfp_prog->error = err;
        return;
    }

    __emit_alu(nfp_prog, reg.dst, reg.dst_ab,
           reg.areg, op, reg.breg, reg.swap, reg.wr_both,
           reg.dst_lmextn, reg.src_lmextn);
}

static void
__emit_mul(struct nfp_prog *nfp_prog, enum alu_dst_ab dst_ab, u16 areg,
       enum mul_type type, enum mul_step step, u16 breg, bool swap,
       bool wr_both, bool dst_lmextn, bool src_lmextn)
{
    u64 insn;

    insn = OP_MUL_BASE |
        FIELD_PREP(OP_MUL_A_SRC, areg) |
        FIELD_PREP(OP_MUL_B_SRC, breg) |
        FIELD_PREP(OP_MUL_STEP, step) |
        FIELD_PREP(OP_MUL_DST_AB, dst_ab) |
        FIELD_PREP(OP_MUL_SW, swap) |
        FIELD_PREP(OP_MUL_TYPE, type) |
        FIELD_PREP(OP_MUL_WR_AB, wr_both) |
        FIELD_PREP(OP_MUL_SRC_LMEXTN, src_lmextn) |
        FIELD_PREP(OP_MUL_DST_LMEXTN, dst_lmextn);

    nfp_prog_push(nfp_prog, insn);
}

static void
emit_mul(struct nfp_prog *nfp_prog, swreg lreg, enum mul_type type,
     enum mul_step step, swreg rreg)
{
    struct nfp_insn_ur_regs reg;
    u16 areg;
    int err;

    if (type == MUL_TYPE_START && step != MUL_STEP_NONE) {
        nfp_prog->error = -EINVAL;
        return;
    }

    if (step == MUL_LAST || step == MUL_LAST_2) {
        /* When the step number is LAST or LAST_2, the left source is
         * used as the destination.
         */
        err = swreg_to_unrestricted(lreg, reg_none(), rreg, &reg);
        areg = reg.dst;
    } else {
        err = swreg_to_unrestricted(reg_none(), lreg, rreg, &reg);
        areg = reg.areg;
    }

    if (err) {
        nfp_prog->error = err;
        return;
    }

    __emit_mul(nfp_prog, reg.dst_ab, areg, type, step, reg.breg, reg.swap,
           reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
}

static void
__emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc,
        u8 areg, u8 bmask, u8 breg, u8 shift, bool imm8,
        bool zero, bool swap, bool wr_both,
        bool dst_lmextn, bool src_lmextn)
{
    u64 insn;

    insn = OP_LDF_BASE |
        FIELD_PREP(OP_LDF_A_SRC, areg) |
        FIELD_PREP(OP_LDF_SC, sc) |
        FIELD_PREP(OP_LDF_B_SRC, breg) |
        FIELD_PREP(OP_LDF_I8, imm8) |
        FIELD_PREP(OP_LDF_SW, swap) |
        FIELD_PREP(OP_LDF_ZF, zero) |
        FIELD_PREP(OP_LDF_BMASK, bmask) |
        FIELD_PREP(OP_LDF_SHF, shift) |
        FIELD_PREP(OP_LDF_WR_AB, wr_both) |
        FIELD_PREP(OP_LDF_SRC_LMEXTN, src_lmextn) |
        FIELD_PREP(OP_LDF_DST_LMEXTN, dst_lmextn);

    nfp_prog_push(nfp_prog, insn);
}

static void
emit_ld_field_any(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
          enum shf_sc sc, u8 shift, bool zero)
{
    struct nfp_insn_re_regs reg;
    int err;

    /* Note: ld_field is special as it uses one of the src regs as dst */
    err = swreg_to_restricted(dst, dst, src, &reg, true);
    if (err) {
        nfp_prog->error = err;
        return;
    }

    __emit_ld_field(nfp_prog, sc, reg.areg, bmask, reg.breg, shift,
            reg.i8, zero, reg.swap, reg.wr_both,
            reg.dst_lmextn, reg.src_lmextn);
}

static void
emit_ld_field(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
          enum shf_sc sc, u8 shift)
{
    emit_ld_field_any(nfp_prog, dst, bmask, src, sc, shift, false);
}

static void
__emit_lcsr(struct nfp_prog *nfp_prog, u16 areg, u16 breg, bool wr, u16 addr,
        bool dst_lmextn, bool src_lmextn)
{
    u64 insn;

    insn = OP_LCSR_BASE |
        FIELD_PREP(OP_LCSR_A_SRC, areg) |
        FIELD_PREP(OP_LCSR_B_SRC, breg) |
        FIELD_PREP(OP_LCSR_WRITE, wr) |
        FIELD_PREP(OP_LCSR_ADDR, addr / 4) |
        FIELD_PREP(OP_LCSR_SRC_LMEXTN, src_lmextn) |
        FIELD_PREP(OP_LCSR_DST_LMEXTN, dst_lmextn);

    nfp_prog_push(nfp_prog, insn);
}

static void emit_csr_wr(struct nfp_prog *nfp_prog, swreg src, u16 addr)
{
    struct nfp_insn_ur_regs reg;
    int err;

    /* This instruction takes immeds instead of reg_none() for the ignored
     * operand, but we can't encode 2 immeds in one instr with our normal
     * swreg infra, so if the param is an immed we encode it as reg_none()
     * and copy the immed to both operands.
     */
    if (swreg_type(src) == NN_REG_IMM) {
        err = swreg_to_unrestricted(reg_none(), src, reg_none(), &reg);
        reg.breg = reg.areg;
    } else {
        err = swreg_to_unrestricted(reg_none(), src, reg_imm(0), &reg);
    }
    if (err) {
        nfp_prog->error = err;
        return;
    }

    __emit_lcsr(nfp_prog, reg.areg, reg.breg, true, addr,
            false, reg.src_lmextn);
}

/* The CSR value is read by a following immed[gpr, 0] */
static void __emit_csr_rd(struct nfp_prog *nfp_prog, u16 addr)
{
    __emit_lcsr(nfp_prog, 0, 0, false, addr, false, false);
}

static void emit_nop(struct nfp_prog *nfp_prog)
{
    __emit_immed(nfp_prog, UR_REG_IMM, UR_REG_IMM, 0, 0, 0, 0, 0, 0, 0);
}

/* --- Wrappers --- */
static bool pack_immed(u32 imm, u16 *val, enum immed_shift *shift)
{
    if (!(imm & 0xffff0000)) {
        *val = imm;
        *shift = IMMED_SHIFT_0B;
    } else if (!(imm & 0xff0000ff)) {
        *val = imm >> 8;
        *shift = IMMED_SHIFT_1B;
    } else if (!(imm & 0x0000ffff)) {
        *val = imm >> 16;
        *shift = IMMED_SHIFT_2B;
    } else {
        return false;
    }

    return true;
}

static void wrp_immed(struct nfp_prog *nfp_prog, swreg dst, u32 imm)
{
    enum immed_shift shift;
    u16 val;

    if (pack_immed(imm, &val, &shift)) {
        emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, false, shift);
    } else if (pack_immed(~imm, &val, &shift)) {
        emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, true, shift);
    } else {
        emit_immed(nfp_prog, dst, imm & 0xffff, IMMED_WIDTH_ALL,
               false, IMMED_SHIFT_0B);
        emit_immed(nfp_prog, dst, imm >> 16, IMMED_WIDTH_WORD,
               false, IMMED_SHIFT_2B);
    }
}
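
/* Editor's example (not from the original source): wrp_immed() picks
 * the cheapest encoding pack_immed() can find.  0x0000beef loads
 * directly (shift 0B); 0x00beef00 has only bits 8..23 set, so it loads
 * as val 0xbeef shifted by one byte; 0xffff0123 fails all three masks
 * but ~0xffff0123 = 0x0000fedc fits, so it is emitted inverted; a
 * value like 0x12345678 needs the two-instruction fallback.
 */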

static void
wrp_zext(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst)
{
    if (meta->flags & FLAG_INSN_DO_ZEXT)
        wrp_immed(nfp_prog, reg_both(dst + 1), 0);
}

static void
wrp_immed_relo(struct nfp_prog *nfp_prog, swreg dst, u32 imm,
           enum nfp_relo_type relo)
{
    if (imm > 0xffff) {
        pr_err("relocation of a large immediate!\n");
        nfp_prog->error = -EFAULT;
        return;
    }
    emit_immed(nfp_prog, dst, imm, IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);

    nfp_prog->prog[nfp_prog->prog_len - 1] |=
        FIELD_PREP(OP_RELO_TYPE, relo);
}

/* ur_load_imm_any() - encode immediate or use tmp register (unrestricted)
 * If @imm is small enough, encode it directly in the operand and return it;
 * otherwise load @imm into a spare register and return its encoding.
 */
static swreg ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
{
    if (FIELD_FIT(UR_REG_IMM_MAX, imm))
        return reg_imm(imm);

    wrp_immed(nfp_prog, tmp_reg, imm);
    return tmp_reg;
}

/* re_load_imm_any() - encode immediate or use tmp register (restricted)
 * If @imm is small enough, encode it directly in the operand and return it;
 * otherwise load @imm into a spare register and return its encoding.
 */
static swreg re_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
{
    if (FIELD_FIT(RE_REG_IMM_MAX, imm))
        return reg_imm(imm);

    wrp_immed(nfp_prog, tmp_reg, imm);
    return tmp_reg;
}
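
/* Editor's usage sketch (illustrative, not from the original source):
 * callers pass a scratch register in case the immediate doesn't fit,
 * e.g.:
 *
 *    swreg tmp;
 *
 *    tmp = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
 *    emit_alu(nfp_prog, reg_both(dst), reg_a(dst), ALU_OP_ADD, tmp);
 *
 * If imm fits in UR_REG_IMM_MAX, tmp is simply the immediate encoding
 * and no extra instruction is emitted.
 */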

static void wrp_nops(struct nfp_prog *nfp_prog, unsigned int count)
{
    while (count--)
        emit_nop(nfp_prog);
}

static void wrp_mov(struct nfp_prog *nfp_prog, swreg dst, swreg src)
{
    emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, src);
}

static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 dst, u16 src)
{
    wrp_mov(nfp_prog, reg_both(dst), reg_b(src));
}

/* wrp_reg_subpart() - load @field_len bytes from @offset of @src and write
 * the result to @dst starting from the low end.
 */
static void
wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len,
        u8 offset)
{
    enum shf_sc sc = offset ? SHF_SC_R_SHF : SHF_SC_NONE;
    u8 mask = (1 << field_len) - 1;

    emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true);
}

/* wrp_reg_or_subpart() - load @field_len bytes from the low end of @src and
 * OR the result into @dst at @offset; the other bits of @dst are unchanged.
 */
static void
wrp_reg_or_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src,
           u8 field_len, u8 offset)
{
    enum shf_sc sc = offset ? SHF_SC_L_SHF : SHF_SC_NONE;
    u8 mask = ((1 << field_len) - 1) << offset;

    emit_ld_field(nfp_prog, dst, mask, src, sc, 32 - offset * 8);
}
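
/* Editor's example (not from the original source): with field_len = 2
 * and offset = 1, wrp_reg_or_subpart() uses the byte-enable mask
 * ((1 << 2) - 1) << 1 = 0x6, so bytes 1 and 2 of @dst are written and
 * the rest are preserved; the shift lines the low bytes of @src up
 * with that byte window.
 */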

static void
addr40_offset(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
          swreg *rega, swreg *regb)
{
    if (offset == reg_imm(0)) {
        *rega = reg_a(src_gpr);
        *regb = reg_b(src_gpr + 1);
        return;
    }

    emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(src_gpr), ALU_OP_ADD, offset);
    emit_alu(nfp_prog, imm_b(nfp_prog), reg_b(src_gpr + 1), ALU_OP_ADD_C,
         reg_imm(0));
    *rega = imm_a(nfp_prog);
    *regb = imm_b(nfp_prog);
}
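
/* Editor's note (sketch): a 40-bit address lives in a GPR pair, the
 * low 32 bits in src_gpr and the upper bits in src_gpr + 1.  Adding an
 * offset therefore takes an ADD on the low word followed by
 * ALU_OP_ADD_C on the high word to propagate the carry, as above.
 */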

/* NFP has a Command Push Pull bus which supports bulk memory operations. */
static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
    bool descending_seq = meta->ldst_gather_len < 0;
    s16 len = abs(meta->ldst_gather_len);
    swreg src_base, off;
    bool src_40bit_addr;
    unsigned int i;
    u8 xfer_num;

    off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
    src_40bit_addr = meta->ptr.type == PTR_TO_MAP_VALUE;
    src_base = reg_a(meta->insn.src_reg * 2);
    xfer_num = round_up(len, 4) / 4;

    if (src_40bit_addr)
        addr40_offset(nfp_prog, meta->insn.src_reg * 2, off, &src_base,
                  &off);

    /* Set up PREV_ALU fields to override the memory read length. */
    if (len > 32)
        wrp_immed(nfp_prog, reg_none(),
              CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));

    /* Memory read from source addr into transfer-in registers. */
    emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP,
             src_40bit_addr ? CMD_MODE_40b_BA : CMD_MODE_32b, 0,
             src_base, off, xfer_num - 1, CMD_CTX_SWAP, len > 32);

    /* Move from transfer-in to transfer-out. */
    for (i = 0; i < xfer_num; i++)
        wrp_mov(nfp_prog, reg_xfer(i), reg_xfer(i));

    off = re_load_imm_any(nfp_prog, meta->paired_st->off, imm_b(nfp_prog));

    if (len <= 8) {
        /* Use single direct_ref write8. */
        emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
             reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
             CMD_CTX_SWAP);
    } else if (len <= 32 && IS_ALIGNED(len, 4)) {
        /* Use single direct_ref write32. */
        emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
             reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
             CMD_CTX_SWAP);
    } else if (len <= 32) {
        /* Use single indirect_ref write8. */
        wrp_immed(nfp_prog, reg_none(),
              CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
        emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
                   reg_a(meta->paired_st->dst_reg * 2), off,
                   len - 1, CMD_CTX_SWAP);
    } else if (IS_ALIGNED(len, 4)) {
        /* Use single indirect_ref write32. */
        wrp_immed(nfp_prog, reg_none(),
              CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
        emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
                   reg_a(meta->paired_st->dst_reg * 2), off,
                   xfer_num - 1, CMD_CTX_SWAP);
    } else if (len <= 40) {
        /* Use one direct_ref write32 to write the first 32 bytes, then
         * another direct_ref write8 to write the remaining bytes.
         */
        emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
             reg_a(meta->paired_st->dst_reg * 2), off, 7,
             CMD_CTX_SWAP);

        off = re_load_imm_any(nfp_prog, meta->paired_st->off + 32,
                      imm_b(nfp_prog));
        emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 8,
             reg_a(meta->paired_st->dst_reg * 2), off, len - 33,
             CMD_CTX_SWAP);
    } else {
        /* Use one indirect_ref write32 to write the 4-byte aligned part
         * of the length, then another direct_ref write8 to write the
         * remaining bytes.
         */
        u8 new_off;

        wrp_immed(nfp_prog, reg_none(),
              CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 2));
        emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
                   reg_a(meta->paired_st->dst_reg * 2), off,
                   xfer_num - 2, CMD_CTX_SWAP);
        new_off = meta->paired_st->off + (xfer_num - 1) * 4;
        off = re_load_imm_any(nfp_prog, new_off, imm_b(nfp_prog));
        emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b,
             xfer_num - 1, reg_a(meta->paired_st->dst_reg * 2), off,
             (len & 0x3) - 1, CMD_CTX_SWAP);
    }

    /* TODO: The following extra load is to make sure the data flow is
     *  identical before and after we do the memory copy optimization.
     *
     *  The load destination register is not guaranteed to be dead, so we
     *  need to make sure it is loaded with the same value it held before
     *  this transformation.
     *
     *  These extra loads could be removed once we have accurate register
     *  usage information.
     */
    if (descending_seq)
        xfer_num = 0;
    else if (BPF_SIZE(meta->insn.code) != BPF_DW)
        xfer_num = xfer_num - 1;
    else
        xfer_num = xfer_num - 2;

    switch (BPF_SIZE(meta->insn.code)) {
    case BPF_B:
        wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
                reg_xfer(xfer_num), 1,
                IS_ALIGNED(len, 4) ? 3 : (len & 3) - 1);
        break;
    case BPF_H:
        wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
                reg_xfer(xfer_num), 2, (len & 3) ^ 2);
        break;
    case BPF_W:
        wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
            reg_xfer(0));
        break;
    case BPF_DW:
        wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
            reg_xfer(xfer_num));
        wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1),
            reg_xfer(xfer_num + 1));
        break;
    }

    if (BPF_SIZE(meta->insn.code) != BPF_DW)
        wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);

    return 0;
}
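
/* Editor's summary (not from the original source): the store side
 * above picks the cheapest CPP command for the gathered length, e.g.
 * len = 12 uses a single direct_ref write32 (12 <= 32 and 4-aligned),
 * len = 21 falls into the single indirect_ref write8 case, and
 * len = 44 (aligned) uses a single indirect_ref write32.
 */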

static int
data_ld(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, swreg offset,
    u8 dst_gpr, int size)
{
    unsigned int i;
    u16 shift, sz;

    /* We load the value from the address indicated in @offset and then
     * shift out the data we don't need.  Note: this is big endian!
     */
    sz = max(size, 4);
    shift = size < 4 ? 4 - size : 0;

    emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0,
         pptr_reg(nfp_prog), offset, sz - 1, CMD_CTX_SWAP);

    i = 0;
    if (shift)
        emit_shf(nfp_prog, reg_both(dst_gpr), reg_none(), SHF_OP_NONE,
             reg_xfer(0), SHF_SC_R_SHF, shift * 8);
    else
        for (; i * 4 < size; i++)
            wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));

    if (i < 2)
        wrp_zext(nfp_prog, meta, dst_gpr);

    return 0;
}
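
/* Editor's example (not from the original source): a 1-byte load reads
 * 4 bytes (sz = max(size, 4)) and, since the data is big endian, the
 * wanted byte lands at the top of the transfer register, so it is
 * shifted right by (4 - 1) * 8 = 24 bits into the low byte of the
 * destination.
 */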

static int
data_ld_host_order(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
           u8 dst_gpr, swreg lreg, swreg rreg, int size,
           enum cmd_mode mode)
{
    unsigned int i;
    u8 mask, sz;

    /* We load the value from the address indicated in rreg + lreg and then
     * mask out the data we don't need.  Note: this is little endian!
     */
    sz = max(size, 4);
    mask = size < 4 ? GENMASK(size - 1, 0) : 0;

    emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, mode, 0,
         lreg, rreg, sz / 4 - 1, CMD_CTX_SWAP);

    i = 0;
    if (mask)
        emit_ld_field_any(nfp_prog, reg_both(dst_gpr), mask,
                  reg_xfer(0), SHF_SC_NONE, 0, true);
    else
        for (; i * 4 < size; i++)
            wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));

    if (i < 2)
        wrp_zext(nfp_prog, meta, dst_gpr);

    return 0;
}
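
/* Editor's example (not from the original source): here the data is
 * little endian, so a 2-byte load keeps the low bytes instead of
 * shifting: mask = GENMASK(1, 0) = 0x3 selects bytes 0 and 1 of the
 * transfer register and ld_field zeroes the rest of the destination.
 */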

static int
data_ld_host_order_addr32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
              u8 src_gpr, swreg offset, u8 dst_gpr, u8 size)
{
    return data_ld_host_order(nfp_prog, meta, dst_gpr, reg_a(src_gpr),
                  offset, size, CMD_MODE_32b);
}

static int
data_ld_host_order_addr40(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
              u8 src_gpr, swreg offset, u8 dst_gpr, u8 size)
{
    swreg rega, regb;

    addr40_offset(nfp_prog, src_gpr, offset, &rega, &regb);

    return data_ld_host_order(nfp_prog, meta, dst_gpr, rega, regb,
                  size, CMD_MODE_40b_BA);
}

static int
construct_data_ind_ld(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
              u16 offset, u16 src, u8 size)
{
    swreg tmp_reg;

    /* Calculate the true offset (src_reg + imm) */
    tmp_reg = ur_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
    emit_alu(nfp_prog, imm_both(nfp_prog), reg_a(src), ALU_OP_ADD, tmp_reg);

    /* Check packet length (size guaranteed to fit b/c it's u8) */
    emit_alu(nfp_prog, imm_a(nfp_prog),
         imm_a(nfp_prog), ALU_OP_ADD, reg_imm(size));
    emit_alu(nfp_prog, reg_none(),
         plen_reg(nfp_prog), ALU_OP_SUB, imm_a(nfp_prog));
    emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);

    /* Load data */
    return data_ld(nfp_prog, meta, imm_b(nfp_prog), 0, size);
}

static int
construct_data_ld(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
          u16 offset, u8 size)
{
    swreg tmp_reg;

    /* Check packet length */
    tmp_reg = ur_load_imm_any(nfp_prog, offset + size, imm_a(nfp_prog));
    emit_alu(nfp_prog, reg_none(), plen_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
    emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);

    /* Load data */
    tmp_reg = re_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
    return data_ld(nfp_prog, meta, tmp_reg, 0, size);
}

static int
data_stx_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
            u8 src_gpr, u8 size)
{
    unsigned int i;

    for (i = 0; i * 4 < size; i++)
        wrp_mov(nfp_prog, reg_xfer(i), reg_a(src_gpr + i));

    emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
         reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);

    return 0;
}

static int
data_st_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
           u64 imm, u8 size)
{
    wrp_immed(nfp_prog, reg_xfer(0), imm);
    if (size == 8)
        wrp_immed(nfp_prog, reg_xfer(1), imm >> 32);

    emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
         reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);

    return 0;
}

typedef int
(*lmem_step)(struct nfp_prog *nfp_prog, u8 gpr, u8 gpr_byte, s32 off,
         unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
         bool needs_inc);

static int
wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
          unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
          bool needs_inc)
{
    bool should_inc = needs_inc && new_gpr && !last;
    u32 idx, src_byte;
    enum shf_sc sc;
    swreg reg;
    int shf;
    u8 mask;

    if (WARN_ON_ONCE(dst_byte + size > 4 || off % 4 + size > 4))
        return -EOPNOTSUPP;

    idx = off / 4;

    /* Move the entire word */
    if (size == 4) {
        wrp_mov(nfp_prog, reg_both(dst),
            should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx));
        return 0;
    }

    if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
        return -EOPNOTSUPP;

    src_byte = off % 4;

    mask = (1 << size) - 1;
    mask <<= dst_byte;

    if (WARN_ON_ONCE(mask > 0xf))
        return -EOPNOTSUPP;

    shf = abs(src_byte - dst_byte) * 8;
    if (src_byte == dst_byte) {
        sc = SHF_SC_NONE;
    } else if (src_byte < dst_byte) {
        shf = 32 - shf;
        sc = SHF_SC_L_SHF;
    } else {
        sc = SHF_SC_R_SHF;
    }

    /* ld_field can address fewer indexes; if the offset is too large do
     * a RMW.  Because we RMW twice we waste 2 cycles on unaligned 8 byte
     * writes.
     */
    if (idx <= RE_REG_LM_IDX_MAX) {
        reg = reg_lm(lm3 ? 3 : 0, idx);
    } else {
        reg = imm_a(nfp_prog);
        /* If it's not the first part of the load and we start a new GPR
         * that means we are loading a second part of the LMEM word into
         * a new GPR.  IOW we've already looked at that LMEM word, so it
         * has already been loaded into imm_a().
         */
        if (first || !new_gpr)
            wrp_mov(nfp_prog, reg, reg_lm(0, idx));
    }

    emit_ld_field_any(nfp_prog, reg_both(dst), mask, reg, sc, shf, new_gpr);

    if (should_inc)
        wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));

    return 0;
}

static int
wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off,
           unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
           bool needs_inc)
{
    bool should_inc = needs_inc && new_gpr && !last;
    u32 idx, dst_byte;
    enum shf_sc sc;
    swreg reg;
    int shf;
    u8 mask;

    if (WARN_ON_ONCE(src_byte + size > 4 || off % 4 + size > 4))
        return -EOPNOTSUPP;

    idx = off / 4;

    /* Move the entire word */
    if (size == 4) {
        wrp_mov(nfp_prog,
            should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx),
            reg_b(src));
        return 0;
    }

    if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
        return -EOPNOTSUPP;

    dst_byte = off % 4;

    mask = (1 << size) - 1;
    mask <<= dst_byte;

    if (WARN_ON_ONCE(mask > 0xf))
        return -EOPNOTSUPP;

    shf = abs(src_byte - dst_byte) * 8;
    if (src_byte == dst_byte) {
        sc = SHF_SC_NONE;
    } else if (src_byte < dst_byte) {
        shf = 32 - shf;
        sc = SHF_SC_L_SHF;
    } else {
        sc = SHF_SC_R_SHF;
    }

    /* ld_field can address fewer indexes; if the offset is too large do
     * a RMW.  Because we RMW twice we waste 2 cycles on unaligned 8 byte
     * writes.
     */
    if (idx <= RE_REG_LM_IDX_MAX) {
        reg = reg_lm(lm3 ? 3 : 0, idx);
    } else {
        reg = imm_a(nfp_prog);
        /* Only the first and last LMEM locations are going to need RMW,
         * the middle locations will be overwritten fully.
         */
        if (first || last)
            wrp_mov(nfp_prog, reg, reg_lm(0, idx));
    }

    emit_ld_field(nfp_prog, reg, mask, reg_b(src), sc, shf);

    if (new_gpr || last) {
        if (idx > RE_REG_LM_IDX_MAX)
            wrp_mov(nfp_prog, reg_lm(0, idx), reg);
        if (should_inc)
            wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
    }

    return 0;
}

static int
mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
         unsigned int size, unsigned int ptr_off, u8 gpr, u8 ptr_gpr,
         bool clr_gpr, lmem_step step)
{
    s32 off = nfp_prog->stack_frame_depth + meta->insn.off + ptr_off;
    bool first = true, narrow_ld, last;
    bool needs_inc = false;
    swreg stack_off_reg;
    u8 prev_gpr = 255;
    u32 gpr_byte = 0;
    bool lm3 = true;
    int ret;

    if (meta->ptr_not_const ||
        meta->flags & FLAG_INSN_PTR_CALLER_STACK_FRAME) {
        /* Use of the last encountered ptr_off is OK, they all have
         * the same alignment.  We depend on the low bits of the value
         * being discarded when written to the LMaddr register.
         */
        stack_off_reg = ur_load_imm_any(nfp_prog, meta->insn.off,
                        stack_imm(nfp_prog));

        emit_alu(nfp_prog, imm_b(nfp_prog),
             reg_a(ptr_gpr), ALU_OP_ADD, stack_off_reg);

        needs_inc = true;
    } else if (off + size <= 64) {
        /* We can reach bottom 64B with LMaddr0 */
        lm3 = false;
    } else if (round_down(off, 32) == round_down(off + size - 1, 32)) {
        /* We have to set up a new pointer.  If we know the offset
         * and the entire access falls into a single 32 byte aligned
         * window we won't have to increment the LM pointer.
         * The 32 byte alignment is important because the offset is
         * ORed in, not added, when doing *l$indexN[off].
         */
        stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 32),
                        stack_imm(nfp_prog));
        emit_alu(nfp_prog, imm_b(nfp_prog),
             stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);

        off %= 32;
    } else {
        stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 4),
                        stack_imm(nfp_prog));

        emit_alu(nfp_prog, imm_b(nfp_prog),
             stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);

        needs_inc = true;
    }

    narrow_ld = clr_gpr && size < 8;

    if (lm3) {
        unsigned int nop_cnt;

        emit_csr_wr(nfp_prog, imm_b(nfp_prog), NFP_CSR_ACT_LM_ADDR3);
        /* For size < 4 one slot will be filled by the zeroing of the
         * upper half, but be careful, that zeroing could be eliminated
         * by the zext optimization.
         */
        nop_cnt = narrow_ld && meta->flags & FLAG_INSN_DO_ZEXT ? 2 : 3;
        wrp_nops(nfp_prog, nop_cnt);
    }

    if (narrow_ld)
        wrp_zext(nfp_prog, meta, gpr);

    while (size) {
        u32 slice_end;
        u8 slice_size;

        slice_size = min(size, 4 - gpr_byte);
        slice_end = min(off + slice_size, round_up(off + 1, 4));
        slice_size = slice_end - off;

        last = slice_size == size;

        if (needs_inc)
            off %= 4;

        ret = step(nfp_prog, gpr, gpr_byte, off, slice_size,
               first, gpr != prev_gpr, last, lm3, needs_inc);
        if (ret)
            return ret;

        prev_gpr = gpr;
        first = false;

        gpr_byte += slice_size;
        if (gpr_byte >= 4) {
            gpr_byte -= 4;
            gpr++;
        }

        size -= slice_size;
        off += slice_size;
    }

    return 0;
}
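
/* Editor's example (not from the original source): for a known stack
 * access at off = 100 with size = 4, round_down(100, 32) ==
 * round_down(103, 32) == 96, so LM3 is pointed at byte 96 and the
 * access uses off % 32 = 4 without any pointer increments; an access
 * crossing a 32-byte boundary falls into the needs_inc path instead.
 */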

static void
wrp_alu_imm(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u32 imm)
{
    swreg tmp_reg;

    if (alu_op == ALU_OP_AND) {
        if (!imm)
            wrp_immed(nfp_prog, reg_both(dst), 0);
        if (!imm || !~imm)
            return;
    }
    if (alu_op == ALU_OP_OR) {
        if (!~imm)
            wrp_immed(nfp_prog, reg_both(dst), ~0U);
        if (!imm || !~imm)
            return;
    }
    if (alu_op == ALU_OP_XOR) {
        if (!~imm)
            emit_alu(nfp_prog, reg_both(dst), reg_none(),
                 ALU_OP_NOT, reg_b(dst));
        if (!imm || !~imm)
            return;
    }

    tmp_reg = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
    emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, tmp_reg);
}

static int
wrp_alu64_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
          enum alu_op alu_op, bool skip)
{
    const struct bpf_insn *insn = &meta->insn;
    u64 imm = insn->imm; /* sign extend */

    if (skip) {
        meta->flags |= FLAG_INSN_SKIP_NOOP;
        return 0;
    }

    wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, imm & ~0U);
    wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, alu_op, imm >> 32);

    return 0;
}

static int
wrp_alu64_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
          enum alu_op alu_op)
{
    u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;

    emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
    emit_alu(nfp_prog, reg_both(dst + 1),
         reg_a(dst + 1), alu_op, reg_b(src + 1));

    return 0;
}

static int
wrp_alu32_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
          enum alu_op alu_op)
{
    const struct bpf_insn *insn = &meta->insn;
    u8 dst = insn->dst_reg * 2;

    wrp_alu_imm(nfp_prog, dst, alu_op, insn->imm);
    wrp_zext(nfp_prog, meta, dst);

    return 0;
}

static int
wrp_alu32_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
          enum alu_op alu_op)
{
    u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;

    emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
    wrp_zext(nfp_prog, meta, dst);

    return 0;
}

static void
wrp_test_reg_one(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u8 src,
         enum br_mask br_mask, u16 off)
{
    emit_alu(nfp_prog, reg_none(), reg_a(dst), alu_op, reg_b(src));
    emit_br(nfp_prog, br_mask, off, 0);
}

static int
wrp_test_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
         enum alu_op alu_op, enum br_mask br_mask)
{
    const struct bpf_insn *insn = &meta->insn;

    wrp_test_reg_one(nfp_prog, insn->dst_reg * 2, alu_op,
             insn->src_reg * 2, br_mask, insn->off);
    if (is_mbpf_jmp64(meta))
        wrp_test_reg_one(nfp_prog, insn->dst_reg * 2 + 1, alu_op,
                 insn->src_reg * 2 + 1, br_mask, insn->off);

    return 0;
}

static const struct jmp_code_map {
    enum br_mask br_mask;
    bool swap;
} jmp_code_map[] = {
    [BPF_JGT >> 4]  = { BR_BLO, true },
    [BPF_JGE >> 4]  = { BR_BHS, false },
    [BPF_JLT >> 4]  = { BR_BLO, false },
    [BPF_JLE >> 4]  = { BR_BHS, true },
    [BPF_JSGT >> 4] = { BR_BLT, true },
    [BPF_JSGE >> 4] = { BR_BGE, false },
    [BPF_JSLT >> 4] = { BR_BLT, false },
    [BPF_JSLE >> 4] = { BR_BGE, true },
};

static const struct jmp_code_map *nfp_jmp_code_get(struct nfp_insn_meta *meta)
{
    unsigned int op;

    op = BPF_OP(meta->insn.code) >> 4;
    /* br_mask of 0 is BR_BEQ which we don't use in the jump code table */
    if (WARN_ONCE(op >= ARRAY_SIZE(jmp_code_map) ||
              !jmp_code_map[op].br_mask,
              "no code found for jump instruction"))
        return NULL;

    return &jmp_code_map[op];
}
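
/* Editor's example (not from the original source): the NFP branch
 * masks only cover "lower" and "higher or same", so BPF_JGT (dst > src)
 * is implemented with swapped operands: compute src - dst and branch
 * on BR_BLO, i.e. "src < dst", which is the same condition.
 */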

static int cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
    const struct bpf_insn *insn = &meta->insn;
    u64 imm = insn->imm; /* sign extend */
    const struct jmp_code_map *code;
    enum alu_op alu_op, carry_op;
    u8 reg = insn->dst_reg * 2;
    swreg tmp_reg;

    code = nfp_jmp_code_get(meta);
    if (!code)
        return -EINVAL;

    alu_op = meta->jump_neg_op ? ALU_OP_ADD : ALU_OP_SUB;
    carry_op = meta->jump_neg_op ? ALU_OP_ADD_C : ALU_OP_SUB_C;

    tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
    if (!code->swap)
        emit_alu(nfp_prog, reg_none(), reg_a(reg), alu_op, tmp_reg);
    else
        emit_alu(nfp_prog, reg_none(), tmp_reg, alu_op, reg_a(reg));

    if (is_mbpf_jmp64(meta)) {
        tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
        if (!code->swap)
            emit_alu(nfp_prog, reg_none(),
                 reg_a(reg + 1), carry_op, tmp_reg);
        else
            emit_alu(nfp_prog, reg_none(),
                 tmp_reg, carry_op, reg_a(reg + 1));
    }

    emit_br(nfp_prog, code->br_mask, insn->off, 0);

    return 0;
}

static int cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
    const struct bpf_insn *insn = &meta->insn;
    const struct jmp_code_map *code;
    u8 areg, breg;

    code = nfp_jmp_code_get(meta);
    if (!code)
        return -EINVAL;

    areg = insn->dst_reg * 2;
    breg = insn->src_reg * 2;

    if (code->swap) {
        areg ^= breg;
        breg ^= areg;
        areg ^= breg;
    }

    emit_alu(nfp_prog, reg_none(), reg_a(areg), ALU_OP_SUB, reg_b(breg));
    if (is_mbpf_jmp64(meta))
        emit_alu(nfp_prog, reg_none(),
             reg_a(areg + 1), ALU_OP_SUB_C, reg_b(breg + 1));
    emit_br(nfp_prog, code->br_mask, insn->off, 0);

    return 0;
}

static void wrp_end32(struct nfp_prog *nfp_prog, swreg reg_in, u8 gpr_out)
{
    emit_ld_field(nfp_prog, reg_both(gpr_out), 0xf, reg_in,
              SHF_SC_R_ROT, 8);
    emit_ld_field(nfp_prog, reg_both(gpr_out), 0x5, reg_a(gpr_out),
              SHF_SC_R_ROT, 16);
}

static void
wrp_mul_u32(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
        swreg rreg, bool gen_high_half)
{
    emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
    emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_1, rreg);
    emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_2, rreg);
    emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_3, rreg);
    emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_4, rreg);
    emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_32x32, MUL_LAST, reg_none());
    if (gen_high_half)
        emit_mul(nfp_prog, dst_hi, MUL_TYPE_STEP_32x32, MUL_LAST_2,
             reg_none());
    else
        wrp_immed(nfp_prog, dst_hi, 0);
}

static void
wrp_mul_u16(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
        swreg rreg)
{
    emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
    emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_1, rreg);
    emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_2, rreg);
    emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_16x16, MUL_LAST, reg_none());
}
1499 
1500 static int
1501 wrp_mul(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1502     bool gen_high_half, bool ropnd_from_reg)
1503 {
1504     swreg multiplier, multiplicand, dst_hi, dst_lo;
1505     const struct bpf_insn *insn = &meta->insn;
1506     u32 lopnd_max, ropnd_max;
1507     u8 dst_reg;
1508 
1509     dst_reg = insn->dst_reg;
1510     multiplicand = reg_a(dst_reg * 2);
1511     dst_hi = reg_both(dst_reg * 2 + 1);
1512     dst_lo = reg_both(dst_reg * 2);
1513     lopnd_max = meta->umax_dst;
1514     if (ropnd_from_reg) {
1515         multiplier = reg_b(insn->src_reg * 2);
1516         ropnd_max = meta->umax_src;
1517     } else {
1518         u32 imm = insn->imm;
1519 
1520         multiplier = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1521         ropnd_max = imm;
1522     }
1523     if (lopnd_max > U16_MAX || ropnd_max > U16_MAX)
1524         wrp_mul_u32(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier,
1525                 gen_high_half);
1526     else
1527         wrp_mul_u16(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier);
1528 
1529     return 0;
1530 }
1531 
1532 static int wrp_div_imm(struct nfp_prog *nfp_prog, u8 dst, u64 imm)
1533 {
1534     swreg dst_both = reg_both(dst), dst_a = reg_a(dst), dst_b = reg_a(dst);
1535     struct reciprocal_value_adv rvalue;
1536     u8 pre_shift, exp;
1537     swreg magic;
1538 
1539     if (imm > U32_MAX) {
1540         wrp_immed(nfp_prog, dst_both, 0);
1541         return 0;
1542     }
1543 
1544     /* NOTE: because we are using "reciprocal_value_adv" which doesn't
1545      * support "divisor > (1u << 31)", we need to JIT separate NFP sequence
1546      * to handle such case which actually equals to the result of unsigned
1547      * comparison "dst >= imm" which could be calculated using the following
1548      * NFP sequence:
1549      *
1550      *  alu[--, dst, -, imm]
1551      *  immed[imm, 0]
1552      *  alu[dst, imm, +carry, 0]
1553      *
1554      */
1555     if (imm > 1U << 31) {
1556         swreg tmp_b = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1557 
1558         emit_alu(nfp_prog, reg_none(), dst_a, ALU_OP_SUB, tmp_b);
1559         wrp_immed(nfp_prog, imm_a(nfp_prog), 0);
1560         emit_alu(nfp_prog, dst_both, imm_a(nfp_prog), ALU_OP_ADD_C,
1561              reg_imm(0));
1562         return 0;
1563     }
1564 
1565     rvalue = reciprocal_value_adv(imm, 32);
1566     exp = rvalue.exp;
1567     if (rvalue.is_wide_m && !(imm & 1)) {
1568         pre_shift = fls(imm & -imm) - 1;
1569         rvalue = reciprocal_value_adv(imm >> pre_shift, 32 - pre_shift);
1570     } else {
1571         pre_shift = 0;
1572     }
1573     magic = ur_load_imm_any(nfp_prog, rvalue.m, imm_b(nfp_prog));
1574     if (imm == 1U << exp) {
1575         emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1576              SHF_SC_R_SHF, exp);
1577     } else if (rvalue.is_wide_m) {
1578         wrp_mul_u32(nfp_prog, imm_both(nfp_prog), reg_none(), dst_a,
1579                 magic, true);
1580         emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_SUB,
1581              imm_b(nfp_prog));
1582         emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1583              SHF_SC_R_SHF, 1);
1584         emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_ADD,
1585              imm_b(nfp_prog));
1586         emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1587              SHF_SC_R_SHF, rvalue.sh - 1);
1588     } else {
1589         if (pre_shift)
1590             emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
1591                  dst_b, SHF_SC_R_SHF, pre_shift);
1592         wrp_mul_u32(nfp_prog, dst_both, reg_none(), dst_a, magic, true);
1593         emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
1594              dst_b, SHF_SC_R_SHF, rvalue.sh);
1595     }
1596 
1597     return 0;
1598 }
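
/* Worked example (illustrative only): for imm == 6, reciprocal_value_adv()
 * returns m == 0xaaaaaaab, sh == 2 and is_wide_m == false, so the final
 * else branch above emits the equivalent of
 *
 *   dst = (u32)(((u64)dst * 0xaaaaaaab) >> 32) >> 2;
 *
 * e.g. dst == 100: (100 * 0xaaaaaaab) >> 34 == 16 == 100 / 6.
 */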
1599 
1600 static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1601 {
1602     swreg tmp = imm_a(nfp_prog), tmp_len = imm_b(nfp_prog);
1603     struct nfp_bpf_cap_adjust_head *adjust_head;
1604     u32 ret_einval, end;
1605 
1606     adjust_head = &nfp_prog->bpf->adjust_head;
1607 
1608     /* Optimized version - 5 vs 14 cycles */
1609     if (nfp_prog->adjust_head_location != UINT_MAX) {
1610         if (WARN_ON_ONCE(nfp_prog->adjust_head_location != meta->n))
1611             return -EINVAL;
1612 
1613         emit_alu(nfp_prog, pptr_reg(nfp_prog),
1614              reg_a(2 * 2), ALU_OP_ADD, pptr_reg(nfp_prog));
1615         emit_alu(nfp_prog, plen_reg(nfp_prog),
1616              plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1617         emit_alu(nfp_prog, pv_len(nfp_prog),
1618              pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1619 
1620         wrp_immed(nfp_prog, reg_both(0), 0);
1621         wrp_immed(nfp_prog, reg_both(1), 0);
1622 
1623         /* TODO: when adjust head is guaranteed to succeed we can
1624          * also eliminate the following if (r0 == 0) branch.
1625          */
1626 
1627         return 0;
1628     }
1629 
1630     ret_einval = nfp_prog_current_offset(nfp_prog) + 14;
1631     end = ret_einval + 2;
1632 
1633     /* We need to use a temp because the offset is just part of the pkt ptr */
1634     emit_alu(nfp_prog, tmp,
1635          reg_a(2 * 2), ALU_OP_ADD_2B, pptr_reg(nfp_prog));
1636 
1637     /* Validate result will fit within FW datapath constraints */
1638     emit_alu(nfp_prog, reg_none(),
1639          tmp, ALU_OP_SUB, reg_imm(adjust_head->off_min));
1640     emit_br(nfp_prog, BR_BLO, ret_einval, 0);
1641     emit_alu(nfp_prog, reg_none(),
1642          reg_imm(adjust_head->off_max), ALU_OP_SUB, tmp);
1643     emit_br(nfp_prog, BR_BLO, ret_einval, 0);
1644 
1645     /* Validate the length is at least ETH_HLEN */
1646     emit_alu(nfp_prog, tmp_len,
1647          plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1648     emit_alu(nfp_prog, reg_none(),
1649          tmp_len, ALU_OP_SUB, reg_imm(ETH_HLEN));
1650     emit_br(nfp_prog, BR_BMI, ret_einval, 0);
1651 
1652     /* Load the ret code */
1653     wrp_immed(nfp_prog, reg_both(0), 0);
1654     wrp_immed(nfp_prog, reg_both(1), 0);
1655 
1656     /* Modify the packet metadata */
1657     emit_ld_field(nfp_prog, pptr_reg(nfp_prog), 0x3, tmp, SHF_SC_NONE, 0);
1658 
1659     /* Skip over the -EINVAL ret code (defer 2) */
1660     emit_br(nfp_prog, BR_UNC, end, 2);
1661 
1662     emit_alu(nfp_prog, plen_reg(nfp_prog),
1663          plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1664     emit_alu(nfp_prog, pv_len(nfp_prog),
1665          pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1666 
1667     /* return -EINVAL target */
1668     if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
1669         return -EINVAL;
1670 
1671     wrp_immed(nfp_prog, reg_both(0), -22);
1672     wrp_immed(nfp_prog, reg_both(1), ~0);
1673 
1674     if (!nfp_prog_confirm_current_offset(nfp_prog, end))
1675         return -EINVAL;
1676 
1677     return 0;
1678 }
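
/* Note: BPF's r0 lives in the (gpr0, gpr1) pair, so the -EINVAL return
 * value is materialized above as the two 32-bit halves -22 (== -EINVAL)
 * and ~0.
 */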
1679 
1680 static int adjust_tail(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1681 {
1682     u32 ret_einval, end;
1683     swreg plen, delta;
1684 
1685     BUILD_BUG_ON(plen_reg(nfp_prog) != reg_b(STATIC_REG_PKT_LEN));
1686 
1687     plen = imm_a(nfp_prog);
1688     delta = reg_a(2 * 2);
1689 
1690     ret_einval = nfp_prog_current_offset(nfp_prog) + 9;
1691     end = nfp_prog_current_offset(nfp_prog) + 11;
1692 
1693     /* Calculate resulting length */
1694     emit_alu(nfp_prog, plen, plen_reg(nfp_prog), ALU_OP_ADD, delta);
1695     /* delta == 0 is not allowed by the kernel; a valid (negative) delta
1696      * must overflow the add and set carry, so carry clear means -EINVAL.
1697      */
1698     emit_br(nfp_prog, BR_BCC, ret_einval, 0);
1699 
1700     /* if (new_len < 14) then -EINVAL */
1701     emit_alu(nfp_prog, reg_none(), plen, ALU_OP_SUB, reg_imm(ETH_HLEN));
1702     emit_br(nfp_prog, BR_BMI, ret_einval, 0);
1703 
1704     emit_alu(nfp_prog, plen_reg(nfp_prog),
1705          plen_reg(nfp_prog), ALU_OP_ADD, delta);
1706     emit_alu(nfp_prog, pv_len(nfp_prog),
1707          pv_len(nfp_prog), ALU_OP_ADD, delta);
1708 
1709     emit_br(nfp_prog, BR_UNC, end, 2);
1710     wrp_immed(nfp_prog, reg_both(0), 0);
1711     wrp_immed(nfp_prog, reg_both(1), 0);
1712 
1713     if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
1714         return -EINVAL;
1715 
1716     wrp_immed(nfp_prog, reg_both(0), -22);
1717     wrp_immed(nfp_prog, reg_both(1), ~0);
1718 
1719     if (!nfp_prog_confirm_current_offset(nfp_prog, end))
1720         return -EINVAL;
1721 
1722     return 0;
1723 }
1724 
1725 static int
1726 map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1727 {
1728     bool load_lm_ptr;
1729     u32 ret_tgt;
1730     s64 lm_off;
1731 
1732     /* We only have to reload LM0 if the key is not at start of stack */
1733     lm_off = nfp_prog->stack_frame_depth;
1734     lm_off += meta->arg2.reg.var_off.value + meta->arg2.reg.off;
1735     load_lm_ptr = meta->arg2.var_off || lm_off;
1736 
1737     /* Set LM0 to start of key */
1738     if (load_lm_ptr)
1739         emit_csr_wr(nfp_prog, reg_b(2 * 2), NFP_CSR_ACT_LM_ADDR0);
1740     if (meta->func_id == BPF_FUNC_map_update_elem)
1741         emit_csr_wr(nfp_prog, reg_b(3 * 2), NFP_CSR_ACT_LM_ADDR2);
1742 
1743     emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1744              2, RELO_BR_HELPER);
1745     ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
1746 
1747     /* Load map ID into A0 */
1748     wrp_mov(nfp_prog, reg_a(0), reg_a(2));
1749 
1750     /* Load the return address into B0 */
1751     wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1752 
1753     if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1754         return -EINVAL;
1755 
1756     /* Reset the LM0 pointer */
1757     if (!load_lm_ptr)
1758         return 0;
1759 
1760     emit_csr_wr(nfp_prog, stack_reg(nfp_prog), NFP_CSR_ACT_LM_ADDR0);
1761     wrp_nops(nfp_prog, 3);
1762 
1763     return 0;
1764 }
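
/* Note on the sequence above: the helper branch is emitted with defer[2],
 * so the two loads that follow it (map ID into A0, return address into B0)
 * execute in its delay slots before control reaches the helper; ret_tgt is
 * then the first instruction after the call.
 */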
1765 
1766 static int
1767 nfp_get_prandom_u32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1768 {
1769     __emit_csr_rd(nfp_prog, NFP_CSR_PSEUDO_RND_NUM);
1770     /* CSR value is read in the following immed[gpr, 0] */
1771     emit_immed(nfp_prog, reg_both(0), 0,
1772            IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1773     emit_immed(nfp_prog, reg_both(1), 0,
1774            IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1775     return 0;
1776 }
1777 
1778 static int
1779 nfp_perf_event_output(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1780 {
1781     swreg ptr_type;
1782     u32 ret_tgt;
1783 
1784     ptr_type = ur_load_imm_any(nfp_prog, meta->arg1.type, imm_a(nfp_prog));
1785 
1786     ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
1787 
1788     emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1789              2, RELO_BR_HELPER);
1790 
1791     /* Load ptr type into A1 */
1792     wrp_mov(nfp_prog, reg_a(1), ptr_type);
1793 
1794     /* Load the return address into B0 */
1795     wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1796 
1797     if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1798         return -EINVAL;
1799 
1800     return 0;
1801 }
1802 
1803 static int
1804 nfp_queue_select(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1805 {
1806     u32 jmp_tgt;
1807 
1808     jmp_tgt = nfp_prog_current_offset(nfp_prog) + 5;
1809 
1810     /* Make sure the queue id fits into FW field */
1811     emit_alu(nfp_prog, reg_none(), reg_a(meta->insn.src_reg * 2),
1812          ALU_OP_AND_NOT_B, reg_imm(0xff));
1813     emit_br(nfp_prog, BR_BEQ, jmp_tgt, 2);
1814 
1815     /* Set the 'queue selected' bit and the queue value */
1816     emit_shf(nfp_prog, pv_qsel_set(nfp_prog),
1817          pv_qsel_set(nfp_prog), SHF_OP_OR, reg_imm(1),
1818          SHF_SC_L_SHF, PKT_VEL_QSEL_SET_BIT);
1819     emit_ld_field(nfp_prog,
1820               pv_qsel_val(nfp_prog), 0x1, reg_b(meta->insn.src_reg * 2),
1821               SHF_SC_NONE, 0);
1822     /* Delay slots end here; we will jump over the next instruction if
1823      * the queue value fits into the field.
1824      */
1825     emit_ld_field(nfp_prog,
1826               pv_qsel_val(nfp_prog), 0x1, reg_imm(NFP_NET_RXR_MAX),
1827               SHF_SC_NONE, 0);
1828 
1829     if (!nfp_prog_confirm_current_offset(nfp_prog, jmp_tgt))
1830         return -EINVAL;
1831 
1832     return 0;
1833 }
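
/* Illustrative layout of the sequence above (offsets from its start):
 *   0  alu[--, qid, AND~B, 0xff]   ; sets Z if the id fits into 8 bits
 *   1  br=0[5], defer[2]
 *   2    shf[...]                  ; delay slot: set 'queue selected' bit
 *   3    ld_field[...]             ; delay slot: write the queue value
 *   4  ld_field[...]               ; fall-through only: clamp to RXR_MAX
 *   5  <jmp_tgt>
 */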
1834 
1835 /* --- Callbacks --- */
1836 static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1837 {
1838     const struct bpf_insn *insn = &meta->insn;
1839     u8 dst = insn->dst_reg * 2;
1840     u8 src = insn->src_reg * 2;
1841 
1842     if (insn->src_reg == BPF_REG_10) {
1843         swreg stack_depth_reg;
1844 
1845         stack_depth_reg = ur_load_imm_any(nfp_prog,
1846                           nfp_prog->stack_frame_depth,
1847                           stack_imm(nfp_prog));
1848         emit_alu(nfp_prog, reg_both(dst), stack_reg(nfp_prog),
1849              ALU_OP_ADD, stack_depth_reg);
1850         wrp_immed(nfp_prog, reg_both(dst + 1), 0);
1851     } else {
1852         wrp_reg_mov(nfp_prog, dst, src);
1853         wrp_reg_mov(nfp_prog, dst + 1, src + 1);
1854     }
1855 
1856     return 0;
1857 }
1858 
1859 static int mov_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1860 {
1861     u64 imm = meta->insn.imm; /* sign extend */
1862 
1863     wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2), imm & ~0U);
1864     wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), imm >> 32);
1865 
1866     return 0;
1867 }
1868 
1869 static int xor_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1870 {
1871     return wrp_alu64_reg(nfp_prog, meta, ALU_OP_XOR);
1872 }
1873 
1874 static int xor_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1875 {
1876     return wrp_alu64_imm(nfp_prog, meta, ALU_OP_XOR, !meta->insn.imm);
1877 }
1878 
1879 static int and_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1880 {
1881     return wrp_alu64_reg(nfp_prog, meta, ALU_OP_AND);
1882 }
1883 
1884 static int and_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1885 {
1886     return wrp_alu64_imm(nfp_prog, meta, ALU_OP_AND, !~meta->insn.imm);
1887 }
1888 
1889 static int or_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1890 {
1891     return wrp_alu64_reg(nfp_prog, meta, ALU_OP_OR);
1892 }
1893 
1894 static int or_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1895 {
1896     return wrp_alu64_imm(nfp_prog, meta, ALU_OP_OR, !meta->insn.imm);
1897 }
1898 
1899 static int add_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1900 {
1901     const struct bpf_insn *insn = &meta->insn;
1902 
1903     emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1904          reg_a(insn->dst_reg * 2), ALU_OP_ADD,
1905          reg_b(insn->src_reg * 2));
1906     emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1907          reg_a(insn->dst_reg * 2 + 1), ALU_OP_ADD_C,
1908          reg_b(insn->src_reg * 2 + 1));
1909 
1910     return 0;
1911 }
1912 
1913 static int add_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1914 {
1915     const struct bpf_insn *insn = &meta->insn;
1916     u64 imm = insn->imm; /* sign extend */
1917 
1918     wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_ADD, imm & ~0U);
1919     wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_ADD_C, imm >> 32);
1920 
1921     return 0;
1922 }
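
/* Both 64-bit add variants chain the carry: the low words are added with
 * ALU_OP_ADD and the high words with ALU_OP_ADD_C, e.g. for dst_lo ==
 * 0xffffffff and imm == 1 the low add wraps to 0 with carry set, and the
 * high add consumes that carry.
 */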
1923 
1924 static int sub_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1925 {
1926     const struct bpf_insn *insn = &meta->insn;
1927 
1928     emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1929          reg_a(insn->dst_reg * 2), ALU_OP_SUB,
1930          reg_b(insn->src_reg * 2));
1931     emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1932          reg_a(insn->dst_reg * 2 + 1), ALU_OP_SUB_C,
1933          reg_b(insn->src_reg * 2 + 1));
1934 
1935     return 0;
1936 }
1937 
1938 static int sub_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1939 {
1940     const struct bpf_insn *insn = &meta->insn;
1941     u64 imm = insn->imm; /* sign extend */
1942 
1943     wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_SUB, imm & ~0U);
1944     wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_SUB_C, imm >> 32);
1945 
1946     return 0;
1947 }
1948 
1949 static int mul_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1950 {
1951     return wrp_mul(nfp_prog, meta, true, true);
1952 }
1953 
1954 static int mul_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1955 {
1956     return wrp_mul(nfp_prog, meta, true, false);
1957 }
1958 
1959 static int div_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1960 {
1961     const struct bpf_insn *insn = &meta->insn;
1962 
1963     return wrp_div_imm(nfp_prog, insn->dst_reg * 2, insn->imm);
1964 }
1965 
1966 static int div_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1967 {
1968     /* NOTE: verifier hook has rejected cases for which verifier doesn't
1969      * know whether the source operand is constant or not.
1970      */
1971     return wrp_div_imm(nfp_prog, meta->insn.dst_reg * 2, meta->umin_src);
1972 }
1973 
1974 static int neg_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1975 {
1976     const struct bpf_insn *insn = &meta->insn;
1977 
1978     emit_alu(nfp_prog, reg_both(insn->dst_reg * 2), reg_imm(0),
1979          ALU_OP_SUB, reg_b(insn->dst_reg * 2));
1980     emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1), reg_imm(0),
1981          ALU_OP_SUB_C, reg_b(insn->dst_reg * 2 + 1));
1982 
1983     return 0;
1984 }
1985 
1986 /* Pseudo code:
1987  *   if shift_amt >= 32
1988  *     dst_high = dst_low << shift_amt[4:0]
1989  *     dst_low = 0;
1990  *   else
1991  *     dst_high = (dst_high, dst_low) >> (32 - shift_amt)
1992  *     dst_low = dst_low << shift_amt
1993  *
1994  * The indirect shift will use the same logic at runtime.
1995  */
1996 static int __shl_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
1997 {
1998     if (!shift_amt)
1999         return 0;
2000 
2001     if (shift_amt < 32) {
2002         emit_shf(nfp_prog, reg_both(dst + 1), reg_a(dst + 1),
2003              SHF_OP_NONE, reg_b(dst), SHF_SC_R_DSHF,
2004              32 - shift_amt);
2005         emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2006              reg_b(dst), SHF_SC_L_SHF, shift_amt);
2007     } else if (shift_amt == 32) {
2008         wrp_reg_mov(nfp_prog, dst + 1, dst);
2009         wrp_immed(nfp_prog, reg_both(dst), 0);
2010     } else if (shift_amt > 32) {
2011         emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2012              reg_b(dst), SHF_SC_L_SHF, shift_amt - 32);
2013         wrp_immed(nfp_prog, reg_both(dst), 0);
2014     }
2015 
2016     return 0;
2017 }
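
/* Worked example for the shift_amt < 32 branch: for (dst_hi, dst_lo) ==
 * (0x1, 0x80000000) and shift_amt == 4, the double shift yields
 * dst_hi == (0x1:0x80000000) >> 28 == 0x18 and the plain shift yields
 * dst_lo == 0x80000000 << 4 == 0, i.e. 0x180000000 << 4 == 0x1800000000.
 */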
2018 
2019 static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2020 {
2021     const struct bpf_insn *insn = &meta->insn;
2022     u8 dst = insn->dst_reg * 2;
2023 
2024     return __shl_imm64(nfp_prog, dst, insn->imm);
2025 }
2026 
2027 static void shl_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2028 {
2029     emit_alu(nfp_prog, imm_both(nfp_prog), reg_imm(32), ALU_OP_SUB,
2030          reg_b(src));
2031     emit_alu(nfp_prog, reg_none(), imm_a(nfp_prog), ALU_OP_OR, reg_imm(0));
2032     emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_a(dst + 1), SHF_OP_NONE,
2033                reg_b(dst), SHF_SC_R_DSHF);
2034 }
2035 
2036 /* NOTE: for indirect left shift, HIGH part should be calculated first. */
2037 static void shl_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2038 {
2039     emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2040     emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2041                reg_b(dst), SHF_SC_L_SHF);
2042 }
2043 
2044 static void shl_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2045 {
2046     shl_reg64_lt32_high(nfp_prog, dst, src);
2047     shl_reg64_lt32_low(nfp_prog, dst, src);
2048 }
2049 
2050 static void shl_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2051 {
2052     emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2053     emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2054                reg_b(dst), SHF_SC_L_SHF);
2055     wrp_immed(nfp_prog, reg_both(dst), 0);
2056 }
2057 
2058 static int shl_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2059 {
2060     const struct bpf_insn *insn = &meta->insn;
2061     u64 umin, umax;
2062     u8 dst, src;
2063 
2064     dst = insn->dst_reg * 2;
2065     umin = meta->umin_src;
2066     umax = meta->umax_src;
2067     if (umin == umax)
2068         return __shl_imm64(nfp_prog, dst, umin);
2069 
2070     src = insn->src_reg * 2;
2071     if (umax < 32) {
2072         shl_reg64_lt32(nfp_prog, dst, src);
2073     } else if (umin >= 32) {
2074         shl_reg64_ge32(nfp_prog, dst, src);
2075     } else {
2076         /* Generate different instruction sequences depending on runtime
2077          * value of shift amount.
2078          */
2079         u16 label_ge32, label_end;
2080 
2081         label_ge32 = nfp_prog_current_offset(nfp_prog) + 7;
2082         emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2083 
2084         shl_reg64_lt32_high(nfp_prog, dst, src);
2085         label_end = nfp_prog_current_offset(nfp_prog) + 6;
2086         emit_br(nfp_prog, BR_UNC, label_end, 2);
2087         /* shl_reg64_lt32_low packed in delay slot. */
2088         shl_reg64_lt32_low(nfp_prog, dst, src);
2089 
2090         if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2091             return -EINVAL;
2092         shl_reg64_ge32(nfp_prog, dst, src);
2093 
2094         if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2095             return -EINVAL;
2096     }
2097 
2098     return 0;
2099 }
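
/* Illustrative layout of the mixed-range case above (offsets from the
 * br_bset):
 *   0    br_bset[src, 5, 7]      ; shift >= 32 iff bit 5 of the amount set
 *   1-3  shl_reg64_lt32_high     ; 3 insns
 *   4    br[10], defer[2]
 *   5-6    shl_reg64_lt32_low    ; runs in the delay slots
 *   7-9  shl_reg64_ge32          ; label_ge32
 *   10   <label_end>
 */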
2100 
2101 /* Pseudo code:
2102  *   if shift_amt >= 32
2103  *     dst_low = dst_high >> shift_amt[4:0]
2104  *     dst_high = 0;
2105  *   else
2106  *     dst_low = (dst_high, dst_low) >> shift_amt
2107  *     dst_high = dst_high >> shift_amt
2108  *
2109  * The indirect shift will use the same logic at runtime.
2110  */
2111 static int __shr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
2112 {
2113     if (!shift_amt)
2114         return 0;
2115 
2116     if (shift_amt < 32) {
2117         emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2118              reg_b(dst), SHF_SC_R_DSHF, shift_amt);
2119         emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2120              reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
2121     } else if (shift_amt == 32) {
2122         wrp_reg_mov(nfp_prog, dst, dst + 1);
2123         wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2124     } else if (shift_amt > 32) {
2125         emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2126              reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
2127         wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2128     }
2129 
2130     return 0;
2131 }
2132 
2133 static int shr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2134 {
2135     const struct bpf_insn *insn = &meta->insn;
2136     u8 dst = insn->dst_reg * 2;
2137 
2138     return __shr_imm64(nfp_prog, dst, insn->imm);
2139 }
2140 
2141 /* NOTE: for indirect right shift, LOW part should be calculated first. */
2142 static void shr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2143 {
2144     emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2145     emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2146                reg_b(dst + 1), SHF_SC_R_SHF);
2147 }
2148 
2149 static void shr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2150 {
2151     emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2152     emit_shf_indir(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2153                reg_b(dst), SHF_SC_R_DSHF);
2154 }
2155 
2156 static void shr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2157 {
2158     shr_reg64_lt32_low(nfp_prog, dst, src);
2159     shr_reg64_lt32_high(nfp_prog, dst, src);
2160 }
2161 
2162 static void shr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2163 {
2164     emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2165     emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2166                reg_b(dst + 1), SHF_SC_R_SHF);
2167     wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2168 }
2169 
2170 static int shr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2171 {
2172     const struct bpf_insn *insn = &meta->insn;
2173     u64 umin, umax;
2174     u8 dst, src;
2175 
2176     dst = insn->dst_reg * 2;
2177     umin = meta->umin_src;
2178     umax = meta->umax_src;
2179     if (umin == umax)
2180         return __shr_imm64(nfp_prog, dst, umin);
2181 
2182     src = insn->src_reg * 2;
2183     if (umax < 32) {
2184         shr_reg64_lt32(nfp_prog, dst, src);
2185     } else if (umin >= 32) {
2186         shr_reg64_ge32(nfp_prog, dst, src);
2187     } else {
2188         /* Generate different instruction sequences depending on runtime
2189          * value of shift amount.
2190          */
2191         u16 label_ge32, label_end;
2192 
2193         label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
2194         emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2195         shr_reg64_lt32_low(nfp_prog, dst, src);
2196         label_end = nfp_prog_current_offset(nfp_prog) + 6;
2197         emit_br(nfp_prog, BR_UNC, label_end, 2);
2198         /* shr_reg64_lt32_high packed in delay slot. */
2199         shr_reg64_lt32_high(nfp_prog, dst, src);
2200 
2201         if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2202             return -EINVAL;
2203         shr_reg64_ge32(nfp_prog, dst, src);
2204 
2205         if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2206             return -EINVAL;
2207     }
2208 
2209     return 0;
2210 }
2211 
2212 /* Code logic is the same as __shr_imm64 except ashr requires the
2213  * signedness bit, supplied via the MSB of the PREV_ALU result.
2214  */
2215 static int __ashr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
2216 {
2217     if (!shift_amt)
2218         return 0;
2219 
2220     if (shift_amt < 32) {
2221         emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2222              reg_b(dst), SHF_SC_R_DSHF, shift_amt);
2223         /* Set signedness bit. */
2224         emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
2225              reg_imm(0));
2226         emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2227              reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
2228     } else if (shift_amt == 32) {
2229         /* NOTE: this also helps setting signedness bit. */
2230         wrp_reg_mov(nfp_prog, dst, dst + 1);
2231         emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2232              reg_b(dst + 1), SHF_SC_R_SHF, 31);
2233     } else if (shift_amt > 32) {
2234         emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
2235              reg_imm(0));
2236         emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2237              reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
2238         emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2239              reg_b(dst + 1), SHF_SC_R_SHF, 31);
2240     }
2241 
2242     return 0;
2243 }
2244 
2245 static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2246 {
2247     const struct bpf_insn *insn = &meta->insn;
2248     u8 dst = insn->dst_reg * 2;
2249 
2250     return __ashr_imm64(nfp_prog, dst, insn->imm);
2251 }
2252 
2253 static void ashr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2254 {
2255     /* NOTE: the first insn sets both indirect shift amount (source A) and
2256      * signedness bit (MSB of result; safe since shift_amt < 32 here).
2257      */
2258     emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
2259     emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2260                reg_b(dst + 1), SHF_SC_R_SHF);
2261 }
2262 
2263 static void ashr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2264 {
2265     /* NOTE: it is the same as logic shift because we don't need to shift in
2266      * signedness bit when the shift amount is less than 32.
2267      */
2268     return shr_reg64_lt32_low(nfp_prog, dst, src);
2269 }
2270 
2271 static void ashr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2272 {
2273     ashr_reg64_lt32_low(nfp_prog, dst, src);
2274     ashr_reg64_lt32_high(nfp_prog, dst, src);
2275 }
2276 
2277 static void ashr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2278 {
2279     emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
2280     emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2281                reg_b(dst + 1), SHF_SC_R_SHF);
2282     emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2283          reg_b(dst + 1), SHF_SC_R_SHF, 31);
2284 }
2285 
2286 /* Like ashr_imm64, but need to use indirect shift. */
2287 static int ashr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2288 {
2289     const struct bpf_insn *insn = &meta->insn;
2290     u64 umin, umax;
2291     u8 dst, src;
2292 
2293     dst = insn->dst_reg * 2;
2294     umin = meta->umin_src;
2295     umax = meta->umax_src;
2296     if (umin == umax)
2297         return __ashr_imm64(nfp_prog, dst, umin);
2298 
2299     src = insn->src_reg * 2;
2300     if (umax < 32) {
2301         ashr_reg64_lt32(nfp_prog, dst, src);
2302     } else if (umin >= 32) {
2303         ashr_reg64_ge32(nfp_prog, dst, src);
2304     } else {
2305         u16 label_ge32, label_end;
2306 
2307         label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
2308         emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2309         ashr_reg64_lt32_low(nfp_prog, dst, src);
2310         label_end = nfp_prog_current_offset(nfp_prog) + 6;
2311         emit_br(nfp_prog, BR_UNC, label_end, 2);
2312         /* ashr_reg64_lt32_high packed in delay slot. */
2313         ashr_reg64_lt32_high(nfp_prog, dst, src);
2314 
2315         if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2316             return -EINVAL;
2317         ashr_reg64_ge32(nfp_prog, dst, src);
2318 
2319         if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2320             return -EINVAL;
2321     }
2322 
2323     return 0;
2324 }
2325 
2326 static int mov_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2327 {
2328     const struct bpf_insn *insn = &meta->insn;
2329 
2330     wrp_reg_mov(nfp_prog, insn->dst_reg * 2,  insn->src_reg * 2);
2331     wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2332 
2333     return 0;
2334 }
2335 
2336 static int mov_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2337 {
2338     const struct bpf_insn *insn = &meta->insn;
2339 
2340     wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2), insn->imm);
2341     wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2342 
2343     return 0;
2344 }
2345 
2346 static int xor_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2347 {
2348     return wrp_alu32_reg(nfp_prog, meta, ALU_OP_XOR);
2349 }
2350 
2351 static int xor_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2352 {
2353     return wrp_alu32_imm(nfp_prog, meta, ALU_OP_XOR);
2354 }
2355 
2356 static int and_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2357 {
2358     return wrp_alu32_reg(nfp_prog, meta, ALU_OP_AND);
2359 }
2360 
2361 static int and_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2362 {
2363     return wrp_alu32_imm(nfp_prog, meta, ALU_OP_AND);
2364 }
2365 
2366 static int or_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2367 {
2368     return wrp_alu32_reg(nfp_prog, meta, ALU_OP_OR);
2369 }
2370 
2371 static int or_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2372 {
2373     return wrp_alu32_imm(nfp_prog, meta, ALU_OP_OR);
2374 }
2375 
2376 static int add_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2377 {
2378     return wrp_alu32_reg(nfp_prog, meta, ALU_OP_ADD);
2379 }
2380 
2381 static int add_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2382 {
2383     return wrp_alu32_imm(nfp_prog, meta, ALU_OP_ADD);
2384 }
2385 
2386 static int sub_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2387 {
2388     return wrp_alu32_reg(nfp_prog, meta, ALU_OP_SUB);
2389 }
2390 
2391 static int sub_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2392 {
2393     return wrp_alu32_imm(nfp_prog, meta, ALU_OP_SUB);
2394 }
2395 
2396 static int mul_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2397 {
2398     return wrp_mul(nfp_prog, meta, false, true);
2399 }
2400 
2401 static int mul_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2402 {
2403     return wrp_mul(nfp_prog, meta, false, false);
2404 }
2405 
2406 static int div_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2407 {
2408     return div_reg64(nfp_prog, meta);
2409 }
2410 
2411 static int div_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2412 {
2413     return div_imm64(nfp_prog, meta);
2414 }
2415 
2416 static int neg_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2417 {
2418     u8 dst = meta->insn.dst_reg * 2;
2419 
2420     emit_alu(nfp_prog, reg_both(dst), reg_imm(0), ALU_OP_SUB, reg_b(dst));
2421     wrp_zext(nfp_prog, meta, dst);
2422 
2423     return 0;
2424 }
2425 
2426 static int
2427 __ashr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst,
2428        u8 shift_amt)
2429 {
2430     if (shift_amt) {
2431         /* Set signedness bit (MSB of result). */
2432         emit_alu(nfp_prog, reg_none(), reg_a(dst), ALU_OP_OR,
2433              reg_imm(0));
2434         emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2435              reg_b(dst), SHF_SC_R_SHF, shift_amt);
2436     }
2437     wrp_zext(nfp_prog, meta, dst);
2438 
2439     return 0;
2440 }
2441 
2442 static int ashr_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2443 {
2444     const struct bpf_insn *insn = &meta->insn;
2445     u64 umin, umax;
2446     u8 dst, src;
2447 
2448     dst = insn->dst_reg * 2;
2449     umin = meta->umin_src;
2450     umax = meta->umax_src;
2451     if (umin == umax)
2452         return __ashr_imm(nfp_prog, meta, dst, umin);
2453 
2454     src = insn->src_reg * 2;
2455     /* NOTE: the first insn will set both indirect shift amount (source A)
2456      * and signedness bit (MSB of result).
2457      */
2458     emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst));
2459     emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2460                reg_b(dst), SHF_SC_R_SHF);
2461     wrp_zext(nfp_prog, meta, dst);
2462 
2463     return 0;
2464 }
2465 
2466 static int ashr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2467 {
2468     const struct bpf_insn *insn = &meta->insn;
2469     u8 dst = insn->dst_reg * 2;
2470 
2471     return __ashr_imm(nfp_prog, meta, dst, insn->imm);
2472 }
2473 
2474 static int
2475 __shr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst,
2476       u8 shift_amt)
2477 {
2478     if (shift_amt)
2479         emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2480              reg_b(dst), SHF_SC_R_SHF, shift_amt);
2481     wrp_zext(nfp_prog, meta, dst);
2482     return 0;
2483 }
2484 
2485 static int shr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2486 {
2487     const struct bpf_insn *insn = &meta->insn;
2488     u8 dst = insn->dst_reg * 2;
2489 
2490     return __shr_imm(nfp_prog, meta, dst, insn->imm);
2491 }
2492 
2493 static int shr_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2494 {
2495     const struct bpf_insn *insn = &meta->insn;
2496     u64 umin, umax;
2497     u8 dst, src;
2498 
2499     dst = insn->dst_reg * 2;
2500     umin = meta->umin_src;
2501     umax = meta->umax_src;
2502     if (umin == umax)
2503         return __shr_imm(nfp_prog, meta, dst, umin);
2504 
2505     src = insn->src_reg * 2;
2506     emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2507     emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2508                reg_b(dst), SHF_SC_R_SHF);
2509     wrp_zext(nfp_prog, meta, dst);
2510     return 0;
2511 }
2512 
2513 static int
2514 __shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst,
2515       u8 shift_amt)
2516 {
2517     if (shift_amt)
2518         emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2519              reg_b(dst), SHF_SC_L_SHF, shift_amt);
2520     wrp_zext(nfp_prog, meta, dst);
2521     return 0;
2522 }
2523 
2524 static int shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2525 {
2526     const struct bpf_insn *insn = &meta->insn;
2527     u8 dst = insn->dst_reg * 2;
2528 
2529     return __shl_imm(nfp_prog, meta, dst, insn->imm);
2530 }
2531 
2532 static int shl_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2533 {
2534     const struct bpf_insn *insn = &meta->insn;
2535     u64 umin, umax;
2536     u8 dst, src;
2537 
2538     dst = insn->dst_reg * 2;
2539     umin = meta->umin_src;
2540     umax = meta->umax_src;
2541     if (umin == umax)
2542         return __shl_imm(nfp_prog, meta, dst, umin);
2543 
2544     src = insn->src_reg * 2;
2545     shl_reg64_lt32_low(nfp_prog, dst, src);
2546     wrp_zext(nfp_prog, meta, dst);
2547     return 0;
2548 }
2549 
2550 static int end_reg32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2551 {
2552     const struct bpf_insn *insn = &meta->insn;
2553     u8 gpr = insn->dst_reg * 2;
2554 
2555     switch (insn->imm) {
2556     case 16:
2557         emit_ld_field(nfp_prog, reg_both(gpr), 0x9, reg_b(gpr),
2558                   SHF_SC_R_ROT, 8);
2559         emit_ld_field(nfp_prog, reg_both(gpr), 0xe, reg_a(gpr),
2560                   SHF_SC_R_SHF, 16);
2561 
2562         wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
2563         break;
2564     case 32:
2565         wrp_end32(nfp_prog, reg_a(gpr), gpr);
2566         wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
2567         break;
2568     case 64:
2569         wrp_mov(nfp_prog, imm_a(nfp_prog), reg_b(gpr + 1));
2570 
2571         wrp_end32(nfp_prog, reg_a(gpr), gpr + 1);
2572         wrp_end32(nfp_prog, imm_a(nfp_prog), gpr);
2573         break;
2574     }
2575 
2576     return 0;
2577 }
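
/* Worked example for the 16-bit swap with gpr == 0xaabbccdd: the
 * rotate-by-8 ld_field (mask 0x9, bytes 0 and 3) gives 0xddbbcccc, and the
 * shift-by-16 ld_field (mask 0xe) clears the top half, leaving 0x0000ddcc,
 * i.e. the low 16 bits byte swapped.
 */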
2578 
2579 static int imm_ld8_part2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2580 {
2581     struct nfp_insn_meta *prev = nfp_meta_prev(meta);
2582     u32 imm_lo, imm_hi;
2583     u8 dst;
2584 
2585     dst = prev->insn.dst_reg * 2;
2586     imm_lo = prev->insn.imm;
2587     imm_hi = meta->insn.imm;
2588 
2589     wrp_immed(nfp_prog, reg_both(dst), imm_lo);
2590 
2591     /* mov is always 1 insn, load imm may be two, so try to use mov */
2592     if (imm_hi == imm_lo)
2593         wrp_mov(nfp_prog, reg_both(dst + 1), reg_a(dst));
2594     else
2595         wrp_immed(nfp_prog, reg_both(dst + 1), imm_hi);
2596 
2597     return 0;
2598 }
2599 
2600 static int imm_ld8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2601 {
2602     meta->double_cb = imm_ld8_part2;
2603     return 0;
2604 }
2605 
2606 static int data_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2607 {
2608     return construct_data_ld(nfp_prog, meta, meta->insn.imm, 1);
2609 }
2610 
2611 static int data_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2612 {
2613     return construct_data_ld(nfp_prog, meta, meta->insn.imm, 2);
2614 }
2615 
2616 static int data_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2617 {
2618     return construct_data_ld(nfp_prog, meta, meta->insn.imm, 4);
2619 }
2620 
2621 static int data_ind_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2622 {
2623     return construct_data_ind_ld(nfp_prog, meta, meta->insn.imm,
2624                      meta->insn.src_reg * 2, 1);
2625 }
2626 
2627 static int data_ind_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2628 {
2629     return construct_data_ind_ld(nfp_prog, meta, meta->insn.imm,
2630                      meta->insn.src_reg * 2, 2);
2631 }
2632 
2633 static int data_ind_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2634 {
2635     return construct_data_ind_ld(nfp_prog, meta, meta->insn.imm,
2636                      meta->insn.src_reg * 2, 4);
2637 }
2638 
2639 static int
2640 mem_ldx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2641           unsigned int size, unsigned int ptr_off)
2642 {
2643     return mem_op_stack(nfp_prog, meta, size, ptr_off,
2644                 meta->insn.dst_reg * 2, meta->insn.src_reg * 2,
2645                 true, wrp_lmem_load);
2646 }
2647 
2648 static int mem_ldx_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2649                u8 size)
2650 {
2651     swreg dst = reg_both(meta->insn.dst_reg * 2);
2652 
2653     switch (meta->insn.off) {
2654     case offsetof(struct __sk_buff, len):
2655         if (size != sizeof_field(struct __sk_buff, len))
2656             return -EOPNOTSUPP;
2657         wrp_mov(nfp_prog, dst, plen_reg(nfp_prog));
2658         break;
2659     case offsetof(struct __sk_buff, data):
2660         if (size != sizeof_field(struct __sk_buff, data))
2661             return -EOPNOTSUPP;
2662         wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
2663         break;
2664     case offsetof(struct __sk_buff, data_end):
2665         if (size != sizeof_field(struct __sk_buff, data_end))
2666             return -EOPNOTSUPP;
2667         emit_alu(nfp_prog, dst,
2668              plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
2669         break;
2670     default:
2671         return -EOPNOTSUPP;
2672     }
2673 
2674     wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2675 
2676     return 0;
2677 }
2678 
2679 static int mem_ldx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2680                u8 size)
2681 {
2682     swreg dst = reg_both(meta->insn.dst_reg * 2);
2683 
2684     switch (meta->insn.off) {
2685     case offsetof(struct xdp_md, data):
2686         if (size != sizeof_field(struct xdp_md, data))
2687             return -EOPNOTSUPP;
2688         wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
2689         break;
2690     case offsetof(struct xdp_md, data_end):
2691         if (size != sizeof_field(struct xdp_md, data_end))
2692             return -EOPNOTSUPP;
2693         emit_alu(nfp_prog, dst,
2694              plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
2695         break;
2696     default:
2697         return -EOPNOTSUPP;
2698     }
2699 
2700     wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2701 
2702     return 0;
2703 }
2704 
2705 static int
2706 mem_ldx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2707          unsigned int size)
2708 {
2709     swreg tmp_reg;
2710 
2711     tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2712 
2713     return data_ld_host_order_addr32(nfp_prog, meta, meta->insn.src_reg * 2,
2714                      tmp_reg, meta->insn.dst_reg * 2, size);
2715 }
2716 
2717 static int
2718 mem_ldx_emem(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2719          unsigned int size)
2720 {
2721     swreg tmp_reg;
2722 
2723     tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2724 
2725     return data_ld_host_order_addr40(nfp_prog, meta, meta->insn.src_reg * 2,
2726                      tmp_reg, meta->insn.dst_reg * 2, size);
2727 }
2728 
2729 static void
2730 mem_ldx_data_init_pktcache(struct nfp_prog *nfp_prog,
2731                struct nfp_insn_meta *meta)
2732 {
2733     s16 range_start = meta->pkt_cache.range_start;
2734     s16 range_end = meta->pkt_cache.range_end;
2735     swreg src_base, off;
2736     u8 xfer_num, len;
2737     bool indir;
2738 
2739     off = re_load_imm_any(nfp_prog, range_start, imm_b(nfp_prog));
2740     src_base = reg_a(meta->insn.src_reg * 2);
2741     len = range_end - range_start;
2742     xfer_num = round_up(len, REG_WIDTH) / REG_WIDTH;
2743 
2744     indir = len > 8 * REG_WIDTH;
2745     /* Setup PREV_ALU for indirect mode. */
2746     if (indir)
2747         wrp_immed(nfp_prog, reg_none(),
2748               CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
2749 
2750     /* Cache memory into transfer-in registers. */
2751     emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base,
2752              off, xfer_num - 1, CMD_CTX_SWAP, indir);
2753 }
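
/* E.g. a 40 byte cache range gives xfer_num == 10, which exceeds the 8
 * transfer registers encodable directly, so the length is supplied through
 * the PREV_ALU override (indir) instead.
 */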
2754 
2755 static int
2756 mem_ldx_data_from_pktcache_unaligned(struct nfp_prog *nfp_prog,
2757                      struct nfp_insn_meta *meta,
2758                      unsigned int size)
2759 {
2760     s16 range_start = meta->pkt_cache.range_start;
2761     s16 insn_off = meta->insn.off - range_start;
2762     swreg dst_lo, dst_hi, src_lo, src_mid;
2763     u8 dst_gpr = meta->insn.dst_reg * 2;
2764     u8 len_lo = size, len_mid = 0;
2765     u8 idx = insn_off / REG_WIDTH;
2766     u8 off = insn_off % REG_WIDTH;
2767 
2768     dst_hi = reg_both(dst_gpr + 1);
2769     dst_lo = reg_both(dst_gpr);
2770     src_lo = reg_xfer(idx);
2771 
2772     /* The read length could involve as many as three registers. */
2773     if (size > REG_WIDTH - off) {
2774         /* Calculate the part in the second register. */
2775         len_lo = REG_WIDTH - off;
2776         len_mid = size - len_lo;
2777 
2778         /* Calculate the part in the third register. */
2779         if (size > 2 * REG_WIDTH - off)
2780             len_mid = REG_WIDTH;
2781     }
2782 
2783     wrp_reg_subpart(nfp_prog, dst_lo, src_lo, len_lo, off);
2784 
2785     if (!len_mid) {
2786         wrp_zext(nfp_prog, meta, dst_gpr);
2787         return 0;
2788     }
2789 
2790     src_mid = reg_xfer(idx + 1);
2791 
2792     if (size <= REG_WIDTH) {
2793         wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid, len_mid, len_lo);
2794         wrp_zext(nfp_prog, meta, dst_gpr);
2795     } else {
2796         swreg src_hi = reg_xfer(idx + 2);
2797 
2798         wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid,
2799                    REG_WIDTH - len_lo, len_lo);
2800         wrp_reg_subpart(nfp_prog, dst_hi, src_mid, len_lo,
2801                 REG_WIDTH - len_lo);
2802         wrp_reg_or_subpart(nfp_prog, dst_hi, src_hi, REG_WIDTH - len_lo,
2803                    len_lo);
2804     }
2805 
2806     return 0;
2807 }
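
/* E.g. size == 8 at off == 3 (REG_WIDTH == 4) spans three transfer regs:
 * len_lo == 1, len_mid == 4, so dst_lo takes 1 byte of src_lo plus 3 bytes
 * of src_mid, and dst_hi takes the last byte of src_mid plus 3 bytes of
 * src_hi.
 */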
2808 
2809 static int
2810 mem_ldx_data_from_pktcache_aligned(struct nfp_prog *nfp_prog,
2811                    struct nfp_insn_meta *meta,
2812                    unsigned int size)
2813 {
2814     swreg dst_lo, dst_hi, src_lo;
2815     u8 dst_gpr, idx;
2816 
2817     idx = (meta->insn.off - meta->pkt_cache.range_start) / REG_WIDTH;
2818     dst_gpr = meta->insn.dst_reg * 2;
2819     dst_hi = reg_both(dst_gpr + 1);
2820     dst_lo = reg_both(dst_gpr);
2821     src_lo = reg_xfer(idx);
2822 
2823     if (size < REG_WIDTH) {
2824         wrp_reg_subpart(nfp_prog, dst_lo, src_lo, size, 0);
2825         wrp_zext(nfp_prog, meta, dst_gpr);
2826     } else if (size == REG_WIDTH) {
2827         wrp_mov(nfp_prog, dst_lo, src_lo);
2828         wrp_zext(nfp_prog, meta, dst_gpr);
2829     } else {
2830         swreg src_hi = reg_xfer(idx + 1);
2831 
2832         wrp_mov(nfp_prog, dst_lo, src_lo);
2833         wrp_mov(nfp_prog, dst_hi, src_hi);
2834     }
2835 
2836     return 0;
2837 }
2838 
2839 static int
2840 mem_ldx_data_from_pktcache(struct nfp_prog *nfp_prog,
2841                struct nfp_insn_meta *meta, unsigned int size)
2842 {
2843     u8 off = meta->insn.off - meta->pkt_cache.range_start;
2844 
2845     if (IS_ALIGNED(off, REG_WIDTH))
2846         return mem_ldx_data_from_pktcache_aligned(nfp_prog, meta, size);
2847 
2848     return mem_ldx_data_from_pktcache_unaligned(nfp_prog, meta, size);
2849 }
2850 
2851 static int
2852 mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2853     unsigned int size)
2854 {
2855     if (meta->ldst_gather_len)
2856         return nfp_cpp_memcpy(nfp_prog, meta);
2857 
2858     if (meta->ptr.type == PTR_TO_CTX) {
2859         if (nfp_prog->type == BPF_PROG_TYPE_XDP)
2860             return mem_ldx_xdp(nfp_prog, meta, size);
2861         else
2862             return mem_ldx_skb(nfp_prog, meta, size);
2863     }
2864 
2865     if (meta->ptr.type == PTR_TO_PACKET) {
2866         if (meta->pkt_cache.range_end) {
2867             if (meta->pkt_cache.do_init)
2868                 mem_ldx_data_init_pktcache(nfp_prog, meta);
2869 
2870             return mem_ldx_data_from_pktcache(nfp_prog, meta, size);
2871         } else {
2872             return mem_ldx_data(nfp_prog, meta, size);
2873         }
2874     }
2875 
2876     if (meta->ptr.type == PTR_TO_STACK)
2877         return mem_ldx_stack(nfp_prog, meta, size,
2878                      meta->ptr.off + meta->ptr.var_off.value);
2879 
2880     if (meta->ptr.type == PTR_TO_MAP_VALUE)
2881         return mem_ldx_emem(nfp_prog, meta, size);
2882 
2883     return -EOPNOTSUPP;
2884 }
2885 
2886 static int mem_ldx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2887 {
2888     return mem_ldx(nfp_prog, meta, 1);
2889 }
2890 
2891 static int mem_ldx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2892 {
2893     return mem_ldx(nfp_prog, meta, 2);
2894 }
2895 
2896 static int mem_ldx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2897 {
2898     return mem_ldx(nfp_prog, meta, 4);
2899 }
2900 
2901 static int mem_ldx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2902 {
2903     return mem_ldx(nfp_prog, meta, 8);
2904 }
2905 
2906 static int
2907 mem_st_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2908         unsigned int size)
2909 {
2910     u64 imm = meta->insn.imm; /* sign extend */
2911     swreg off_reg;
2912 
2913     off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2914 
2915     return data_st_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2916                   imm, size);
2917 }
2918 
2919 static int mem_st(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2920           unsigned int size)
2921 {
2922     if (meta->ptr.type == PTR_TO_PACKET)
2923         return mem_st_data(nfp_prog, meta, size);
2924 
2925     return -EOPNOTSUPP;
2926 }
2927 
2928 static int mem_st1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2929 {
2930     return mem_st(nfp_prog, meta, 1);
2931 }
2932 
2933 static int mem_st2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2934 {
2935     return mem_st(nfp_prog, meta, 2);
2936 }
2937 
2938 static int mem_st4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2939 {
2940     return mem_st(nfp_prog, meta, 4);
2941 }
2942 
2943 static int mem_st8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2944 {
2945     return mem_st(nfp_prog, meta, 8);
2946 }
2947 
2948 static int
2949 mem_stx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2950          unsigned int size)
2951 {
2952     swreg off_reg;
2953 
2954     off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2955 
2956     return data_stx_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2957                    meta->insn.src_reg * 2, size);
2958 }
2959 
2960 static int
2961 mem_stx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2962           unsigned int size, unsigned int ptr_off)
2963 {
2964     return mem_op_stack(nfp_prog, meta, size, ptr_off,
2965                 meta->insn.src_reg * 2, meta->insn.dst_reg * 2,
2966                 false, wrp_lmem_store);
2967 }
2968 
2969 static int mem_stx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2970 {
2971     switch (meta->insn.off) {
2972     case offsetof(struct xdp_md, rx_queue_index):
2973         return nfp_queue_select(nfp_prog, meta);
2974     }
2975 
2976     WARN_ON_ONCE(1); /* verifier should have rejected bad accesses */
2977     return -EOPNOTSUPP;
2978 }
2979 
2980 static int
2981 mem_stx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2982     unsigned int size)
2983 {
2984     if (meta->ptr.type == PTR_TO_PACKET)
2985         return mem_stx_data(nfp_prog, meta, size);
2986 
2987     if (meta->ptr.type == PTR_TO_STACK)
2988         return mem_stx_stack(nfp_prog, meta, size,
2989                      meta->ptr.off + meta->ptr.var_off.value);
2990 
2991     return -EOPNOTSUPP;
2992 }
2993 
2994 static int mem_stx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2995 {
2996     return mem_stx(nfp_prog, meta, 1);
2997 }
2998 
2999 static int mem_stx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3000 {
3001     return mem_stx(nfp_prog, meta, 2);
3002 }
3003 
3004 static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3005 {
3006     if (meta->ptr.type == PTR_TO_CTX)
3007         if (nfp_prog->type == BPF_PROG_TYPE_XDP)
3008             return mem_stx_xdp(nfp_prog, meta);
3009     return mem_stx(nfp_prog, meta, 4);
3010 }
3011 
3012 static int mem_stx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3013 {
3014     return mem_stx(nfp_prog, meta, 8);
3015 }
3016 
3017 static int
3018 mem_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, bool is64)
3019 {
3020     u8 dst_gpr = meta->insn.dst_reg * 2;
3021     u8 src_gpr = meta->insn.src_reg * 2;
3022     unsigned int full_add, out;
3023     swreg addra, addrb, off;
3024 
3025     off = ur_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
3026 
3027     /* We can fit 16 bits into the command immediate.  If we know the
3028      * value is guaranteed to either always or never fit into 16 bits,
3029      * we only generate code to handle that particular case; otherwise
3030      * we generate code for both.
3031      */
3032     out = nfp_prog_current_offset(nfp_prog);
3033     full_add = nfp_prog_current_offset(nfp_prog);
3034 
3035     if (meta->insn.off) {
3036         out += 2;
3037         full_add += 2;
3038     }
3039     if (meta->xadd_maybe_16bit) {
3040         out += 3;
3041         full_add += 3;
3042     }
3043     if (meta->xadd_over_16bit)
3044         out += 2 + is64;
3045     if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
3046         out += 5;
3047         full_add += 5;
3048     }
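
    /* For example (illustrative): with insn.off set and both the 16-bit
     * and the over-16-bit cases possible, full_add == start + 10 (branch
     * block: immed + 2 alu + br; address: 2 alu; add_imm block: immed +
     * or_subpart + cmd_indir + br) and out == full_add + 2 + is64.
     */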
3049 
3050     /* Generate the branch for choosing add_imm vs add */
3051     if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
3052         swreg max_imm = imm_a(nfp_prog);
3053 
3054         wrp_immed(nfp_prog, max_imm, 0xffff);
3055         emit_alu(nfp_prog, reg_none(),
3056              max_imm, ALU_OP_SUB, reg_b(src_gpr));
3057         emit_alu(nfp_prog, reg_none(),
3058              reg_imm(0), ALU_OP_SUB_C, reg_b(src_gpr + 1));
3059         emit_br(nfp_prog, BR_BLO, full_add, meta->insn.off ? 2 : 0);
3060         /* defer for add */
3061     }
3062 
3063     /* If the insn has an offset, add it to the address */
3064     if (!meta->insn.off) {
3065         addra = reg_a(dst_gpr);
3066         addrb = reg_b(dst_gpr + 1);
3067     } else {
3068         emit_alu(nfp_prog, imma_a(nfp_prog),
3069              reg_a(dst_gpr), ALU_OP_ADD, off);
3070         emit_alu(nfp_prog, imma_b(nfp_prog),
3071              reg_a(dst_gpr + 1), ALU_OP_ADD_C, reg_imm(0));
3072         addra = imma_a(nfp_prog);
3073         addrb = imma_b(nfp_prog);
3074     }
3075 
3076     /* Generate the add_imm if 16 bits are possible */
3077     if (meta->xadd_maybe_16bit) {
3078         swreg prev_alu = imm_a(nfp_prog);
3079 
3080         wrp_immed(nfp_prog, prev_alu,
3081               FIELD_PREP(CMD_OVE_DATA, 2) |
3082               CMD_OVE_LEN |
3083               FIELD_PREP(CMD_OV_LEN, 0x8 | is64 << 2));
3084         wrp_reg_or_subpart(nfp_prog, prev_alu, reg_b(src_gpr), 2, 2);
3085         emit_cmd_indir(nfp_prog, CMD_TGT_ADD_IMM, CMD_MODE_40b_BA, 0,
3086                    addra, addrb, 0, CMD_CTX_NO_SWAP);
3087 
3088         if (meta->xadd_over_16bit)
3089             emit_br(nfp_prog, BR_UNC, out, 0);
3090     }
3091 
3092     if (!nfp_prog_confirm_current_offset(nfp_prog, full_add))
3093         return -EINVAL;
3094 
3095     /* Generate the add if 16 bits are not guaranteed */
3096     if (meta->xadd_over_16bit) {
3097         emit_cmd(nfp_prog, CMD_TGT_ADD, CMD_MODE_40b_BA, 0,
3098              addra, addrb, is64 << 2,
3099              is64 ? CMD_CTX_SWAP_DEFER2 : CMD_CTX_SWAP_DEFER1);
3100 
3101         wrp_mov(nfp_prog, reg_xfer(0), reg_a(src_gpr));
3102         if (is64)
3103             wrp_mov(nfp_prog, reg_xfer(1), reg_a(src_gpr + 1));
3104     }
3105 
3106     if (!nfp_prog_confirm_current_offset(nfp_prog, out))
3107         return -EINVAL;
3108 
3109     return 0;
3110 }
3111 
3112 static int mem_atomic4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3113 {
3114     if (meta->insn.imm != BPF_ADD)
3115         return -EOPNOTSUPP;
3116 
3117     return mem_xadd(nfp_prog, meta, false);
3118 }
3119 
3120 static int mem_atomic8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3121 {
3122     if (meta->insn.imm != BPF_ADD)
3123         return -EOPNOTSUPP;
3124 
3125     return mem_xadd(nfp_prog, meta, true);
3126 }
3127 
3128 static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3129 {
3130     emit_br(nfp_prog, BR_UNC, meta->insn.off, 0);
3131 
3132     return 0;
3133 }
3134 
3135 static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3136 {
3137     const struct bpf_insn *insn = &meta->insn;
3138     u64 imm = insn->imm; /* sign extend */
3139     swreg or1, or2, tmp_reg;
3140 
3141     or1 = reg_a(insn->dst_reg * 2);
3142     or2 = reg_b(insn->dst_reg * 2 + 1);
3143 
3144     if (imm & ~0U) {
3145         tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3146         emit_alu(nfp_prog, imm_a(nfp_prog),
3147              reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
3148         or1 = imm_a(nfp_prog);
3149     }
3150 
3151     if (imm >> 32) {
3152         tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
3153         emit_alu(nfp_prog, imm_b(nfp_prog),
3154              reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
3155         or2 = imm_b(nfp_prog);
3156     }
3157 
3158     emit_alu(nfp_prog, reg_none(), or1, ALU_OP_OR, or2);
3159     emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3160 
3161     return 0;
3162 }
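
/* E.g. for imm == 0x100000000 only the high half needs the XOR: or1 stays
 * the raw low word (which must be 0 to match) and or2 becomes dst_hi ^ 0x1;
 * the final OR is 0, and BR_BEQ taken, only when both halves match.
 */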
3163 
3164 static int jeq32_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3165 {
3166     const struct bpf_insn *insn = &meta->insn;
3167     swreg tmp_reg;
3168 
3169     tmp_reg = ur_load_imm_any(nfp_prog, insn->imm, imm_b(nfp_prog));
3170     emit_alu(nfp_prog, reg_none(),
3171          reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
3172     emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3173 
3174     return 0;
3175 }
3176 
3177 static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3178 {
3179     const struct bpf_insn *insn = &meta->insn;
3180     u64 imm = insn->imm; /* sign extend */
3181     u8 dst_gpr = insn->dst_reg * 2;
3182     swreg tmp_reg;
3183 
3184     tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3185     emit_alu(nfp_prog, imm_b(nfp_prog),
3186          reg_a(dst_gpr), ALU_OP_AND, tmp_reg);
3187     /* Upper word of the mask can only be 0 or ~0 from sign extension,
3188      * so either ignore it or OR the whole thing in.
3189      */
3190     if (is_mbpf_jmp64(meta) && imm >> 32) {
3191         emit_alu(nfp_prog, reg_none(),
3192              reg_a(dst_gpr + 1), ALU_OP_OR, imm_b(nfp_prog));
3193     }
3194     emit_br(nfp_prog, BR_BNE, insn->off, 0);
3195 
3196     return 0;
3197 }
3198 
3199 static int jne_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3200 {
3201     const struct bpf_insn *insn = &meta->insn;
3202     u64 imm = insn->imm; /* sign extend */
3203     bool is_jmp32 = is_mbpf_jmp32(meta);
3204     swreg tmp_reg;
3205 
3206     if (!imm) {
3207         if (is_jmp32)
3208             emit_alu(nfp_prog, reg_none(), reg_none(), ALU_OP_NONE,
3209                  reg_b(insn->dst_reg * 2));
3210         else
3211             emit_alu(nfp_prog, reg_none(), reg_a(insn->dst_reg * 2),
3212                  ALU_OP_OR, reg_b(insn->dst_reg * 2 + 1));
3213         emit_br(nfp_prog, BR_BNE, insn->off, 0);
3214         return 0;
3215     }
3216 
3217     tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3218     emit_alu(nfp_prog, reg_none(),
3219          reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
3220     emit_br(nfp_prog, BR_BNE, insn->off, 0);
3221 
3222     if (is_jmp32)
3223         return 0;
3224 
3225     tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
3226     emit_alu(nfp_prog, reg_none(),
3227          reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
3228     emit_br(nfp_prog, BR_BNE, insn->off, 0);
3229 
3230     return 0;
3231 }
3232 
3233 static int jeq_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3234 {
3235     const struct bpf_insn *insn = &meta->insn;
3236 
3237     emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(insn->dst_reg * 2),
3238          ALU_OP_XOR, reg_b(insn->src_reg * 2));
3239     if (is_mbpf_jmp64(meta)) {
3240         emit_alu(nfp_prog, imm_b(nfp_prog),
3241              reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR,
3242              reg_b(insn->src_reg * 2 + 1));
3243         emit_alu(nfp_prog, reg_none(), imm_a(nfp_prog), ALU_OP_OR,
3244              imm_b(nfp_prog));
3245     }
3246     emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3247 
3248     return 0;
3249 }
3250 
3251 static int jset_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3252 {
3253     return wrp_test_reg(nfp_prog, meta, ALU_OP_AND, BR_BNE);
3254 }
3255 
3256 static int jne_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3257 {
3258     return wrp_test_reg(nfp_prog, meta, ALU_OP_XOR, BR_BNE);
3259 }
3260 
3261 static int
3262 bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3263 {
3264     u32 ret_tgt, stack_depth, offset_br;
3265     swreg tmp_reg;
3266 
3267     stack_depth = round_up(nfp_prog->stack_frame_depth, STACK_FRAME_ALIGN);
3268     /* Space for saving the return address is accounted for by the callee,
3269      * so stack_depth can be zero for the main function.
3270      */
3271     if (stack_depth) {
3272         tmp_reg = ur_load_imm_any(nfp_prog, stack_depth,
3273                       stack_imm(nfp_prog));
3274         emit_alu(nfp_prog, stack_reg(nfp_prog),
3275              stack_reg(nfp_prog), ALU_OP_ADD, tmp_reg);
3276         emit_csr_wr(nfp_prog, stack_reg(nfp_prog),
3277                 NFP_CSR_ACT_LM_ADDR0);
3278     }
3279 
3280     /* Two cases for jumping to the callee:
3281      *
3282      * - If callee uses and needs to save R6~R9 then:
3283      *     1. Put the start offset of the callee into imm_b(). This will
3284      *        require a fixup step, as we do not necessarily know this
3285      *        address yet.
3286      *     2. Put the return address from the callee to the caller into
3287      *        register ret_reg().
3288      *     3. (After defer slots are consumed) Jump to the subroutine that
3289      *        pushes the registers to the stack.
3290      *   The subroutine acts as a trampoline, and returns to the address in
3291      *   imm_b(), i.e. jumps to the callee.
3292      *
3293      * - If callee does not need to save R6~R9 then just load return
3294      *   address to the caller in ret_reg(), and jump to the callee
3295      *   directly.
3296      *
3297      * Using ret_reg() to pass the return address to the callee is set here
3298      * as a convention. The callee can then push this address onto its
3299      * stack frame in its prologue. The advantages of passing the return
3300      * address through ret_reg(), instead of pushing it to the stack right
3301      * here, are the following:
3302      * - It looks cleaner.
3303     * - If the called function is called multiple times, we get a lower
3304     *   program size.
3305     * - We save the two no-op instructions that would otherwise have to be
3306     *   added just before the emit_br() when stack depth is not zero.
3307     * - If we ever find a register to hold the return address during the
3308     *   whole execution of the callee, we will not have to push the return
3309      *   address to the stack for leaf functions.
3310      */
3311     if (!meta->jmp_dst) {
3312         pr_err("BUG: BPF-to-BPF call has no destination recorded\n");
3313         return -ELOOP;
3314     }
3315     if (nfp_prog->subprog[meta->jmp_dst->subprog_idx].needs_reg_push) {
3316         ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
3317         emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
3318                  RELO_BR_GO_CALL_PUSH_REGS);
3319         offset_br = nfp_prog_current_offset(nfp_prog);
3320         wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
3321     } else {
3322         ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
3323         emit_br(nfp_prog, BR_UNC, meta->insn.imm, 1);
3324         offset_br = nfp_prog_current_offset(nfp_prog);
3325     }
3326     wrp_immed_relo(nfp_prog, ret_reg(nfp_prog), ret_tgt, RELO_IMMED_REL);
3327 
3328     if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
3329         return -EINVAL;
3330 
3331     if (stack_depth) {
3332         tmp_reg = ur_load_imm_any(nfp_prog, stack_depth,
3333                       stack_imm(nfp_prog));
3334         emit_alu(nfp_prog, stack_reg(nfp_prog),
3335              stack_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
3336         emit_csr_wr(nfp_prog, stack_reg(nfp_prog),
3337                 NFP_CSR_ACT_LM_ADDR0);
3338         wrp_nops(nfp_prog, 3);
3339     }
3340 
3341     meta->num_insns_after_br = nfp_prog_current_offset(nfp_prog);
3342     meta->num_insns_after_br -= offset_br;
3343 
3344     return 0;
3345 }
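/* Editor's sketch of the sequence emitted above when the callee needs its
 * registers pushed (offsets are illustrative only):
 *
 *   N      br push_regs_subroutine, defer[2]  ; RELO_BR_GO_CALL_PUSH_REGS
 *   N + 1  immed imm_b, 0                     ; patched to the callee start
 *                                             ; by nfp_fixup_immed_relo()
 *   N + 2  immed ret_reg, N + 3               ; return address for the callee
 *   N + 3  ...                                ; execution resumes here
 *
 * The two defer slots let the immed instructions at N + 1 and N + 2
 * execute before the branch takes effect.
 */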
3346 
3347 static int helper_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3348 {
3349     switch (meta->insn.imm) {
3350     case BPF_FUNC_xdp_adjust_head:
3351         return adjust_head(nfp_prog, meta);
3352     case BPF_FUNC_xdp_adjust_tail:
3353         return adjust_tail(nfp_prog, meta);
3354     case BPF_FUNC_map_lookup_elem:
3355     case BPF_FUNC_map_update_elem:
3356     case BPF_FUNC_map_delete_elem:
3357         return map_call_stack_common(nfp_prog, meta);
3358     case BPF_FUNC_get_prandom_u32:
3359         return nfp_get_prandom_u32(nfp_prog, meta);
3360     case BPF_FUNC_perf_event_output:
3361         return nfp_perf_event_output(nfp_prog, meta);
3362     default:
3363         WARN_ONCE(1, "verifier allowed unsupported function\n");
3364         return -EOPNOTSUPP;
3365     }
3366 }
3367 
3368 static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3369 {
3370     if (is_mbpf_pseudo_call(meta))
3371         return bpf_to_bpf_call(nfp_prog, meta);
3372     else
3373         return helper_call(nfp_prog, meta);
3374 }
3375 
3376 static bool nfp_is_main_function(struct nfp_insn_meta *meta)
3377 {
3378     return meta->subprog_idx == 0;
3379 }
3380 
3381 static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3382 {
3383     emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 0, RELO_BR_GO_OUT);
3384 
3385     return 0;
3386 }
3387 
3388 static int
3389 nfp_subprog_epilogue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3390 {
3391     if (nfp_prog->subprog[meta->subprog_idx].needs_reg_push) {
3392         /* Pop R6~R9 from the stack via the related subroutine.
3393          * We loaded the return address to the caller into ret_reg().
3394          * This means that the subroutine does not come back here; we
3395          * make it jump back to the subprogram caller directly!
3396          */
3397         emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 1,
3398                  RELO_BR_GO_CALL_POP_REGS);
3399         /* Pop return address from the stack. */
3400         wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
3401     } else {
3402         /* Pop return address from the stack. */
3403         wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
3404         /* Jump back to caller if no callee-saved registers were used
3405          * by the subprogram.
3406          */
3407         emit_rtn(nfp_prog, ret_reg(nfp_prog), 0);
3408     }
3409 
3410     return 0;
3411 }
3412 
3413 static int jmp_exit(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3414 {
3415     if (nfp_is_main_function(meta))
3416         return goto_out(nfp_prog, meta);
3417     else
3418         return nfp_subprog_epilogue(nfp_prog, meta);
3419 }
3420 
3421 static const instr_cb_t instr_cb[256] = {
3422     [BPF_ALU64 | BPF_MOV | BPF_X] = mov_reg64,
3423     [BPF_ALU64 | BPF_MOV | BPF_K] = mov_imm64,
3424     [BPF_ALU64 | BPF_XOR | BPF_X] = xor_reg64,
3425     [BPF_ALU64 | BPF_XOR | BPF_K] = xor_imm64,
3426     [BPF_ALU64 | BPF_AND | BPF_X] = and_reg64,
3427     [BPF_ALU64 | BPF_AND | BPF_K] = and_imm64,
3428     [BPF_ALU64 | BPF_OR | BPF_X] =  or_reg64,
3429     [BPF_ALU64 | BPF_OR | BPF_K] =  or_imm64,
3430     [BPF_ALU64 | BPF_ADD | BPF_X] = add_reg64,
3431     [BPF_ALU64 | BPF_ADD | BPF_K] = add_imm64,
3432     [BPF_ALU64 | BPF_SUB | BPF_X] = sub_reg64,
3433     [BPF_ALU64 | BPF_SUB | BPF_K] = sub_imm64,
3434     [BPF_ALU64 | BPF_MUL | BPF_X] = mul_reg64,
3435     [BPF_ALU64 | BPF_MUL | BPF_K] = mul_imm64,
3436     [BPF_ALU64 | BPF_DIV | BPF_X] = div_reg64,
3437     [BPF_ALU64 | BPF_DIV | BPF_K] = div_imm64,
3438     [BPF_ALU64 | BPF_NEG] =     neg_reg64,
3439     [BPF_ALU64 | BPF_LSH | BPF_X] = shl_reg64,
3440     [BPF_ALU64 | BPF_LSH | BPF_K] = shl_imm64,
3441     [BPF_ALU64 | BPF_RSH | BPF_X] = shr_reg64,
3442     [BPF_ALU64 | BPF_RSH | BPF_K] = shr_imm64,
3443     [BPF_ALU64 | BPF_ARSH | BPF_X] = ashr_reg64,
3444     [BPF_ALU64 | BPF_ARSH | BPF_K] = ashr_imm64,
3445     [BPF_ALU | BPF_MOV | BPF_X] =   mov_reg,
3446     [BPF_ALU | BPF_MOV | BPF_K] =   mov_imm,
3447     [BPF_ALU | BPF_XOR | BPF_X] =   xor_reg,
3448     [BPF_ALU | BPF_XOR | BPF_K] =   xor_imm,
3449     [BPF_ALU | BPF_AND | BPF_X] =   and_reg,
3450     [BPF_ALU | BPF_AND | BPF_K] =   and_imm,
3451     [BPF_ALU | BPF_OR | BPF_X] =    or_reg,
3452     [BPF_ALU | BPF_OR | BPF_K] =    or_imm,
3453     [BPF_ALU | BPF_ADD | BPF_X] =   add_reg,
3454     [BPF_ALU | BPF_ADD | BPF_K] =   add_imm,
3455     [BPF_ALU | BPF_SUB | BPF_X] =   sub_reg,
3456     [BPF_ALU | BPF_SUB | BPF_K] =   sub_imm,
3457     [BPF_ALU | BPF_MUL | BPF_X] =   mul_reg,
3458     [BPF_ALU | BPF_MUL | BPF_K] =   mul_imm,
3459     [BPF_ALU | BPF_DIV | BPF_X] =   div_reg,
3460     [BPF_ALU | BPF_DIV | BPF_K] =   div_imm,
3461     [BPF_ALU | BPF_NEG] =       neg_reg,
3462     [BPF_ALU | BPF_LSH | BPF_X] =   shl_reg,
3463     [BPF_ALU | BPF_LSH | BPF_K] =   shl_imm,
3464     [BPF_ALU | BPF_RSH | BPF_X] =   shr_reg,
3465     [BPF_ALU | BPF_RSH | BPF_K] =   shr_imm,
3466     [BPF_ALU | BPF_ARSH | BPF_X] =  ashr_reg,
3467     [BPF_ALU | BPF_ARSH | BPF_K] =  ashr_imm,
3468     [BPF_ALU | BPF_END | BPF_X] =   end_reg32,
3469     [BPF_LD | BPF_IMM | BPF_DW] =   imm_ld8,
3470     [BPF_LD | BPF_ABS | BPF_B] =    data_ld1,
3471     [BPF_LD | BPF_ABS | BPF_H] =    data_ld2,
3472     [BPF_LD | BPF_ABS | BPF_W] =    data_ld4,
3473     [BPF_LD | BPF_IND | BPF_B] =    data_ind_ld1,
3474     [BPF_LD | BPF_IND | BPF_H] =    data_ind_ld2,
3475     [BPF_LD | BPF_IND | BPF_W] =    data_ind_ld4,
3476     [BPF_LDX | BPF_MEM | BPF_B] =   mem_ldx1,
3477     [BPF_LDX | BPF_MEM | BPF_H] =   mem_ldx2,
3478     [BPF_LDX | BPF_MEM | BPF_W] =   mem_ldx4,
3479     [BPF_LDX | BPF_MEM | BPF_DW] =  mem_ldx8,
3480     [BPF_STX | BPF_MEM | BPF_B] =   mem_stx1,
3481     [BPF_STX | BPF_MEM | BPF_H] =   mem_stx2,
3482     [BPF_STX | BPF_MEM | BPF_W] =   mem_stx4,
3483     [BPF_STX | BPF_MEM | BPF_DW] =  mem_stx8,
3484     [BPF_STX | BPF_ATOMIC | BPF_W] =    mem_atomic4,
3485     [BPF_STX | BPF_ATOMIC | BPF_DW] =   mem_atomic8,
3486     [BPF_ST | BPF_MEM | BPF_B] =    mem_st1,
3487     [BPF_ST | BPF_MEM | BPF_H] =    mem_st2,
3488     [BPF_ST | BPF_MEM | BPF_W] =    mem_st4,
3489     [BPF_ST | BPF_MEM | BPF_DW] =   mem_st8,
3490     [BPF_JMP | BPF_JA | BPF_K] =    jump,
3491     [BPF_JMP | BPF_JEQ | BPF_K] =   jeq_imm,
3492     [BPF_JMP | BPF_JGT | BPF_K] =   cmp_imm,
3493     [BPF_JMP | BPF_JGE | BPF_K] =   cmp_imm,
3494     [BPF_JMP | BPF_JLT | BPF_K] =   cmp_imm,
3495     [BPF_JMP | BPF_JLE | BPF_K] =   cmp_imm,
3496     [BPF_JMP | BPF_JSGT | BPF_K] =  cmp_imm,
3497     [BPF_JMP | BPF_JSGE | BPF_K] =  cmp_imm,
3498     [BPF_JMP | BPF_JSLT | BPF_K] =  cmp_imm,
3499     [BPF_JMP | BPF_JSLE | BPF_K] =  cmp_imm,
3500     [BPF_JMP | BPF_JSET | BPF_K] =  jset_imm,
3501     [BPF_JMP | BPF_JNE | BPF_K] =   jne_imm,
3502     [BPF_JMP | BPF_JEQ | BPF_X] =   jeq_reg,
3503     [BPF_JMP | BPF_JGT | BPF_X] =   cmp_reg,
3504     [BPF_JMP | BPF_JGE | BPF_X] =   cmp_reg,
3505     [BPF_JMP | BPF_JLT | BPF_X] =   cmp_reg,
3506     [BPF_JMP | BPF_JLE | BPF_X] =   cmp_reg,
3507     [BPF_JMP | BPF_JSGT | BPF_X] =  cmp_reg,
3508     [BPF_JMP | BPF_JSGE | BPF_X] =  cmp_reg,
3509     [BPF_JMP | BPF_JSLT | BPF_X] =  cmp_reg,
3510     [BPF_JMP | BPF_JSLE | BPF_X] =  cmp_reg,
3511     [BPF_JMP | BPF_JSET | BPF_X] =  jset_reg,
3512     [BPF_JMP | BPF_JNE | BPF_X] =   jne_reg,
3513     [BPF_JMP32 | BPF_JEQ | BPF_K] = jeq32_imm,
3514     [BPF_JMP32 | BPF_JGT | BPF_K] = cmp_imm,
3515     [BPF_JMP32 | BPF_JGE | BPF_K] = cmp_imm,
3516     [BPF_JMP32 | BPF_JLT | BPF_K] = cmp_imm,
3517     [BPF_JMP32 | BPF_JLE | BPF_K] = cmp_imm,
3518     [BPF_JMP32 | BPF_JSGT | BPF_K] = cmp_imm,
3519     [BPF_JMP32 | BPF_JSGE | BPF_K] = cmp_imm,
3520     [BPF_JMP32 | BPF_JSLT | BPF_K] = cmp_imm,
3521     [BPF_JMP32 | BPF_JSLE | BPF_K] = cmp_imm,
3522     [BPF_JMP32 | BPF_JSET | BPF_K] = jset_imm,
3523     [BPF_JMP32 | BPF_JNE | BPF_K] = jne_imm,
3524     [BPF_JMP32 | BPF_JEQ | BPF_X] = jeq_reg,
3525     [BPF_JMP32 | BPF_JGT | BPF_X] = cmp_reg,
3526     [BPF_JMP32 | BPF_JGE | BPF_X] = cmp_reg,
3527     [BPF_JMP32 | BPF_JLT | BPF_X] = cmp_reg,
3528     [BPF_JMP32 | BPF_JLE | BPF_X] = cmp_reg,
3529     [BPF_JMP32 | BPF_JSGT | BPF_X] = cmp_reg,
3530     [BPF_JMP32 | BPF_JSGE | BPF_X] = cmp_reg,
3531     [BPF_JMP32 | BPF_JSLT | BPF_X] = cmp_reg,
3532     [BPF_JMP32 | BPF_JSLE | BPF_X] = cmp_reg,
3533     [BPF_JMP32 | BPF_JSET | BPF_X] = jset_reg,
3534     [BPF_JMP32 | BPF_JNE | BPF_X] = jne_reg,
3535     [BPF_JMP | BPF_CALL] =      call,
3536     [BPF_JMP | BPF_EXIT] =      jmp_exit,
3537 };
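/* Illustrative example (editor's note): the table is indexed directly by
 * the 8-bit BPF opcode.  For instance, BPF_ALU64 | BPF_ADD | BPF_X is
 * 0x07 | 0x00 | 0x08 == 0x0f, so a 64-bit register add dispatches to
 * instr_cb[0x0f] == add_reg64.  nfp_bpf_supported_opcode() below relies
 * on unsupported opcodes being left as NULL entries.
 */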
3538 
3539 /* --- Assembler logic --- */
3540 static int
3541 nfp_fixup_immed_relo(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
3542              struct nfp_insn_meta *jmp_dst, u32 br_idx)
3543 {
3544     if (immed_get_value(nfp_prog->prog[br_idx + 1])) {
3545         pr_err("BUG: failed to fix up callee register saving\n");
3546         return -EINVAL;
3547     }
3548 
3549     immed_set_value(&nfp_prog->prog[br_idx + 1], jmp_dst->off);
3550 
3551     return 0;
3552 }
3553 
3554 static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
3555 {
3556     struct nfp_insn_meta *meta, *jmp_dst;
3557     u32 idx, br_idx;
3558     int err;
3559 
3560     list_for_each_entry(meta, &nfp_prog->insns, l) {
3561         if (meta->flags & FLAG_INSN_SKIP_MASK)
3562             continue;
3563         if (!is_mbpf_jmp(meta))
3564             continue;
3565         if (meta->insn.code == (BPF_JMP | BPF_EXIT) &&
3566             !nfp_is_main_function(meta))
3567             continue;
3568         if (is_mbpf_helper_call(meta))
3569             continue;
3570 
3571         if (list_is_last(&meta->l, &nfp_prog->insns))
3572             br_idx = nfp_prog->last_bpf_off;
3573         else
3574             br_idx = list_next_entry(meta, l)->off - 1;
3575 
3576         /* For a BPF-to-BPF function call, a stack adjustment sequence is
3577          * generated after the return instruction. Therefore, we must
3578          * subtract the length of this sequence to have br_idx pointing
3579          * to where the "branch" NFP instruction is expected to be.
3580          */
3581         if (is_mbpf_pseudo_call(meta))
3582             br_idx -= meta->num_insns_after_br;
3583 
3584         if (!nfp_is_br(nfp_prog->prog[br_idx])) {
3585             pr_err("Fixup found block not ending in branch %d %02x %016llx!!\n",
3586                    br_idx, meta->insn.code, nfp_prog->prog[br_idx]);
3587             return -ELOOP;
3588         }
3589 
3590         if (meta->insn.code == (BPF_JMP | BPF_EXIT))
3591             continue;
3592 
3593         /* Leave special branches for later */
3594         if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
3595             RELO_BR_REL && !is_mbpf_pseudo_call(meta))
3596             continue;
3597 
3598         if (!meta->jmp_dst) {
3599             pr_err("Non-exit jump doesn't have destination info recorded!!\n");
3600             return -ELOOP;
3601         }
3602 
3603         jmp_dst = meta->jmp_dst;
3604 
3605         if (jmp_dst->flags & FLAG_INSN_SKIP_PREC_DEPENDENT) {
3606             pr_err("Branch landing on removed instruction!!\n");
3607             return -ELOOP;
3608         }
3609 
3610         if (is_mbpf_pseudo_call(meta) &&
3611             nfp_prog->subprog[jmp_dst->subprog_idx].needs_reg_push) {
3612             err = nfp_fixup_immed_relo(nfp_prog, meta,
3613                            jmp_dst, br_idx);
3614             if (err)
3615                 return err;
3616         }
3617 
3618         if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
3619             RELO_BR_REL)
3620             continue;
3621 
3622         for (idx = meta->off; idx <= br_idx; idx++) {
3623             if (!nfp_is_br(nfp_prog->prog[idx]))
3624                 continue;
3625             br_set_offset(&nfp_prog->prog[idx], jmp_dst->off);
3626         }
3627     }
3628 
3629     return 0;
3630 }
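/* Editor's sketch of the br_idx computation above (all offsets are
 * hypothetical): if a pseudo call's code occupies NFP offsets 10..17 and
 * the next BPF instruction starts at offset 18, br_idx starts at 17.
 * With num_insns_after_br == 5 (the instructions emitted after the branch
 * by bpf_to_bpf_call()), br_idx becomes 12, where the call's branch
 * instruction actually sits.
 */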
3631 
3632 static void nfp_intro(struct nfp_prog *nfp_prog)
3633 {
3634     wrp_immed(nfp_prog, plen_reg(nfp_prog), GENMASK(13, 0));
3635     emit_alu(nfp_prog, plen_reg(nfp_prog),
3636          plen_reg(nfp_prog), ALU_OP_AND, pv_len(nfp_prog));
3637 }
3638 
3639 static void
3640 nfp_subprog_prologue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3641 {
3642     /* Save return address into the stack. */
3643     wrp_mov(nfp_prog, reg_lm(0, 0), ret_reg(nfp_prog));
3644 }
3645 
3646 static void
3647 nfp_start_subprog(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3648 {
3649     unsigned int depth = nfp_prog->subprog[meta->subprog_idx].stack_depth;
3650 
3651     nfp_prog->stack_frame_depth = round_up(depth, 4);
3652     nfp_subprog_prologue(nfp_prog, meta);
3653 }
3654 
3655 bool nfp_is_subprog_start(struct nfp_insn_meta *meta)
3656 {
3657     return meta->flags & FLAG_INSN_IS_SUBPROG_START;
3658 }
3659 
3660 static void nfp_outro_tc_da(struct nfp_prog *nfp_prog)
3661 {
3662     /* TC direct-action mode:
3663      *   0,1   ok        NOT SUPPORTED[1]
3664      *   2   drop  0x22 -> drop,  count as stat1
3665      *   4,5 nuke  0x02 -> drop
3666      *   7  redir  0x44 -> redir, count as stat2
3667      *   * unspec  0x11 -> pass,  count as stat0
3668      *
3669      * [1] We can't support OK and RECLASSIFY because we can't tell TC
3670      *     the exact decision made.  We are forced to support UNSPEC
3671      *     to handle aborts so that's the only one we handle for passing
3672      *     packets up the stack.
3673      */
3674     /* Target for aborts */
3675     nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);
3676 
3677     emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3678 
3679     wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3680     emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x11), SHF_SC_L_SHF, 16);
3681 
3682     /* Target for normal exits */
3683     nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);
3684 
3685     /* if R0 > 7 jump to abort */
3686     emit_alu(nfp_prog, reg_none(), reg_imm(7), ALU_OP_SUB, reg_b(0));
3687     emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
3688     wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3689 
3690     wrp_immed(nfp_prog, reg_b(2), 0x41221211);
3691     wrp_immed(nfp_prog, reg_b(3), 0x41001211);
3692 
3693     emit_shf(nfp_prog, reg_a(1),
3694          reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 2);
3695 
3696     emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
3697     emit_shf(nfp_prog, reg_a(2),
3698          reg_imm(0xf), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);
3699 
3700     emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
3701     emit_shf(nfp_prog, reg_b(2),
3702          reg_imm(0xf), SHF_OP_AND, reg_b(3), SHF_SC_R_SHF, 0);
3703 
3704     emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3705 
3706     emit_shf(nfp_prog, reg_b(2),
3707          reg_a(2), SHF_OP_OR, reg_b(2), SHF_SC_L_SHF, 4);
3708     emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
3709 }
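/* Worked example (editor's note): the two immediates act as nibble lookup
 * tables indexed by R0, with the indirect shift amount R0 * 4 staged
 * through reg_a(1).  For R0 == 2 (drop), nibble 2 of 0x41221211 and
 * nibble 2 of 0x41001211 are both 0x2, combining into the 0x22 result
 * code counted as stat1, matching the mapping comment above.
 */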
3710 
3711 static void nfp_outro_xdp(struct nfp_prog *nfp_prog)
3712 {
3713     /* XDP return codes:
3714      *   0 aborted  0x82 -> drop,  count as stat3
3715      *   1    drop  0x22 -> drop,  count as stat1
3716      *   2    pass  0x11 -> pass,  count as stat0
3717      *   3      tx  0x44 -> redir, count as stat2
3718      *   * unknown  0x82 -> drop,  count as stat3
3719      */
3720     /* Target for aborts */
3721     nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);
3722 
3723     emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3724 
3725     wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3726     emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x82), SHF_SC_L_SHF, 16);
3727 
3728     /* Target for normal exits */
3729     nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);
3730 
3731     /* if R0 > 3 jump to abort */
3732     emit_alu(nfp_prog, reg_none(), reg_imm(3), ALU_OP_SUB, reg_b(0));
3733     emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
3734 
3735     wrp_immed(nfp_prog, reg_b(2), 0x44112282);
3736 
3737     emit_shf(nfp_prog, reg_a(1),
3738          reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 3);
3739 
3740     emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
3741     emit_shf(nfp_prog, reg_b(2),
3742          reg_imm(0xff), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);
3743 
3744     emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);
3745 
3746     wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
3747     emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
3748 }
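/* Worked example (editor's note): 0x44112282 is a byte lookup table
 * indexed by R0, with the indirect shift amount R0 * 8.  Byte 0 is 0x82
 * (aborted), byte 1 is 0x22 (drop), byte 2 is 0x11 (pass) and byte 3 is
 * 0x44 (tx), matching the XDP return code mapping above; any R0 > 3
 * branches to the abort target first.
 */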
3749 
3750 static bool nfp_prog_needs_callee_reg_save(struct nfp_prog *nfp_prog)
3751 {
3752     unsigned int idx;
3753 
3754     for (idx = 1; idx < nfp_prog->subprog_cnt; idx++)
3755         if (nfp_prog->subprog[idx].needs_reg_push)
3756             return true;
3757 
3758     return false;
3759 }
3760 
3761 static void nfp_push_callee_registers(struct nfp_prog *nfp_prog)
3762 {
3763     u8 reg;
3764 
3765     /* Subroutine: Save all callee saved registers (R6 ~ R9).
3766      * imm_b() holds the return address.
3767      */
3768     nfp_prog->tgt_call_push_regs = nfp_prog_current_offset(nfp_prog);
3769     for (reg = BPF_REG_6; reg <= BPF_REG_9; reg++) {
3770         u8 adj = (reg - BPF_REG_0) * 2;
3771         u8 idx = (reg - BPF_REG_6) * 2;
3772 
3773         /* The first slot in the stack frame is used to push the return
3774          * address in bpf_to_bpf_call(), start just after.
3775          */
3776         wrp_mov(nfp_prog, reg_lm(0, 1 + idx), reg_b(adj));
3777 
3778         if (reg == BPF_REG_8)
3779             /* Prepare to jump back, last 3 insns use defer slots */
3780             emit_rtn(nfp_prog, imm_b(nfp_prog), 3);
3781 
3782         wrp_mov(nfp_prog, reg_lm(0, 1 + idx + 1), reg_b(adj + 1));
3783     }
3784 }
3785 
3786 static void nfp_pop_callee_registers(struct nfp_prog *nfp_prog)
3787 {
3788     u8 reg;
3789 
3790     /* Subroutine: Restore all callee saved registers (R6 ~ R9).
3791      * ret_reg() holds the return address.
3792      */
3793     nfp_prog->tgt_call_pop_regs = nfp_prog_current_offset(nfp_prog);
3794     for (reg = BPF_REG_6; reg <= BPF_REG_9; reg++) {
3795         u8 adj = (reg - BPF_REG_0) * 2;
3796         u8 idx = (reg - BPF_REG_6) * 2;
3797 
3798         /* The first slot in the stack frame holds the return address,
3799          * start popping just after that.
3800          */
3801         wrp_mov(nfp_prog, reg_both(adj), reg_lm(0, 1 + idx));
3802 
3803         if (reg == BPF_REG_8)
3804             /* Prepare to jump back, last 3 insns use defer slots */
3805             emit_rtn(nfp_prog, ret_reg(nfp_prog), 3);
3806 
3807         wrp_mov(nfp_prog, reg_both(adj + 1), reg_lm(0, 1 + idx + 1));
3808     }
3809 }
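/* Editor's sketch of the stack frame layout in local memory (LM index 0)
 * used by the push/pop subroutines above:
 *
 *   slot 0     return address (stored by nfp_subprog_prologue())
 *   slots 1-2  R6 (low word, high word)
 *   slots 3-4  R7
 *   slots 5-6  R8
 *   slots 7-8  R9
 *
 * emit_rtn() is issued while handling R8 rather than after R9 so that
 * the final three moves execute in the branch's defer slots.
 */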
3810 
3811 static void nfp_outro(struct nfp_prog *nfp_prog)
3812 {
3813     switch (nfp_prog->type) {
3814     case BPF_PROG_TYPE_SCHED_CLS:
3815         nfp_outro_tc_da(nfp_prog);
3816         break;
3817     case BPF_PROG_TYPE_XDP:
3818         nfp_outro_xdp(nfp_prog);
3819         break;
3820     default:
3821         WARN_ON(1);
3822     }
3823 
3824     if (!nfp_prog_needs_callee_reg_save(nfp_prog))
3825         return;
3826 
3827     nfp_push_callee_registers(nfp_prog);
3828     nfp_pop_callee_registers(nfp_prog);
3829 }
3830 
3831 static int nfp_translate(struct nfp_prog *nfp_prog)
3832 {
3833     struct nfp_insn_meta *meta;
3834     unsigned int depth;
3835     int err;
3836 
3837     depth = nfp_prog->subprog[0].stack_depth;
3838     nfp_prog->stack_frame_depth = round_up(depth, 4);
3839 
3840     nfp_intro(nfp_prog);
3841     if (nfp_prog->error)
3842         return nfp_prog->error;
3843 
3844     list_for_each_entry(meta, &nfp_prog->insns, l) {
3845         instr_cb_t cb = instr_cb[meta->insn.code];
3846 
3847         meta->off = nfp_prog_current_offset(nfp_prog);
3848 
3849         if (nfp_is_subprog_start(meta)) {
3850             nfp_start_subprog(nfp_prog, meta);
3851             if (nfp_prog->error)
3852                 return nfp_prog->error;
3853         }
3854 
3855         if (meta->flags & FLAG_INSN_SKIP_MASK) {
3856             nfp_prog->n_translated++;
3857             continue;
3858         }
3859 
3860         if (nfp_meta_has_prev(nfp_prog, meta) &&
3861             nfp_meta_prev(meta)->double_cb)
3862             cb = nfp_meta_prev(meta)->double_cb;
3863         if (!cb)
3864             return -ENOENT;
3865         err = cb(nfp_prog, meta);
3866         if (err)
3867             return err;
3868         if (nfp_prog->error)
3869             return nfp_prog->error;
3870 
3871         nfp_prog->n_translated++;
3872     }
3873 
3874     nfp_prog->last_bpf_off = nfp_prog_current_offset(nfp_prog) - 1;
3875 
3876     nfp_outro(nfp_prog);
3877     if (nfp_prog->error)
3878         return nfp_prog->error;
3879 
3880     wrp_nops(nfp_prog, NFP_USTORE_PREFETCH_WINDOW);
3881     if (nfp_prog->error)
3882         return nfp_prog->error;
3883 
3884     return nfp_fixup_branches(nfp_prog);
3885 }
3886 
3887 /* --- Optimizations --- */
3888 static void nfp_bpf_opt_reg_init(struct nfp_prog *nfp_prog)
3889 {
3890     struct nfp_insn_meta *meta;
3891 
3892     list_for_each_entry(meta, &nfp_prog->insns, l) {
3893         struct bpf_insn insn = meta->insn;
3894 
3895         /* Programs converted from cBPF start with register xoring */
3896         if (insn.code == (BPF_ALU64 | BPF_XOR | BPF_X) &&
3897             insn.src_reg == insn.dst_reg)
3898             continue;
3899 
3900         /* Programs start with R6 = R1 but we ignore the skb pointer */
3901         if (insn.code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
3902             insn.src_reg == 1 && insn.dst_reg == 6)
3903             meta->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
3904 
3905         /* Return as soon as something doesn't match */
3906         if (!(meta->flags & FLAG_INSN_SKIP_MASK))
3907             return;
3908     }
3909 }
3910 
3911 /* abs(insn.imm) will fit better into an unrestricted reg immediate -
3912  * convert an add/sub of a negative number into a sub/add of a positive one.
3913  */
3914 static void nfp_bpf_opt_neg_add_sub(struct nfp_prog *nfp_prog)
3915 {
3916     struct nfp_insn_meta *meta;
3917 
3918     list_for_each_entry(meta, &nfp_prog->insns, l) {
3919         struct bpf_insn insn = meta->insn;
3920 
3921         if (meta->flags & FLAG_INSN_SKIP_MASK)
3922             continue;
3923 
3924         if (!is_mbpf_alu(meta) && !is_mbpf_jmp(meta))
3925             continue;
3926         if (BPF_SRC(insn.code) != BPF_K)
3927             continue;
3928         if (insn.imm >= 0)
3929             continue;
3930 
3931         if (is_mbpf_jmp(meta)) {
3932             switch (BPF_OP(insn.code)) {
3933             case BPF_JGE:
3934             case BPF_JSGE:
3935             case BPF_JLT:
3936             case BPF_JSLT:
3937                 meta->jump_neg_op = true;
3938                 break;
3939             default:
3940                 continue;
3941             }
3942         } else {
3943             if (BPF_OP(insn.code) == BPF_ADD)
3944                 insn.code = BPF_CLASS(insn.code) | BPF_SUB;
3945             else if (BPF_OP(insn.code) == BPF_SUB)
3946                 insn.code = BPF_CLASS(insn.code) | BPF_ADD;
3947             else
3948                 continue;
3949 
3950             meta->insn.code = insn.code | BPF_K;
3951         }
3952 
3953         meta->insn.imm = -insn.imm;
3954     }
3955 }
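/* Illustrative example (editor's note): after this pass,
 *
 *   r1 += -16    ; BPF_ALU64 | BPF_ADD | BPF_K, imm = -16
 *
 * becomes
 *
 *   r1 -= 16     ; BPF_ALU64 | BPF_SUB | BPF_K, imm = 16
 *
 * since, as the comment above notes, abs(insn.imm) fits the unrestricted
 * register immediate better than the sign-extended negative value.
 */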
3956 
3957 /* Remove masking after load since our load guarantees this is not needed */
3958 static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog)
3959 {
3960     struct nfp_insn_meta *meta1, *meta2;
3961     static const s32 exp_mask[] = {
3962         [BPF_B] = 0x000000ffU,
3963         [BPF_H] = 0x0000ffffU,
3964         [BPF_W] = 0xffffffffU,
3965     };
3966 
3967     nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
3968         struct bpf_insn insn, next;
3969 
3970         insn = meta1->insn;
3971         next = meta2->insn;
3972 
3973         if (BPF_CLASS(insn.code) != BPF_LD)
3974             continue;
3975         if (BPF_MODE(insn.code) != BPF_ABS &&
3976             BPF_MODE(insn.code) != BPF_IND)
3977             continue;
3978 
3979         if (next.code != (BPF_ALU64 | BPF_AND | BPF_K))
3980             continue;
3981 
3982         if (!exp_mask[BPF_SIZE(insn.code)])
3983             continue;
3984         if (exp_mask[BPF_SIZE(insn.code)] != next.imm)
3985             continue;
3986 
3987         if (next.src_reg || next.dst_reg)
3988             continue;
3989 
3990         if (meta2->flags & FLAG_INSN_IS_JUMP_DST)
3991             continue;
3992 
3993         meta2->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
3994     }
3995 }
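/* Illustrative example (editor's note): for a sequence derived from
 * classic BPF such as
 *
 *   r0 = *(u8 *)skb[off]   ; BPF_LD | BPF_ABS | BPF_B
 *   r0 &= 0xff             ; BPF_ALU64 | BPF_AND | BPF_K
 *
 * the AND is marked FLAG_INSN_SKIP_PREC_DEPENDENT, because the 1-byte
 * load already zero-extends into the full register - unless the AND is
 * itself a jump destination.
 */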
3996 
3997 static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog)
3998 {
3999     struct nfp_insn_meta *meta1, *meta2, *meta3;
4000 
4001     nfp_for_each_insn_walk3(nfp_prog, meta1, meta2, meta3) {
4002         struct bpf_insn insn, next1, next2;
4003 
4004         insn = meta1->insn;
4005         next1 = meta2->insn;
4006         next2 = meta3->insn;
4007 
4008         if (BPF_CLASS(insn.code) != BPF_LD)
4009             continue;
4010         if (BPF_MODE(insn.code) != BPF_ABS &&
4011             BPF_MODE(insn.code) != BPF_IND)
4012             continue;
4013         if (BPF_SIZE(insn.code) != BPF_W)
4014             continue;
4015 
4016         if (!(next1.code == (BPF_LSH | BPF_K | BPF_ALU64) &&
4017               next2.code == (BPF_RSH | BPF_K | BPF_ALU64)) &&
4018             !(next1.code == (BPF_RSH | BPF_K | BPF_ALU64) &&
4019               next2.code == (BPF_LSH | BPF_K | BPF_ALU64)))
4020             continue;
4021 
4022         if (next1.src_reg || next1.dst_reg ||
4023             next2.src_reg || next2.dst_reg)
4024             continue;
4025 
4026         if (next1.imm != 0x20 || next2.imm != 0x20)
4027             continue;
4028 
4029         if (meta2->flags & FLAG_INSN_IS_JUMP_DST ||
4030             meta3->flags & FLAG_INSN_IS_JUMP_DST)
4031             continue;
4032 
4033         meta2->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
4034         meta3->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
4035     }
4036 }
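/* Illustrative example (editor's note): the pass above removes the
 * zero-extension idiom
 *
 *   r0 = *(u32 *)skb[off]  ; BPF_LD | BPF_ABS | BPF_W
 *   r0 <<= 32
 *   r0 >>= 32
 *
 * (or the reversed shift order), since the 32-bit load already leaves
 * the upper half of the register clear.
 */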
4037 
4038 /* A load/store pair that forms a memory copy should look like the following:
4039  *
4040  *   ld_width R, [addr_src + offset_src]
4041  *   st_width [addr_dest + offset_dest], R
4042  *
4043  * The destination register of the load and the source register of the
4044  * store should be the same, and both should operate at the same width.
4045  * If either addr_src or addr_dest is the stack pointer, we don't do the
4046  * CPP optimization, as the stack is modelled by registers on the NFP.
4047  */
4048 static bool
4049 curr_pair_is_memcpy(struct nfp_insn_meta *ld_meta,
4050             struct nfp_insn_meta *st_meta)
4051 {
4052     struct bpf_insn *ld = &ld_meta->insn;
4053     struct bpf_insn *st = &st_meta->insn;
4054 
4055     if (!is_mbpf_load(ld_meta) || !is_mbpf_store(st_meta))
4056         return false;
4057 
4058     if (ld_meta->ptr.type != PTR_TO_PACKET &&
4059         ld_meta->ptr.type != PTR_TO_MAP_VALUE)
4060         return false;
4061 
4062     if (st_meta->ptr.type != PTR_TO_PACKET)
4063         return false;
4064 
4065     if (BPF_SIZE(ld->code) != BPF_SIZE(st->code))
4066         return false;
4067 
4068     if (ld->dst_reg != st->src_reg)
4069         return false;
4070 
4071     /* There is a jump to the store insn in this pair. */
4072     if (st_meta->flags & FLAG_INSN_IS_JUMP_DST)
4073         return false;
4074 
4075     return true;
4076 }
4077 
4078 /* Currently, we only support chaining load/store pairs if:
4079  *
4080  *  - Their address base registers are the same.
4081  *  - Their address offsets are in the same order.
4082  *  - They operate at the same memory width.
4083  *  - There is no jump into the middle of them.
4084  */
4085 static bool
4086 curr_pair_chain_with_previous(struct nfp_insn_meta *ld_meta,
4087                   struct nfp_insn_meta *st_meta,
4088                   struct bpf_insn *prev_ld,
4089                   struct bpf_insn *prev_st)
4090 {
4091     u8 prev_size, curr_size, prev_ld_base, prev_st_base, prev_ld_dst;
4092     struct bpf_insn *ld = &ld_meta->insn;
4093     struct bpf_insn *st = &st_meta->insn;
4094     s16 prev_ld_off, prev_st_off;
4095 
4096     /* This pair is the start pair. */
4097     if (!prev_ld)
4098         return true;
4099 
4100     prev_size = BPF_LDST_BYTES(prev_ld);
4101     curr_size = BPF_LDST_BYTES(ld);
4102     prev_ld_base = prev_ld->src_reg;
4103     prev_st_base = prev_st->dst_reg;
4104     prev_ld_dst = prev_ld->dst_reg;
4105     prev_ld_off = prev_ld->off;
4106     prev_st_off = prev_st->off;
4107 
4108     if (ld->dst_reg != prev_ld_dst)
4109         return false;
4110 
4111     if (ld->src_reg != prev_ld_base || st->dst_reg != prev_st_base)
4112         return false;
4113 
4114     if (curr_size != prev_size)
4115         return false;
4116 
4117     /* There is a jump to the head of this pair. */
4118     if (ld_meta->flags & FLAG_INSN_IS_JUMP_DST)
4119         return false;
4120 
4121     /* Both in ascending order. */
4122     if (prev_ld_off + prev_size == ld->off &&
4123         prev_st_off + prev_size == st->off)
4124         return true;
4125 
4126     /* Both in descending order. */
4127     if (ld->off + curr_size == prev_ld_off &&
4128         st->off + curr_size == prev_st_off)
4129         return true;
4130 
4131     return false;
4132 }
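/* Illustrative example (editor's note): two 4-byte pairs chain when both
 * offsets advance by the access size in the same direction, e.g.
 *
 *   r0 = *(u32 *)(r1 + 0);   *(u32 *)(r2 + 64) = r0;
 *   r0 = *(u32 *)(r1 + 4);   *(u32 *)(r2 + 68) = r0;
 *
 * Here prev_ld_off + 4 == ld->off and prev_st_off + 4 == st->off, so the
 * second pair extends the chain in ascending order.
 */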
4133 
4134 /* Return TRUE if a cross memory access happens. A cross memory access
4135  * means the store area overlaps the load area such that a later load
4136  * might read the value from a previous store; in this case we can't
4137  * treat the sequence as a memory copy.
4138  */
4139 static bool
4140 cross_mem_access(struct bpf_insn *ld, struct nfp_insn_meta *head_ld_meta,
4141          struct nfp_insn_meta *head_st_meta)
4142 {
4143     s16 head_ld_off, head_st_off, ld_off;
4144 
4145     /* Different pointer types do not overlap. */
4146     if (head_ld_meta->ptr.type != head_st_meta->ptr.type)
4147         return false;
4148 
4149     /* Load and store are both PTR_TO_PACKET, check ID info. */
4150     if (head_ld_meta->ptr.id != head_st_meta->ptr.id)
4151         return true;
4152 
4153     /* Canonicalize the offsets. Turn all of them against the original
4154      * base register.
4155      */
4156     head_ld_off = head_ld_meta->insn.off + head_ld_meta->ptr.off;
4157     head_st_off = head_st_meta->insn.off + head_st_meta->ptr.off;
4158     ld_off = ld->off + head_ld_meta->ptr.off;
4159 
4160     /* Ascending order cross. */
4161     if (ld_off > head_ld_off &&
4162         head_ld_off < head_st_off && ld_off >= head_st_off)
4163         return true;
4164 
4165     /* Descending order cross. */
4166     if (ld_off < head_ld_off &&
4167         head_ld_off > head_st_off && ld_off <= head_st_off)
4168         return true;
4169 
4170     return false;
4171 }
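/* Illustrative example (editor's note, assuming zero variable offsets):
 * with both pointers sharing the same PTR_TO_PACKET id, a head load at
 * offset 0, a head store at offset 4 and a later load at offset 4 give
 * ld_off > head_ld_off, head_ld_off < head_st_off and
 * ld_off >= head_st_off - the later load would read bytes the first
 * store already wrote, so the sequence cannot be treated as a memory
 * copy.
 */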
4172 
4173 /* This pass tries to identify the following instruction sequences.
4174  *
4175  *   load R, [regA + offA]
4176  *   store [regB + offB], R
4177  *   load R, [regA + offA + const_imm_A]
4178  *   store [regB + offB + const_imm_A], R
4179  *   load R, [regA + offA + 2 * const_imm_A]
4180  *   store [regB + offB + 2 * const_imm_A], R
4181  *   ...
4182  *
4183  * The above sequence is typically generated by the compiler when
4184  * lowering memcpy. The NFP prefers using CPP instructions to accelerate it.
4185  */
4186 static void nfp_bpf_opt_ldst_gather(struct nfp_prog *nfp_prog)
4187 {
4188     struct nfp_insn_meta *head_ld_meta = NULL;
4189     struct nfp_insn_meta *head_st_meta = NULL;
4190     struct nfp_insn_meta *meta1, *meta2;
4191     struct bpf_insn *prev_ld = NULL;
4192     struct bpf_insn *prev_st = NULL;
4193     u8 count = 0;
4194 
4195     nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
4196         struct bpf_insn *ld = &meta1->insn;
4197         struct bpf_insn *st = &meta2->insn;
4198 
4199         /* Reset the record status if any of the following is true:
4200          *   - The current insn pair is not a load/store pair.
4201          *   - The load/store pair doesn't chain with the previous one.
4202          *   - The chained load/store pair crosses the previous pair.
4203          *   - The chained load/store pair has a total memory copy size
4204          *     beyond 128 bytes, which is the maximum length a single
4205          *     NFP CPP command can transfer.
4206          */
4207         if (!curr_pair_is_memcpy(meta1, meta2) ||
4208             !curr_pair_chain_with_previous(meta1, meta2, prev_ld,
4209                            prev_st) ||
4210             (head_ld_meta && (cross_mem_access(ld, head_ld_meta,
4211                                head_st_meta) ||
4212                       head_ld_meta->ldst_gather_len >= 128))) {
4213             if (!count)
4214                 continue;
4215 
4216             if (count > 1) {
4217                 s16 prev_ld_off = prev_ld->off;
4218                 s16 prev_st_off = prev_st->off;
4219                 s16 head_ld_off = head_ld_meta->insn.off;
4220 
4221                 if (prev_ld_off < head_ld_off) {
4222                     head_ld_meta->insn.off = prev_ld_off;
4223                     head_st_meta->insn.off = prev_st_off;
4224                     head_ld_meta->ldst_gather_len =
4225                         -head_ld_meta->ldst_gather_len;
4226                 }
4227 
4228                 head_ld_meta->paired_st = &head_st_meta->insn;
4229                 head_st_meta->flags |=
4230                     FLAG_INSN_SKIP_PREC_DEPENDENT;
4231             } else {
4232                 head_ld_meta->ldst_gather_len = 0;
4233             }
4234 
4235             /* If the chain is ended by a load/store pair then this
4236              * could serve as the new head of the next chain.
4237              */
4238             if (curr_pair_is_memcpy(meta1, meta2)) {
4239                 head_ld_meta = meta1;
4240                 head_st_meta = meta2;
4241                 head_ld_meta->ldst_gather_len =
4242                     BPF_LDST_BYTES(ld);
4243                 meta1 = nfp_meta_next(meta1);
4244                 meta2 = nfp_meta_next(meta2);
4245                 prev_ld = ld;
4246                 prev_st = st;
4247                 count = 1;
4248             } else {
4249                 head_ld_meta = NULL;
4250                 head_st_meta = NULL;
4251                 prev_ld = NULL;
4252                 prev_st = NULL;
4253                 count = 0;
4254             }
4255 
4256             continue;
4257         }
4258 
4259         if (!head_ld_meta) {
4260             head_ld_meta = meta1;
4261             head_st_meta = meta2;
4262         } else {
4263             meta1->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
4264             meta2->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
4265         }
4266 
4267         head_ld_meta->ldst_gather_len += BPF_LDST_BYTES(ld);
4268         meta1 = nfp_meta_next(meta1);
4269         meta2 = nfp_meta_next(meta2);
4270         prev_ld = ld;
4271         prev_st = st;
4272         count++;
4273     }
4274 }
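/* Editor's sketch of this pass's effect: for a chain of two 4-byte
 * load/store pairs, the head load ends up with ldst_gather_len == 8 and
 * paired_st pointing at the head store, while the other three
 * instructions are flagged FLAG_INSN_SKIP_PREC_DEPENDENT; translation
 * can then emit one CPP transfer covering the whole copy.
 */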
4275 
4276 static void nfp_bpf_opt_pkt_cache(struct nfp_prog *nfp_prog)
4277 {
4278     struct nfp_insn_meta *meta, *range_node = NULL;
4279     s16 range_start = 0, range_end = 0;
4280     bool cache_avail = false;
4281     struct bpf_insn *insn;
4282     s32 range_ptr_off = 0;
4283     u32 range_ptr_id = 0;
4284 
4285     list_for_each_entry(meta, &nfp_prog->insns, l) {
4286         if (meta->flags & FLAG_INSN_IS_JUMP_DST)
4287             cache_avail = false;
4288 
4289         if (meta->flags & FLAG_INSN_SKIP_MASK)
4290             continue;
4291 
4292         insn = &meta->insn;
4293 
4294         if (is_mbpf_store_pkt(meta) ||
4295             insn->code == (BPF_JMP | BPF_CALL) ||
4296             is_mbpf_classic_store_pkt(meta) ||
4297             is_mbpf_classic_load(meta)) {
4298             cache_avail = false;
4299             continue;
4300         }
4301 
4302         if (!is_mbpf_load(meta))
4303             continue;
4304 
4305         if (meta->ptr.type != PTR_TO_PACKET || meta->ldst_gather_len) {
4306             cache_avail = false;
4307             continue;
4308         }
4309 
4310         if (!cache_avail) {
4311             cache_avail = true;
4312             if (range_node)
4313                 goto end_current_then_start_new;
4314             goto start_new;
4315         }
4316 
4317         /* Check ID to make sure two reads share the same
4318          * variable offset against PTR_TO_PACKET, and check OFF
4319          * to make sure they also share the same constant
4320          * offset.
4321          *
4322          * OFFs don't really need to be the same, because they
4323          * are the constant offsets against PTR_TO_PACKET, so
4324          * for different OFFs we could canonicalize them to
4325          * offsets against the original packet pointer, but we
4326          * don't support this.
4327          */
4328         if (meta->ptr.id == range_ptr_id &&
4329             meta->ptr.off == range_ptr_off) {
4330             s16 new_start = range_start;
4331             s16 end, off = insn->off;
4332             s16 new_end = range_end;
4333             bool changed = false;
4334 
4335             if (off < range_start) {
4336                 new_start = off;
4337                 changed = true;
4338             }
4339 
4340             end = off + BPF_LDST_BYTES(insn);
4341             if (end > range_end) {
4342                 new_end = end;
4343                 changed = true;
4344             }
4345 
4346             if (!changed)
4347                 continue;
4348 
4349             if (new_end - new_start <= 64) {
4350                 /* Install new range. */
4351                 range_start = new_start;
4352                 range_end = new_end;
4353                 continue;
4354             }
4355         }
4356 
4357 end_current_then_start_new:
4358         range_node->pkt_cache.range_start = range_start;
4359         range_node->pkt_cache.range_end = range_end;
4360 start_new:
4361         range_node = meta;
4362         range_node->pkt_cache.do_init = true;
4363         range_ptr_id = range_node->ptr.id;
4364         range_ptr_off = range_node->ptr.off;
4365         range_start = insn->off;
4366         range_end = insn->off + BPF_LDST_BYTES(insn);
4367     }
4368 
4369     if (range_node) {
4370         range_node->pkt_cache.range_start = range_start;
4371         range_node->pkt_cache.range_end = range_end;
4372     }
4373 
4374     list_for_each_entry(meta, &nfp_prog->insns, l) {
4375         if (meta->flags & FLAG_INSN_SKIP_MASK)
4376             continue;
4377 
4378         if (is_mbpf_load_pkt(meta) && !meta->ldst_gather_len) {
4379             if (meta->pkt_cache.do_init) {
4380                 range_start = meta->pkt_cache.range_start;
4381                 range_end = meta->pkt_cache.range_end;
4382             } else {
4383                 meta->pkt_cache.range_start = range_start;
4384                 meta->pkt_cache.range_end = range_end;
4385             }
4386         }
4387     }
4388 }
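/* Illustrative example (editor's note): two packet loads off the same
 * pointer (same id and variable offset), a 4-byte read at insn->off 12
 * and a 2-byte read at insn->off 40, merge into one cached range
 * [12, 42) since 42 - 12 <= 64; the first load gets pkt_cache.do_init
 * set and the second simply reuses the installed range.
 */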
4389 
4390 static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
4391 {
4392     nfp_bpf_opt_reg_init(nfp_prog);
4393 
4394     nfp_bpf_opt_neg_add_sub(nfp_prog);
4395     nfp_bpf_opt_ld_mask(nfp_prog);
4396     nfp_bpf_opt_ld_shift(nfp_prog);
4397     nfp_bpf_opt_ldst_gather(nfp_prog);
4398     nfp_bpf_opt_pkt_cache(nfp_prog);
4399 
4400     return 0;
4401 }
4402 
4403 static int nfp_bpf_replace_map_ptrs(struct nfp_prog *nfp_prog)
4404 {
4405     struct nfp_insn_meta *meta1, *meta2;
4406     struct nfp_bpf_map *nfp_map;
4407     struct bpf_map *map;
4408     u32 id;
4409 
4410     nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
4411         if (meta1->flags & FLAG_INSN_SKIP_MASK ||
4412             meta2->flags & FLAG_INSN_SKIP_MASK)
4413             continue;
4414 
4415         if (meta1->insn.code != (BPF_LD | BPF_IMM | BPF_DW) ||
4416             meta1->insn.src_reg != BPF_PSEUDO_MAP_FD)
4417             continue;
4418 
4419         map = (void *)(unsigned long)((u32)meta1->insn.imm |
4420                           (u64)meta2->insn.imm << 32);
4421         if (bpf_map_offload_neutral(map)) {
4422             id = map->id;
4423         } else {
4424             nfp_map = map_to_offmap(map)->dev_priv;
4425             id = nfp_map->tid;
4426         }
4427 
4428         meta1->insn.imm = id;
4429         meta2->insn.imm = 0;
4430     }
4431 
4432     return 0;
4433 }
4434 
4435 static int nfp_bpf_ustore_calc(u64 *prog, unsigned int len)
4436 {
4437     __le64 *ustore = (__force __le64 *)prog;
4438     int i;
4439 
4440     for (i = 0; i < len; i++) {
4441         int err;
4442 
4443         err = nfp_ustore_check_valid_no_ecc(prog[i]);
4444         if (err)
4445             return err;
4446 
4447         ustore[i] = cpu_to_le64(nfp_ustore_calc_ecc_insn(prog[i]));
4448     }
4449 
4450     return 0;
4451 }
4452 
4453 static void nfp_bpf_prog_trim(struct nfp_prog *nfp_prog)
4454 {
4455     void *prog;
4456 
4457     prog = kvmalloc_array(nfp_prog->prog_len, sizeof(u64), GFP_KERNEL);
4458     if (!prog)
4459         return;
4460 
4461     nfp_prog->__prog_alloc_len = nfp_prog->prog_len * sizeof(u64);
4462     memcpy(prog, nfp_prog->prog, nfp_prog->__prog_alloc_len);
4463     kvfree(nfp_prog->prog);
4464     nfp_prog->prog = prog;
4465 }
4466 
4467 int nfp_bpf_jit(struct nfp_prog *nfp_prog)
4468 {
4469     int ret;
4470 
4471     ret = nfp_bpf_replace_map_ptrs(nfp_prog);
4472     if (ret)
4473         return ret;
4474 
4475     ret = nfp_bpf_optimize(nfp_prog);
4476     if (ret)
4477         return ret;
4478 
4479     ret = nfp_translate(nfp_prog);
4480     if (ret) {
4481         pr_err("Translation failed with error %d (translated: %u)\n",
4482                ret, nfp_prog->n_translated);
4483         return -EINVAL;
4484     }
4485 
4486     nfp_bpf_prog_trim(nfp_prog);
4487 
4488     return ret;
4489 }
4490 
4491 void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog)
4492 {
4493     struct nfp_insn_meta *meta;
4494 
4495     /* Another pass to record jump information. */
4496     list_for_each_entry(meta, &nfp_prog->insns, l) {
4497         struct nfp_insn_meta *dst_meta;
4498         u64 code = meta->insn.code;
4499         unsigned int dst_idx;
4500         bool pseudo_call;
4501 
4502         if (!is_mbpf_jmp(meta))
4503             continue;
4504         if (BPF_OP(code) == BPF_EXIT)
4505             continue;
4506         if (is_mbpf_helper_call(meta))
4507             continue;
4508 
4509         /* If opcode is BPF_CALL at this point, this can only be a
4510          * BPF-to-BPF call (a.k.a. pseudo call).
4511          */
4512         pseudo_call = BPF_OP(code) == BPF_CALL;
4513 
4514         if (pseudo_call)
4515             dst_idx = meta->n + 1 + meta->insn.imm;
4516         else
4517             dst_idx = meta->n + 1 + meta->insn.off;
4518 
4519         dst_meta = nfp_bpf_goto_meta(nfp_prog, meta, dst_idx);
4520 
4521         if (pseudo_call)
4522             dst_meta->flags |= FLAG_INSN_IS_SUBPROG_START;
4523 
4524         dst_meta->flags |= FLAG_INSN_IS_JUMP_DST;
4525         meta->jmp_dst = dst_meta;
4526     }
4527 }
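/* Illustrative example (editor's note): both imm and off encode a
 * distance relative to the *next* instruction, hence the meta->n + 1
 * terms above.  A "goto +3" at BPF index 5 targets index 5 + 1 + 3 == 9,
 * and a pseudo call with imm == 2 at index 5 starts its callee at
 * index 8.
 */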
4528 
4529 bool nfp_bpf_supported_opcode(u8 code)
4530 {
4531     return !!instr_cb[code];
4532 }
4533 
4534 void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
4535 {
4536     unsigned int i;
4537     u64 *prog;
4538     int err;
4539 
4540     prog = kmemdup(nfp_prog->prog, nfp_prog->prog_len * sizeof(u64),
4541                GFP_KERNEL);
4542     if (!prog)
4543         return ERR_PTR(-ENOMEM);
4544 
4545     for (i = 0; i < nfp_prog->prog_len; i++) {
4546         enum nfp_relo_type special;
4547         u32 val;
4548         u16 off;
4549 
4550         special = FIELD_GET(OP_RELO_TYPE, prog[i]);
4551         switch (special) {
4552         case RELO_NONE:
4553             continue;
4554         case RELO_BR_REL:
4555             br_add_offset(&prog[i], bv->start_off);
4556             break;
4557         case RELO_BR_GO_OUT:
4558             br_set_offset(&prog[i],
4559                       nfp_prog->tgt_out + bv->start_off);
4560             break;
4561         case RELO_BR_GO_ABORT:
4562             br_set_offset(&prog[i],
4563                       nfp_prog->tgt_abort + bv->start_off);
4564             break;
4565         case RELO_BR_GO_CALL_PUSH_REGS:
4566             if (!nfp_prog->tgt_call_push_regs) {
4567                 pr_err("BUG: failed to detect subprogram registers needs\n");
4568                 err = -EINVAL;
4569                 goto err_free_prog;
4570             }
4571             off = nfp_prog->tgt_call_push_regs + bv->start_off;
4572             br_set_offset(&prog[i], off);
4573             break;
4574         case RELO_BR_GO_CALL_POP_REGS:
4575             if (!nfp_prog->tgt_call_pop_regs) {
4576                 pr_err("BUG: failed to detect subprogram registers needs\n");
4577                 err = -EINVAL;
4578                 goto err_free_prog;
4579             }
4580             off = nfp_prog->tgt_call_pop_regs + bv->start_off;
4581             br_set_offset(&prog[i], off);
4582             break;
4583         case RELO_BR_NEXT_PKT:
4584             br_set_offset(&prog[i], bv->tgt_done);
4585             break;
4586         case RELO_BR_HELPER:
4587             val = br_get_offset(prog[i]);
4588             val -= BR_OFF_RELO;
4589             switch (val) {
4590             case BPF_FUNC_map_lookup_elem:
4591                 val = nfp_prog->bpf->helpers.map_lookup;
4592                 break;
4593             case BPF_FUNC_map_update_elem:
4594                 val = nfp_prog->bpf->helpers.map_update;
4595                 break;
4596             case BPF_FUNC_map_delete_elem:
4597                 val = nfp_prog->bpf->helpers.map_delete;
4598                 break;
4599             case BPF_FUNC_perf_event_output:
4600                 val = nfp_prog->bpf->helpers.perf_event_output;
4601                 break;
4602             default:
4603                 pr_err("relocation of unknown helper %d\n",
4604                        val);
4605                 err = -EINVAL;
4606                 goto err_free_prog;
4607             }
4608             br_set_offset(&prog[i], val);
4609             break;
4610         case RELO_IMMED_REL:
4611             immed_add_value(&prog[i], bv->start_off);
4612             break;
4613         }
4614 
4615         prog[i] &= ~OP_RELO_TYPE;
4616     }
4617 
4618     err = nfp_bpf_ustore_calc(prog, nfp_prog->prog_len);
4619     if (err)
4620         goto err_free_prog;
4621 
4622     return prog;
4623 
4624 err_free_prog:
4625     kfree(prog);
4626     return ERR_PTR(err);
4627 }
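/* Editor's sketch of the relocation step above (numbers are
 * hypothetical): a RELO_BR_REL branch targeting program-local offset 20,
 * in a program loaded at start_off == 100, ends up pointing at
 * micro-store address 120 before the OP_RELO_TYPE bits are cleared.
 */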