0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * NVMe over Fabrics TCP host.
0004  * Copyright (c) 2018 Lightbits Labs. All rights reserved.
0005  */
0006 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
0007 #include <linux/module.h>
0008 #include <linux/init.h>
0009 #include <linux/slab.h>
0010 #include <linux/err.h>
0011 #include <linux/nvme-tcp.h>
0012 #include <net/sock.h>
0013 #include <net/tcp.h>
0014 #include <linux/blk-mq.h>
0015 #include <crypto/hash.h>
0016 #include <net/busy_poll.h>
0017 
0018 #include "nvme.h"
0019 #include "fabrics.h"
0020 
0021 struct nvme_tcp_queue;
0022 
0023 /* Define the socket priority to use for connections where it is desirable
0024  * that the NIC consider performing optimized packet processing or filtering.
0025  * A non-zero value is sufficient to indicate general consideration of any
0026  * possible optimization.  Making it a module param allows for alternative
0027  * values that may be unique for some NIC implementations.
0028  */
0029 static int so_priority;
0030 module_param(so_priority, int, 0644);
0031 MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
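/* Usage note (illustrative): a non-default priority can be requested at
 * load time, e.g. "modprobe nvme-tcp so_priority=1", or at runtime via
 * /sys/module/nvme_tcp/parameters/so_priority (hence the 0644 mode above).
 * When non-zero, the value is applied to every queue's socket with
 * sock_set_priority() in nvme_tcp_alloc_queue().
 */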
0032 
0033 #ifdef CONFIG_DEBUG_LOCK_ALLOC
0034 /* lockdep can detect a circular dependency of the form
0035  *   sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
0036  * because dependencies are tracked for both nvme-tcp and user contexts. Using
0037  * a separate class prevents lockdep from conflating nvme-tcp socket use with
0038  * user-space socket API use.
0039  */
0040 static struct lock_class_key nvme_tcp_sk_key[2];
0041 static struct lock_class_key nvme_tcp_slock_key[2];
0042 
0043 static void nvme_tcp_reclassify_socket(struct socket *sock)
0044 {
0045     struct sock *sk = sock->sk;
0046 
0047     if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
0048         return;
0049 
0050     switch (sk->sk_family) {
0051     case AF_INET:
0052         sock_lock_init_class_and_name(sk, "slock-AF_INET-NVME",
0053                           &nvme_tcp_slock_key[0],
0054                           "sk_lock-AF_INET-NVME",
0055                           &nvme_tcp_sk_key[0]);
0056         break;
0057     case AF_INET6:
0058         sock_lock_init_class_and_name(sk, "slock-AF_INET6-NVME",
0059                           &nvme_tcp_slock_key[1],
0060                           "sk_lock-AF_INET6-NVME",
0061                           &nvme_tcp_sk_key[1]);
0062         break;
0063     default:
0064         WARN_ON_ONCE(1);
0065     }
0066 }
0067 #else
0068 static void nvme_tcp_reclassify_socket(struct socket *sock) { }
0069 #endif
0070 
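/* Per-request send-side state machine, advanced by nvme_tcp_try_send():
 * SEND_CMD_PDU, then SEND_DATA for in-capsule (inline) writes, or
 * SEND_H2C_PDU -> SEND_DATA for controller-solicited (R2T) data, with a
 * final SEND_DDGST step when data digest is negotiated.
 */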
0071 enum nvme_tcp_send_state {
0072     NVME_TCP_SEND_CMD_PDU = 0,
0073     NVME_TCP_SEND_H2C_PDU,
0074     NVME_TCP_SEND_DATA,
0075     NVME_TCP_SEND_DDGST,
0076 };
0077 
0078 struct nvme_tcp_request {
0079     struct nvme_request req;
0080     void            *pdu;
0081     struct nvme_tcp_queue   *queue;
0082     u32         data_len;
0083     u32         pdu_len;
0084     u32         pdu_sent;
0085     u32         h2cdata_left;
0086     u32         h2cdata_offset;
0087     u16         ttag;
0088     __le16          status;
0089     struct list_head    entry;
0090     struct llist_node   lentry;
0091     __le32          ddgst;
0092 
0093     struct bio      *curr_bio;
0094     struct iov_iter     iter;
0095 
0096     /* send state */
0097     size_t          offset;
0098     size_t          data_sent;
0099     enum nvme_tcp_send_state state;
0100 };
0101 
0102 enum nvme_tcp_queue_flags {
0103     NVME_TCP_Q_ALLOCATED    = 0,
0104     NVME_TCP_Q_LIVE     = 1,
0105     NVME_TCP_Q_POLLING  = 2,
0106 };
0107 
0108 enum nvme_tcp_recv_state {
0109     NVME_TCP_RECV_PDU = 0,
0110     NVME_TCP_RECV_DATA,
0111     NVME_TCP_RECV_DDGST,
0112 };
0113 
0114 struct nvme_tcp_ctrl;
0115 struct nvme_tcp_queue {
0116     struct socket       *sock;
0117     struct work_struct  io_work;
0118     int         io_cpu;
0119 
0120     struct mutex        queue_lock;
0121     struct mutex        send_mutex;
0122     struct llist_head   req_list;
0123     struct list_head    send_list;
0124 
0125     /* recv state */
0126     void            *pdu;
0127     int         pdu_remaining;
0128     int         pdu_offset;
0129     size_t          data_remaining;
0130     size_t          ddgst_remaining;
0131     unsigned int        nr_cqe;
0132 
0133     /* send state */
0134     struct nvme_tcp_request *request;
0135 
0136     int         queue_size;
0137     u32         maxh2cdata;
0138     size_t          cmnd_capsule_len;
0139     struct nvme_tcp_ctrl    *ctrl;
0140     unsigned long       flags;
0141     bool            rd_enabled;
0142 
0143     bool            hdr_digest;
0144     bool            data_digest;
0145     struct ahash_request    *rcv_hash;
0146     struct ahash_request    *snd_hash;
0147     __le32          exp_ddgst;
0148     __le32          recv_ddgst;
0149 
0150     struct page_frag_cache  pf_cache;
0151 
0152     void (*state_change)(struct sock *);
0153     void (*data_ready)(struct sock *);
0154     void (*write_space)(struct sock *);
0155 };
0156 
0157 struct nvme_tcp_ctrl {
0158     /* read only in the hot path */
0159     struct nvme_tcp_queue   *queues;
0160     struct blk_mq_tag_set   tag_set;
0161 
0162     /* other member variables */
0163     struct list_head    list;
0164     struct blk_mq_tag_set   admin_tag_set;
0165     struct sockaddr_storage addr;
0166     struct sockaddr_storage src_addr;
0167     struct nvme_ctrl    ctrl;
0168 
0169     struct work_struct  err_work;
0170     struct delayed_work connect_work;
0171     struct nvme_tcp_request async_req;
0172     u32         io_queues[HCTX_MAX_TYPES];
0173 };
0174 
0175 static LIST_HEAD(nvme_tcp_ctrl_list);
0176 static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
0177 static struct workqueue_struct *nvme_tcp_wq;
0178 static const struct blk_mq_ops nvme_tcp_mq_ops;
0179 static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
0180 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
0181 
0182 static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
0183 {
0184     return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
0185 }
0186 
0187 static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
0188 {
0189     return queue - queue->ctrl->queues;
0190 }
0191 
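/* Queue 0 is the admin queue and uses the admin tag set; I/O queues use
 * the shared I/O tag set, indexed by qid - 1.
 */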
0192 static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
0193 {
0194     u32 queue_idx = nvme_tcp_queue_id(queue);
0195 
0196     if (queue_idx == 0)
0197         return queue->ctrl->admin_tag_set.tags[queue_idx];
0198     return queue->ctrl->tag_set.tags[queue_idx - 1];
0199 }
0200 
0201 static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
0202 {
0203     return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
0204 }
0205 
0206 static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
0207 {
0208     return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
0209 }
0210 
0211 static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req)
0212 {
0213     if (nvme_is_fabrics(req->req.cmd))
0214         return NVME_TCP_ADMIN_CCSZ;
0215     return req->queue->cmnd_capsule_len - sizeof(struct nvme_command);
0216 }
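/* The in-capsule (inline) data budget is what remains of the command
 * capsule after the 64-byte SQE.  For I/O queues cmnd_capsule_len is
 * ioccsz * 16 (see nvme_tcp_alloc_queue()); e.g. (illustrative) an ioccsz
 * of 260 allows 260 * 16 - 64 = 4096 bytes of inline write data, while
 * admin/fabrics commands are limited to NVME_TCP_ADMIN_CCSZ.
 */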
0217 
0218 static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
0219 {
0220     return req == &req->queue->ctrl->async_req;
0221 }
0222 
0223 static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
0224 {
0225     struct request *rq;
0226 
0227     if (unlikely(nvme_tcp_async_req(req)))
0228         return false; /* async events don't have a request */
0229 
0230     rq = blk_mq_rq_from_pdu(req);
0231 
0232     return rq_data_dir(rq) == WRITE && req->data_len &&
0233         req->data_len <= nvme_tcp_inline_data_size(req);
0234 }
0235 
0236 static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
0237 {
0238     return req->iter.bvec->bv_page;
0239 }
0240 
0241 static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
0242 {
0243     return req->iter.bvec->bv_offset + req->iter.iov_offset;
0244 }
0245 
0246 static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
0247 {
0248     return min_t(size_t, iov_iter_single_seg_count(&req->iter),
0249             req->pdu_len - req->pdu_sent);
0250 }
0251 
0252 static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
0253 {
0254     return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
0255             req->pdu_len - req->pdu_sent : 0;
0256 }
0257 
0258 static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
0259         int len)
0260 {
0261     return nvme_tcp_pdu_data_left(req) <= len;
0262 }
0263 
0264 static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
0265         unsigned int dir)
0266 {
0267     struct request *rq = blk_mq_rq_from_pdu(req);
0268     struct bio_vec *vec;
0269     unsigned int size;
0270     int nr_bvec;
0271     size_t offset;
0272 
0273     if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
0274         vec = &rq->special_vec;
0275         nr_bvec = 1;
0276         size = blk_rq_payload_bytes(rq);
0277         offset = 0;
0278     } else {
0279         struct bio *bio = req->curr_bio;
0280         struct bvec_iter bi;
0281         struct bio_vec bv;
0282 
0283         vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
0284         nr_bvec = 0;
0285         bio_for_each_bvec(bv, bio, bi) {
0286             nr_bvec++;
0287         }
0288         size = bio->bi_iter.bi_size;
0289         offset = bio->bi_iter.bi_bvec_done;
0290     }
0291 
0292     iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size);
0293     req->iter.iov_offset = offset;
0294 }
0295 
0296 static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
0297         int len)
0298 {
0299     req->data_sent += len;
0300     req->pdu_sent += len;
0301     iov_iter_advance(&req->iter, len);
0302     if (!iov_iter_count(&req->iter) &&
0303         req->data_sent < req->data_len) {
0304         req->curr_bio = req->curr_bio->bi_next;
0305         nvme_tcp_init_iter(req, WRITE);
0306     }
0307 }
0308 
0309 static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
0310 {
0311     int ret;
0312 
0313     /* drain the send queue as much as we can... */
0314     do {
0315         ret = nvme_tcp_try_send(queue);
0316     } while (ret > 0);
0317 }
0318 
0319 static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
0320 {
0321     return !list_empty(&queue->send_list) ||
0322         !llist_empty(&queue->req_list);
0323 }
0324 
0325 static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
0326         bool sync, bool last)
0327 {
0328     struct nvme_tcp_queue *queue = req->queue;
0329     bool empty;
0330 
0331     empty = llist_add(&req->lentry, &queue->req_list) &&
0332         list_empty(&queue->send_list) && !queue->request;
0333 
0334     /*
0335      * If we're the first on the send_list, try to send directly;
0336      * otherwise queue io_work. Only do that if we are on the same
0337      * cpu, so we don't introduce contention.
0338      */
0339     if (queue->io_cpu == raw_smp_processor_id() &&
0340         sync && empty && mutex_trylock(&queue->send_mutex)) {
0341         nvme_tcp_send_all(queue);
0342         mutex_unlock(&queue->send_mutex);
0343     }
0344 
0345     if (last && nvme_tcp_queue_more(queue))
0346         queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
0347 }
0348 
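/* Requests are queued lock-free onto req_list (an llist, so LIFO) and
 * spliced here onto send_list under send_mutex.  llist_del_all() hands the
 * entries back newest-first and list_add() prepends each one, so the two
 * reversals cancel and send_list ends up in submission (FIFO) order.
 */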
0349 static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
0350 {
0351     struct nvme_tcp_request *req;
0352     struct llist_node *node;
0353 
0354     for (node = llist_del_all(&queue->req_list); node; node = node->next) {
0355         req = llist_entry(node, struct nvme_tcp_request, lentry);
0356         list_add(&req->entry, &queue->send_list);
0357     }
0358 }
0359 
0360 static inline struct nvme_tcp_request *
0361 nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
0362 {
0363     struct nvme_tcp_request *req;
0364 
0365     req = list_first_entry_or_null(&queue->send_list,
0366             struct nvme_tcp_request, entry);
0367     if (!req) {
0368         nvme_tcp_process_req_list(queue);
0369         req = list_first_entry_or_null(&queue->send_list,
0370                 struct nvme_tcp_request, entry);
0371         if (unlikely(!req))
0372             return NULL;
0373     }
0374 
0375     list_del(&req->entry);
0376     return req;
0377 }
0378 
0379 static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
0380         __le32 *dgst)
0381 {
0382     ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
0383     crypto_ahash_final(hash);
0384 }
0385 
0386 static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
0387         struct page *page, off_t off, size_t len)
0388 {
0389     struct scatterlist sg;
0390 
0391     sg_init_marker(&sg, 1);
0392     sg_set_page(&sg, page, len, off);
0393     ahash_request_set_crypt(hash, &sg, NULL, len);
0394     crypto_ahash_update(hash);
0395 }
0396 
0397 static inline void nvme_tcp_hdgst(struct ahash_request *hash,
0398         void *pdu, size_t len)
0399 {
0400     struct scatterlist sg;
0401 
0402     sg_init_one(&sg, pdu, len);
0403     ahash_request_set_crypt(hash, &sg, pdu + len, len);
0404     crypto_ahash_digest(hash);
0405 }
0406 
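/* When header digest is negotiated, a CRC32C of the PDU header sits right
 * after it on the wire: | header (hlen bytes) | HDGST (4 bytes) | payload |.
 * nvme_tcp_hdgst() above writes the computed digest at pdu + len, which is
 * why nvme_tcp_verify_hdgst() below saves the received value first and only
 * then recomputes it in place for comparison.
 */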
0407 static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
0408         void *pdu, size_t pdu_len)
0409 {
0410     struct nvme_tcp_hdr *hdr = pdu;
0411     __le32 recv_digest;
0412     __le32 exp_digest;
0413 
0414     if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
0415         dev_err(queue->ctrl->ctrl.device,
0416             "queue %d: header digest flag is cleared\n",
0417             nvme_tcp_queue_id(queue));
0418         return -EPROTO;
0419     }
0420 
0421     recv_digest = *(__le32 *)(pdu + hdr->hlen);
0422     nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
0423     exp_digest = *(__le32 *)(pdu + hdr->hlen);
0424     if (recv_digest != exp_digest) {
0425         dev_err(queue->ctrl->ctrl.device,
0426             "header digest error: recv %#x expected %#x\n",
0427             le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
0428         return -EIO;
0429     }
0430 
0431     return 0;
0432 }
0433 
0434 static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
0435 {
0436     struct nvme_tcp_hdr *hdr = pdu;
0437     u8 digest_len = nvme_tcp_hdgst_len(queue);
0438     u32 len;
0439 
0440     len = le32_to_cpu(hdr->plen) - hdr->hlen -
0441         ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
0442 
0443     if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
0444         dev_err(queue->ctrl->ctrl.device,
0445             "queue %d: data digest flag is cleared\n",
0446             nvme_tcp_queue_id(queue));
0447         return -EPROTO;
0448     }
0449     crypto_ahash_init(queue->rcv_hash);
0450 
0451     return 0;
0452 }
0453 
0454 static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
0455         struct request *rq, unsigned int hctx_idx)
0456 {
0457     struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
0458 
0459     page_frag_free(req->pdu);
0460 }
0461 
0462 static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
0463         struct request *rq, unsigned int hctx_idx,
0464         unsigned int numa_node)
0465 {
0466     struct nvme_tcp_ctrl *ctrl = set->driver_data;
0467     struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
0468     struct nvme_tcp_cmd_pdu *pdu;
0469     int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
0470     struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
0471     u8 hdgst = nvme_tcp_hdgst_len(queue);
0472 
0473     req->pdu = page_frag_alloc(&queue->pf_cache,
0474         sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
0475         GFP_KERNEL | __GFP_ZERO);
0476     if (!req->pdu)
0477         return -ENOMEM;
0478 
0479     pdu = req->pdu;
0480     req->queue = queue;
0481     nvme_req(rq)->ctrl = &ctrl->ctrl;
0482     nvme_req(rq)->cmd = &pdu->cmd;
0483 
0484     return 0;
0485 }
0486 
0487 static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
0488         unsigned int hctx_idx)
0489 {
0490     struct nvme_tcp_ctrl *ctrl = data;
0491     struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
0492 
0493     hctx->driver_data = queue;
0494     return 0;
0495 }
0496 
0497 static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
0498         unsigned int hctx_idx)
0499 {
0500     struct nvme_tcp_ctrl *ctrl = data;
0501     struct nvme_tcp_queue *queue = &ctrl->queues[0];
0502 
0503     hctx->driver_data = queue;
0504     return 0;
0505 }
0506 
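/* Receive-side state: pdu_remaining > 0 means we are still collecting a
 * PDU header, ddgst_remaining > 0 means we are collecting the trailing
 * data digest, otherwise we are consuming C2HData payload.
 * nvme_tcp_init_recv_ctx() re-arms the context to expect the next PDU.
 */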
0507 static enum nvme_tcp_recv_state
0508 nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
0509 {
0510     return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
0511         (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
0512         NVME_TCP_RECV_DATA;
0513 }
0514 
0515 static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
0516 {
0517     queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
0518                 nvme_tcp_hdgst_len(queue);
0519     queue->pdu_offset = 0;
0520     queue->data_remaining = -1;
0521     queue->ddgst_remaining = 0;
0522 }
0523 
0524 static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
0525 {
0526     if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
0527         return;
0528 
0529     dev_warn(ctrl->device, "starting error recovery\n");
0530     queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
0531 }
0532 
0533 static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
0534         struct nvme_completion *cqe)
0535 {
0536     struct nvme_tcp_request *req;
0537     struct request *rq;
0538 
0539     rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id);
0540     if (!rq) {
0541         dev_err(queue->ctrl->ctrl.device,
0542             "got bad cqe.command_id %#x on queue %d\n",
0543             cqe->command_id, nvme_tcp_queue_id(queue));
0544         nvme_tcp_error_recovery(&queue->ctrl->ctrl);
0545         return -EINVAL;
0546     }
0547 
0548     req = blk_mq_rq_to_pdu(rq);
0549     if (req->status == cpu_to_le16(NVME_SC_SUCCESS))
0550         req->status = cqe->status;
0551 
0552     if (!nvme_try_complete_req(rq, req->status, cqe->result))
0553         nvme_complete_rq(rq);
0554     queue->nr_cqe++;
0555 
0556     return 0;
0557 }
0558 
0559 static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
0560         struct nvme_tcp_data_pdu *pdu)
0561 {
0562     struct request *rq;
0563 
0564     rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
0565     if (!rq) {
0566         dev_err(queue->ctrl->ctrl.device,
0567             "got bad c2hdata.command_id %#x on queue %d\n",
0568             pdu->command_id, nvme_tcp_queue_id(queue));
0569         return -ENOENT;
0570     }
0571 
0572     if (!blk_rq_payload_bytes(rq)) {
0573         dev_err(queue->ctrl->ctrl.device,
0574             "queue %d tag %#x unexpected data\n",
0575             nvme_tcp_queue_id(queue), rq->tag);
0576         return -EIO;
0577     }
0578 
0579     queue->data_remaining = le32_to_cpu(pdu->data_length);
0580 
0581     if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
0582         unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
0583         dev_err(queue->ctrl->ctrl.device,
0584             "queue %d tag %#x SUCCESS set but not last PDU\n",
0585             nvme_tcp_queue_id(queue), rq->tag);
0586         nvme_tcp_error_recovery(&queue->ctrl->ctrl);
0587         return -EPROTO;
0588     }
0589 
0590     return 0;
0591 }
0592 
0593 static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
0594         struct nvme_tcp_rsp_pdu *pdu)
0595 {
0596     struct nvme_completion *cqe = &pdu->cqe;
0597     int ret = 0;
0598 
0599     /*
0600      * AEN requests are special as they don't time out and can
0601      * survive any kind of queue freeze and often don't respond to
0602      * aborts.  We don't even bother to allocate a struct request
0603      * for them but rather special case them here.
0604      */
0605     if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
0606                      cqe->command_id)))
0607         nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
0608                 &cqe->result);
0609     else
0610         ret = nvme_tcp_process_nvme_cqe(queue, cqe);
0611 
0612     return ret;
0613 }
0614 
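/* Build an H2CData PDU for a chunk of write data solicited by an R2T.
 * Each chunk is capped by the MAXH2CDATA value negotiated in ICResp;
 * e.g. (illustrative numbers) a 64K R2T with maxh2cdata of 16K is sent as
 * four PDUs, with NVME_TCP_F_DATA_LAST set only on the final one.  plen
 * covers the header, optional header digest, payload and optional data
 * digest.
 */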
0615 static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req)
0616 {
0617     struct nvme_tcp_data_pdu *data = req->pdu;
0618     struct nvme_tcp_queue *queue = req->queue;
0619     struct request *rq = blk_mq_rq_from_pdu(req);
0620     u32 h2cdata_sent = req->pdu_len;
0621     u8 hdgst = nvme_tcp_hdgst_len(queue);
0622     u8 ddgst = nvme_tcp_ddgst_len(queue);
0623 
0624     req->state = NVME_TCP_SEND_H2C_PDU;
0625     req->offset = 0;
0626     req->pdu_len = min(req->h2cdata_left, queue->maxh2cdata);
0627     req->pdu_sent = 0;
0628     req->h2cdata_left -= req->pdu_len;
0629     req->h2cdata_offset += h2cdata_sent;
0630 
0631     memset(data, 0, sizeof(*data));
0632     data->hdr.type = nvme_tcp_h2c_data;
0633     if (!req->h2cdata_left)
0634         data->hdr.flags = NVME_TCP_F_DATA_LAST;
0635     if (queue->hdr_digest)
0636         data->hdr.flags |= NVME_TCP_F_HDGST;
0637     if (queue->data_digest)
0638         data->hdr.flags |= NVME_TCP_F_DDGST;
0639     data->hdr.hlen = sizeof(*data);
0640     data->hdr.pdo = data->hdr.hlen + hdgst;
0641     data->hdr.plen =
0642         cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
0643     data->ttag = req->ttag;
0644     data->command_id = nvme_cid(rq);
0645     data->data_offset = cpu_to_le32(req->h2cdata_offset);
0646     data->data_length = cpu_to_le32(req->pdu_len);
0647 }
0648 
0649 static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
0650         struct nvme_tcp_r2t_pdu *pdu)
0651 {
0652     struct nvme_tcp_request *req;
0653     struct request *rq;
0654     u32 r2t_length = le32_to_cpu(pdu->r2t_length);
0655     u32 r2t_offset = le32_to_cpu(pdu->r2t_offset);
0656 
0657     rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
0658     if (!rq) {
0659         dev_err(queue->ctrl->ctrl.device,
0660             "got bad r2t.command_id %#x on queue %d\n",
0661             pdu->command_id, nvme_tcp_queue_id(queue));
0662         return -ENOENT;
0663     }
0664     req = blk_mq_rq_to_pdu(rq);
0665 
0666     if (unlikely(!r2t_length)) {
0667         dev_err(queue->ctrl->ctrl.device,
0668             "req %d r2t len is %u, probably a bug...\n",
0669             rq->tag, r2t_length);
0670         return -EPROTO;
0671     }
0672 
0673     if (unlikely(req->data_sent + r2t_length > req->data_len)) {
0674         dev_err(queue->ctrl->ctrl.device,
0675             "req %d r2t len %u exceeded data len %u (%zu sent)\n",
0676             rq->tag, r2t_length, req->data_len, req->data_sent);
0677         return -EPROTO;
0678     }
0679 
0680     if (unlikely(r2t_offset < req->data_sent)) {
0681         dev_err(queue->ctrl->ctrl.device,
0682             "req %d unexpected r2t offset %u (expected %zu)\n",
0683             rq->tag, r2t_offset, req->data_sent);
0684         return -EPROTO;
0685     }
0686 
0687     req->pdu_len = 0;
0688     req->h2cdata_left = r2t_length;
0689     req->h2cdata_offset = r2t_offset;
0690     req->ttag = pdu->ttag;
0691 
0692     nvme_tcp_setup_h2c_data_pdu(req);
0693     nvme_tcp_queue_request(req, false, true);
0694 
0695     return 0;
0696 }
0697 
0698 static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
0699         unsigned int *offset, size_t *len)
0700 {
0701     struct nvme_tcp_hdr *hdr;
0702     char *pdu = queue->pdu;
0703     size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
0704     int ret;
0705 
0706     ret = skb_copy_bits(skb, *offset,
0707         &pdu[queue->pdu_offset], rcv_len);
0708     if (unlikely(ret))
0709         return ret;
0710 
0711     queue->pdu_remaining -= rcv_len;
0712     queue->pdu_offset += rcv_len;
0713     *offset += rcv_len;
0714     *len -= rcv_len;
0715     if (queue->pdu_remaining)
0716         return 0;
0717 
0718     hdr = queue->pdu;
0719     if (queue->hdr_digest) {
0720         ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
0721         if (unlikely(ret))
0722             return ret;
0723     }
0724 
0725 
0726     if (queue->data_digest) {
0727         ret = nvme_tcp_check_ddgst(queue, queue->pdu);
0728         if (unlikely(ret))
0729             return ret;
0730     }
0731 
0732     switch (hdr->type) {
0733     case nvme_tcp_c2h_data:
0734         return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
0735     case nvme_tcp_rsp:
0736         nvme_tcp_init_recv_ctx(queue);
0737         return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
0738     case nvme_tcp_r2t:
0739         nvme_tcp_init_recv_ctx(queue);
0740         return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
0741     default:
0742         dev_err(queue->ctrl->ctrl.device,
0743             "unsupported pdu type (%d)\n", hdr->type);
0744         return -EINVAL;
0745     }
0746 }
0747 
0748 static inline void nvme_tcp_end_request(struct request *rq, u16 status)
0749 {
0750     union nvme_result res = {};
0751 
0752     if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
0753         nvme_complete_rq(rq);
0754 }
0755 
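/* Copy C2HData payload straight from the skb into the request's bio pages
 * through the pre-built iov_iter.  When data digest is enabled the CRC32C
 * is accumulated in the same pass (skb_copy_and_hash_datagram_iter) and
 * checked later in nvme_tcp_recv_ddgst().
 */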
0756 static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
0757                   unsigned int *offset, size_t *len)
0758 {
0759     struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
0760     struct request *rq =
0761         nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
0762     struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
0763 
0764     while (true) {
0765         int recv_len, ret;
0766 
0767         recv_len = min_t(size_t, *len, queue->data_remaining);
0768         if (!recv_len)
0769             break;
0770 
0771         if (!iov_iter_count(&req->iter)) {
0772             req->curr_bio = req->curr_bio->bi_next;
0773 
0774             /*
0775              * If we don't have any more bios, it means that the
0776              * controller sent more data than we requested, hence error
0777              */
0778             if (!req->curr_bio) {
0779                 dev_err(queue->ctrl->ctrl.device,
0780                     "queue %d no space in request %#x",
0781                     nvme_tcp_queue_id(queue), rq->tag);
0782                 nvme_tcp_init_recv_ctx(queue);
0783                 return -EIO;
0784             }
0785             nvme_tcp_init_iter(req, READ);
0786         }
0787 
0788         /* we can read only from what is left in this bio */
0789         recv_len = min_t(size_t, recv_len,
0790                 iov_iter_count(&req->iter));
0791 
0792         if (queue->data_digest)
0793             ret = skb_copy_and_hash_datagram_iter(skb, *offset,
0794                 &req->iter, recv_len, queue->rcv_hash);
0795         else
0796             ret = skb_copy_datagram_iter(skb, *offset,
0797                     &req->iter, recv_len);
0798         if (ret) {
0799             dev_err(queue->ctrl->ctrl.device,
0800                 "queue %d failed to copy request %#x data",
0801                 nvme_tcp_queue_id(queue), rq->tag);
0802             return ret;
0803         }
0804 
0805         *len -= recv_len;
0806         *offset += recv_len;
0807         queue->data_remaining -= recv_len;
0808     }
0809 
0810     if (!queue->data_remaining) {
0811         if (queue->data_digest) {
0812             nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
0813             queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
0814         } else {
0815             if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
0816                 nvme_tcp_end_request(rq,
0817                         le16_to_cpu(req->status));
0818                 queue->nr_cqe++;
0819             }
0820             nvme_tcp_init_recv_ctx(queue);
0821         }
0822     }
0823 
0824     return 0;
0825 }
0826 
0827 static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
0828         struct sk_buff *skb, unsigned int *offset, size_t *len)
0829 {
0830     struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
0831     char *ddgst = (char *)&queue->recv_ddgst;
0832     size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
0833     off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
0834     int ret;
0835 
0836     ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
0837     if (unlikely(ret))
0838         return ret;
0839 
0840     queue->ddgst_remaining -= recv_len;
0841     *offset += recv_len;
0842     *len -= recv_len;
0843     if (queue->ddgst_remaining)
0844         return 0;
0845 
0846     if (queue->recv_ddgst != queue->exp_ddgst) {
0847         struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
0848                     pdu->command_id);
0849         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
0850 
0851         req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR);
0852 
0853         dev_err(queue->ctrl->ctrl.device,
0854             "data digest error: recv %#x expected %#x\n",
0855             le32_to_cpu(queue->recv_ddgst),
0856             le32_to_cpu(queue->exp_ddgst));
0857     }
0858 
0859     if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
0860         struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
0861                     pdu->command_id);
0862         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
0863 
0864         nvme_tcp_end_request(rq, le16_to_cpu(req->status));
0865         queue->nr_cqe++;
0866     }
0867 
0868     nvme_tcp_init_recv_ctx(queue);
0869     return 0;
0870 }
0871 
0872 static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
0873                  unsigned int offset, size_t len)
0874 {
0875     struct nvme_tcp_queue *queue = desc->arg.data;
0876     size_t consumed = len;
0877     int result;
0878 
0879     while (len) {
0880         switch (nvme_tcp_recv_state(queue)) {
0881         case NVME_TCP_RECV_PDU:
0882             result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
0883             break;
0884         case NVME_TCP_RECV_DATA:
0885             result = nvme_tcp_recv_data(queue, skb, &offset, &len);
0886             break;
0887         case NVME_TCP_RECV_DDGST:
0888             result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
0889             break;
0890         default:
0891             result = -EFAULT;
0892         }
0893         if (result) {
0894             dev_err(queue->ctrl->ctrl.device,
0895                 "receive failed:  %d\n", result);
0896             queue->rd_enabled = false;
0897             nvme_tcp_error_recovery(&queue->ctrl->ctrl);
0898             return result;
0899         }
0900     }
0901 
0902     return consumed;
0903 }
0904 
0905 static void nvme_tcp_data_ready(struct sock *sk)
0906 {
0907     struct nvme_tcp_queue *queue;
0908 
0909     read_lock_bh(&sk->sk_callback_lock);
0910     queue = sk->sk_user_data;
0911     if (likely(queue && queue->rd_enabled) &&
0912         !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
0913         queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
0914     read_unlock_bh(&sk->sk_callback_lock);
0915 }
0916 
0917 static void nvme_tcp_write_space(struct sock *sk)
0918 {
0919     struct nvme_tcp_queue *queue;
0920 
0921     read_lock_bh(&sk->sk_callback_lock);
0922     queue = sk->sk_user_data;
0923     if (likely(queue && sk_stream_is_writeable(sk))) {
0924         clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
0925         queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
0926     }
0927     read_unlock_bh(&sk->sk_callback_lock);
0928 }
0929 
0930 static void nvme_tcp_state_change(struct sock *sk)
0931 {
0932     struct nvme_tcp_queue *queue;
0933 
0934     read_lock_bh(&sk->sk_callback_lock);
0935     queue = sk->sk_user_data;
0936     if (!queue)
0937         goto done;
0938 
0939     switch (sk->sk_state) {
0940     case TCP_CLOSE:
0941     case TCP_CLOSE_WAIT:
0942     case TCP_LAST_ACK:
0943     case TCP_FIN_WAIT1:
0944     case TCP_FIN_WAIT2:
0945         nvme_tcp_error_recovery(&queue->ctrl->ctrl);
0946         break;
0947     default:
0948         dev_info(queue->ctrl->ctrl.device,
0949             "queue %d socket state %d\n",
0950             nvme_tcp_queue_id(queue), sk->sk_state);
0951     }
0952 
0953     queue->state_change(sk);
0954 done:
0955     read_unlock_bh(&sk->sk_callback_lock);
0956 }
0957 
0958 static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
0959 {
0960     queue->request = NULL;
0961 }
0962 
0963 static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
0964 {
0965     if (nvme_tcp_async_req(req)) {
0966         union nvme_result res = {};
0967 
0968         nvme_complete_async_event(&req->queue->ctrl->ctrl,
0969                 cpu_to_le16(NVME_SC_HOST_PATH_ERROR), &res);
0970     } else {
0971         nvme_tcp_end_request(blk_mq_rq_from_pdu(req),
0972                 NVME_SC_HOST_PATH_ERROR);
0973     }
0974 }
0975 
0976 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
0977 {
0978     struct nvme_tcp_queue *queue = req->queue;
0979     int req_data_len = req->data_len;
0980     u32 h2cdata_left = req->h2cdata_left;
0981 
0982     while (true) {
0983         struct page *page = nvme_tcp_req_cur_page(req);
0984         size_t offset = nvme_tcp_req_cur_offset(req);
0985         size_t len = nvme_tcp_req_cur_length(req);
0986         bool last = nvme_tcp_pdu_last_send(req, len);
0987         int req_data_sent = req->data_sent;
0988         int ret, flags = MSG_DONTWAIT;
0989 
0990         if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
0991             flags |= MSG_EOR;
0992         else
0993             flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
0994 
0995         if (sendpage_ok(page)) {
0996             ret = kernel_sendpage(queue->sock, page, offset, len,
0997                     flags);
0998         } else {
0999             ret = sock_no_sendpage(queue->sock, page, offset, len,
1000                     flags);
1001         }
1002         if (ret <= 0)
1003             return ret;
1004 
1005         if (queue->data_digest)
1006             nvme_tcp_ddgst_update(queue->snd_hash, page,
1007                     offset, ret);
1008 
1009         /*
1010          * Update the request iterator except for the last payload send
1011          * in the request, where we don't want to modify it as we may
1012          * compete with the RX path completing the request.
1013          */
1014         if (req_data_sent + ret < req_data_len)
1015             nvme_tcp_advance_req(req, ret);
1016 
1017         /* fully successful last send in current PDU */
1018         if (last && ret == len) {
1019             if (queue->data_digest) {
1020                 nvme_tcp_ddgst_final(queue->snd_hash,
1021                     &req->ddgst);
1022                 req->state = NVME_TCP_SEND_DDGST;
1023                 req->offset = 0;
1024             } else {
1025                 if (h2cdata_left)
1026                     nvme_tcp_setup_h2c_data_pdu(req);
1027                 else
1028                     nvme_tcp_done_send_req(queue);
1029             }
1030             return 1;
1031         }
1032     }
1033     return -EAGAIN;
1034 }
1035 
1036 static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
1037 {
1038     struct nvme_tcp_queue *queue = req->queue;
1039     struct nvme_tcp_cmd_pdu *pdu = req->pdu;
1040     bool inline_data = nvme_tcp_has_inline_data(req);
1041     u8 hdgst = nvme_tcp_hdgst_len(queue);
1042     int len = sizeof(*pdu) + hdgst - req->offset;
1043     int flags = MSG_DONTWAIT;
1044     int ret;
1045 
1046     if (inline_data || nvme_tcp_queue_more(queue))
1047         flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
1048     else
1049         flags |= MSG_EOR;
1050 
1051     if (queue->hdr_digest && !req->offset)
1052         nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
1053 
1054     ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
1055             offset_in_page(pdu) + req->offset, len,  flags);
1056     if (unlikely(ret <= 0))
1057         return ret;
1058 
1059     len -= ret;
1060     if (!len) {
1061         if (inline_data) {
1062             req->state = NVME_TCP_SEND_DATA;
1063             if (queue->data_digest)
1064                 crypto_ahash_init(queue->snd_hash);
1065         } else {
1066             nvme_tcp_done_send_req(queue);
1067         }
1068         return 1;
1069     }
1070     req->offset += ret;
1071 
1072     return -EAGAIN;
1073 }
1074 
1075 static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
1076 {
1077     struct nvme_tcp_queue *queue = req->queue;
1078     struct nvme_tcp_data_pdu *pdu = req->pdu;
1079     u8 hdgst = nvme_tcp_hdgst_len(queue);
1080     int len = sizeof(*pdu) - req->offset + hdgst;
1081     int ret;
1082 
1083     if (queue->hdr_digest && !req->offset)
1084         nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
1085 
1086     if (!req->h2cdata_left)
1087         ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
1088                 offset_in_page(pdu) + req->offset, len,
1089                 MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
1090     else
1091         ret = sock_no_sendpage(queue->sock, virt_to_page(pdu),
1092                 offset_in_page(pdu) + req->offset, len,
1093                 MSG_DONTWAIT | MSG_MORE);
1094     if (unlikely(ret <= 0))
1095         return ret;
1096 
1097     len -= ret;
1098     if (!len) {
1099         req->state = NVME_TCP_SEND_DATA;
1100         if (queue->data_digest)
1101             crypto_ahash_init(queue->snd_hash);
1102         return 1;
1103     }
1104     req->offset += ret;
1105 
1106     return -EAGAIN;
1107 }
1108 
1109 static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
1110 {
1111     struct nvme_tcp_queue *queue = req->queue;
1112     size_t offset = req->offset;
1113     u32 h2cdata_left = req->h2cdata_left;
1114     int ret;
1115     struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1116     struct kvec iov = {
1117         .iov_base = (u8 *)&req->ddgst + req->offset,
1118         .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
1119     };
1120 
1121     if (nvme_tcp_queue_more(queue))
1122         msg.msg_flags |= MSG_MORE;
1123     else
1124         msg.msg_flags |= MSG_EOR;
1125 
1126     ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1127     if (unlikely(ret <= 0))
1128         return ret;
1129 
1130     if (offset + ret == NVME_TCP_DIGEST_LENGTH) {
1131         if (h2cdata_left)
1132             nvme_tcp_setup_h2c_data_pdu(req);
1133         else
1134             nvme_tcp_done_send_req(queue);
1135         return 1;
1136     }
1137 
1138     req->offset += ret;
1139     return -EAGAIN;
1140 }
1141 
1142 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
1143 {
1144     struct nvme_tcp_request *req;
1145     int ret = 1;
1146 
1147     if (!queue->request) {
1148         queue->request = nvme_tcp_fetch_request(queue);
1149         if (!queue->request)
1150             return 0;
1151     }
1152     req = queue->request;
1153 
1154     if (req->state == NVME_TCP_SEND_CMD_PDU) {
1155         ret = nvme_tcp_try_send_cmd_pdu(req);
1156         if (ret <= 0)
1157             goto done;
1158         if (!nvme_tcp_has_inline_data(req))
1159             return ret;
1160     }
1161 
1162     if (req->state == NVME_TCP_SEND_H2C_PDU) {
1163         ret = nvme_tcp_try_send_data_pdu(req);
1164         if (ret <= 0)
1165             goto done;
1166     }
1167 
1168     if (req->state == NVME_TCP_SEND_DATA) {
1169         ret = nvme_tcp_try_send_data(req);
1170         if (ret <= 0)
1171             goto done;
1172     }
1173 
1174     if (req->state == NVME_TCP_SEND_DDGST)
1175         ret = nvme_tcp_try_send_ddgst(req);
1176 done:
1177     if (ret == -EAGAIN) {
1178         ret = 0;
1179     } else if (ret < 0) {
1180         dev_err(queue->ctrl->ctrl.device,
1181             "failed to send request %d\n", ret);
1182         nvme_tcp_fail_request(queue->request);
1183         nvme_tcp_done_send_req(queue);
1184     }
1185     return ret;
1186 }
1187 
1188 static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
1189 {
1190     struct socket *sock = queue->sock;
1191     struct sock *sk = sock->sk;
1192     read_descriptor_t rd_desc;
1193     int consumed;
1194 
1195     rd_desc.arg.data = queue;
1196     rd_desc.count = 1;
1197     lock_sock(sk);
1198     queue->nr_cqe = 0;
1199     consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1200     release_sock(sk);
1201     return consumed;
1202 }
1203 
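/* Queue I/O context: alternate between sending and receiving for roughly
 * one millisecond per invocation.  If the budget expires with work still
 * pending, the work item is requeued on the queue's io_cpu instead of
 * looping forever; if nothing is pending (or receive got disabled) it
 * simply returns.
 */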
1204 static void nvme_tcp_io_work(struct work_struct *w)
1205 {
1206     struct nvme_tcp_queue *queue =
1207         container_of(w, struct nvme_tcp_queue, io_work);
1208     unsigned long deadline = jiffies + msecs_to_jiffies(1);
1209 
1210     do {
1211         bool pending = false;
1212         int result;
1213 
1214         if (mutex_trylock(&queue->send_mutex)) {
1215             result = nvme_tcp_try_send(queue);
1216             mutex_unlock(&queue->send_mutex);
1217             if (result > 0)
1218                 pending = true;
1219             else if (unlikely(result < 0))
1220                 break;
1221         }
1222 
1223         result = nvme_tcp_try_recv(queue);
1224         if (result > 0)
1225             pending = true;
1226         else if (unlikely(result < 0))
1227             return;
1228 
1229         if (!pending || !queue->rd_enabled)
1230             return;
1231 
1232     } while (!time_after(jiffies, deadline)); /* quota is exhausted */
1233 
1234     queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1235 }
1236 
1237 static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1238 {
1239     struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1240 
1241     ahash_request_free(queue->rcv_hash);
1242     ahash_request_free(queue->snd_hash);
1243     crypto_free_ahash(tfm);
1244 }
1245 
1246 static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1247 {
1248     struct crypto_ahash *tfm;
1249 
1250     tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1251     if (IS_ERR(tfm))
1252         return PTR_ERR(tfm);
1253 
1254     queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1255     if (!queue->snd_hash)
1256         goto free_tfm;
1257     ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1258 
1259     queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1260     if (!queue->rcv_hash)
1261         goto free_snd_hash;
1262     ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1263 
1264     return 0;
1265 free_snd_hash:
1266     ahash_request_free(queue->snd_hash);
1267 free_tfm:
1268     crypto_free_ahash(tfm);
1269     return -ENOMEM;
1270 }
1271 
1272 static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1273 {
1274     struct nvme_tcp_request *async = &ctrl->async_req;
1275 
1276     page_frag_free(async->pdu);
1277 }
1278 
1279 static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1280 {
1281     struct nvme_tcp_queue *queue = &ctrl->queues[0];
1282     struct nvme_tcp_request *async = &ctrl->async_req;
1283     u8 hdgst = nvme_tcp_hdgst_len(queue);
1284 
1285     async->pdu = page_frag_alloc(&queue->pf_cache,
1286         sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1287         GFP_KERNEL | __GFP_ZERO);
1288     if (!async->pdu)
1289         return -ENOMEM;
1290 
1291     async->queue = &ctrl->queues[0];
1292     return 0;
1293 }
1294 
1295 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1296 {
1297     struct page *page;
1298     struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1299     struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1300 
1301     if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1302         return;
1303 
1304     if (queue->hdr_digest || queue->data_digest)
1305         nvme_tcp_free_crypto(queue);
1306 
1307     if (queue->pf_cache.va) {
1308         page = virt_to_head_page(queue->pf_cache.va);
1309         __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
1310         queue->pf_cache.va = NULL;
1311     }
1312     sock_release(queue->sock);
1313     kfree(queue->pdu);
1314     mutex_destroy(&queue->send_mutex);
1315     mutex_destroy(&queue->queue_lock);
1316 }
1317 
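/* NVMe/TCP connection initialization: after the TCP connect the host sends
 * an ICReq advertising PFV 1.0, maxr2t = 0 (a single outstanding R2T), no
 * PDU alignment and the requested digests.  The controller must answer
 * with an ICResp of the same PFV, matching digest selection, cpda = 0 and
 * a MAXH2CDATA that is a multiple of 4 and at least
 * NVME_TCP_MIN_MAXH2CDATA; anything else fails queue setup.
 */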
1318 static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1319 {
1320     struct nvme_tcp_icreq_pdu *icreq;
1321     struct nvme_tcp_icresp_pdu *icresp;
1322     struct msghdr msg = {};
1323     struct kvec iov;
1324     bool ctrl_hdgst, ctrl_ddgst;
1325     u32 maxh2cdata;
1326     int ret;
1327 
1328     icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1329     if (!icreq)
1330         return -ENOMEM;
1331 
1332     icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1333     if (!icresp) {
1334         ret = -ENOMEM;
1335         goto free_icreq;
1336     }
1337 
1338     icreq->hdr.type = nvme_tcp_icreq;
1339     icreq->hdr.hlen = sizeof(*icreq);
1340     icreq->hdr.pdo = 0;
1341     icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1342     icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1343     icreq->maxr2t = 0; /* single inflight r2t supported */
1344     icreq->hpda = 0; /* no alignment constraint */
1345     if (queue->hdr_digest)
1346         icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1347     if (queue->data_digest)
1348         icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1349 
1350     iov.iov_base = icreq;
1351     iov.iov_len = sizeof(*icreq);
1352     ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1353     if (ret < 0)
1354         goto free_icresp;
1355 
1356     memset(&msg, 0, sizeof(msg));
1357     iov.iov_base = icresp;
1358     iov.iov_len = sizeof(*icresp);
1359     ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1360             iov.iov_len, msg.msg_flags);
1361     if (ret < 0)
1362         goto free_icresp;
1363 
1364     ret = -EINVAL;
1365     if (icresp->hdr.type != nvme_tcp_icresp) {
1366         pr_err("queue %d: bad type returned %d\n",
1367             nvme_tcp_queue_id(queue), icresp->hdr.type);
1368         goto free_icresp;
1369     }
1370 
1371     if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1372         pr_err("queue %d: bad pdu length returned %d\n",
1373             nvme_tcp_queue_id(queue), icresp->hdr.plen);
1374         goto free_icresp;
1375     }
1376 
1377     if (icresp->pfv != NVME_TCP_PFV_1_0) {
1378         pr_err("queue %d: bad pfv returned %d\n",
1379             nvme_tcp_queue_id(queue), icresp->pfv);
1380         goto free_icresp;
1381     }
1382 
1383     ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1384     if ((queue->data_digest && !ctrl_ddgst) ||
1385         (!queue->data_digest && ctrl_ddgst)) {
1386         pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1387             nvme_tcp_queue_id(queue),
1388             queue->data_digest ? "enabled" : "disabled",
1389             ctrl_ddgst ? "enabled" : "disabled");
1390         goto free_icresp;
1391     }
1392 
1393     ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1394     if ((queue->hdr_digest && !ctrl_hdgst) ||
1395         (!queue->hdr_digest && ctrl_hdgst)) {
1396         pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1397             nvme_tcp_queue_id(queue),
1398             queue->hdr_digest ? "enabled" : "disabled",
1399             ctrl_hdgst ? "enabled" : "disabled");
1400         goto free_icresp;
1401     }
1402 
1403     if (icresp->cpda != 0) {
1404         pr_err("queue %d: unsupported cpda returned %d\n",
1405             nvme_tcp_queue_id(queue), icresp->cpda);
1406         goto free_icresp;
1407     }
1408 
1409     maxh2cdata = le32_to_cpu(icresp->maxdata);
1410     if ((maxh2cdata % 4) || (maxh2cdata < NVME_TCP_MIN_MAXH2CDATA)) {
1411         pr_err("queue %d: invalid maxh2cdata returned %u\n",
1412                nvme_tcp_queue_id(queue), maxh2cdata);
1413         goto free_icresp;
1414     }
1415     queue->maxh2cdata = maxh2cdata;
1416 
1417     ret = 0;
1418 free_icresp:
1419     kfree(icresp);
1420 free_icreq:
1421     kfree(icreq);
1422     return ret;
1423 }
1424 
1425 static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
1426 {
1427     return nvme_tcp_queue_id(queue) == 0;
1428 }
1429 
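/* Queue ids are laid out as qid 0 = admin, followed by the
 * HCTX_TYPE_DEFAULT, HCTX_TYPE_READ and HCTX_TYPE_POLL I/O queues in that
 * order.  E.g. (illustrative) with io_queues = {4, 2, 2}, qids 1-4 are
 * default, 5-6 are read and 7-8 are poll queues.
 */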
1430 static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
1431 {
1432     struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1433     int qid = nvme_tcp_queue_id(queue);
1434 
1435     return !nvme_tcp_admin_queue(queue) &&
1436         qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
1437 }
1438 
1439 static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
1440 {
1441     struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1442     int qid = nvme_tcp_queue_id(queue);
1443 
1444     return !nvme_tcp_admin_queue(queue) &&
1445         !nvme_tcp_default_queue(queue) &&
1446         qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1447               ctrl->io_queues[HCTX_TYPE_READ];
1448 }
1449 
1450 static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
1451 {
1452     struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1453     int qid = nvme_tcp_queue_id(queue);
1454 
1455     return !nvme_tcp_admin_queue(queue) &&
1456         !nvme_tcp_default_queue(queue) &&
1457         !nvme_tcp_read_queue(queue) &&
1458         qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1459               ctrl->io_queues[HCTX_TYPE_READ] +
1460               ctrl->io_queues[HCTX_TYPE_POLL];
1461 }
1462 
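/* Spread queues of the same type round-robin over the online CPUs; 'n' is
 * the queue's index within its type.  E.g. (illustrative) with 4 default
 * queues on a 2-CPU system, default qids 1-4 run their io_work on CPUs
 * 0, 1, 0, 1.
 */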
1463 static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
1464 {
1465     struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1466     int qid = nvme_tcp_queue_id(queue);
1467     int n = 0;
1468 
1469     if (nvme_tcp_default_queue(queue))
1470         n = qid - 1;
1471     else if (nvme_tcp_read_queue(queue))
1472         n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
1473     else if (nvme_tcp_poll_queue(queue))
1474         n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
1475                 ctrl->io_queues[HCTX_TYPE_READ] - 1;
1476     queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
1477 }
1478 
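/* Allocate and connect a single queue: create the TCP socket, apply the
 * socket options below (single SYN retry, TCP_NODELAY, linger off,
 * optional priority and TOS), optionally bind to a source address or
 * interface, connect, run the ICReq/ICResp handshake and finally install
 * the sk_data_ready/sk_state_change/sk_write_space callbacks.
 */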
1479 static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
1480         int qid, size_t queue_size)
1481 {
1482     struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1483     struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1484     int ret, rcv_pdu_size;
1485 
1486     mutex_init(&queue->queue_lock);
1487     queue->ctrl = ctrl;
1488     init_llist_head(&queue->req_list);
1489     INIT_LIST_HEAD(&queue->send_list);
1490     mutex_init(&queue->send_mutex);
1491     INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1492     queue->queue_size = queue_size;
1493 
1494     if (qid > 0)
1495         queue->cmnd_capsule_len = nctrl->ioccsz * 16;
1496     else
1497         queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1498                         NVME_TCP_ADMIN_CCSZ;
1499 
1500     ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1501             IPPROTO_TCP, &queue->sock);
1502     if (ret) {
1503         dev_err(nctrl->device,
1504             "failed to create socket: %d\n", ret);
1505         goto err_destroy_mutex;
1506     }
1507 
1508     nvme_tcp_reclassify_socket(queue->sock);
1509 
1510     /* Single syn retry */
1511     tcp_sock_set_syncnt(queue->sock->sk, 1);
1512 
1513     /* Set TCP no delay */
1514     tcp_sock_set_nodelay(queue->sock->sk);
1515 
1516     /*
1517      * Cleanup whatever is sitting in the TCP transmit queue on socket
1518      * close. This is done to prevent stale data from being sent should
1519      * the network connection be restored before TCP times out.
1520      */
1521     sock_no_linger(queue->sock->sk);
1522 
1523     if (so_priority > 0)
1524         sock_set_priority(queue->sock->sk, so_priority);
1525 
1526     /* Set socket type of service */
1527     if (nctrl->opts->tos >= 0)
1528         ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
1529 
1530     /* Set 10 seconds timeout for icresp recvmsg */
1531     queue->sock->sk->sk_rcvtimeo = 10 * HZ;
1532 
1533     queue->sock->sk->sk_allocation = GFP_ATOMIC;
1534     nvme_tcp_set_queue_io_cpu(queue);
1535     queue->request = NULL;
1536     queue->data_remaining = 0;
1537     queue->ddgst_remaining = 0;
1538     queue->pdu_remaining = 0;
1539     queue->pdu_offset = 0;
1540     sk_set_memalloc(queue->sock->sk);
1541 
1542     if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
1543         ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1544             sizeof(ctrl->src_addr));
1545         if (ret) {
1546             dev_err(nctrl->device,
1547                 "failed to bind queue %d socket %d\n",
1548                 qid, ret);
1549             goto err_sock;
1550         }
1551     }
1552 
1553     if (nctrl->opts->mask & NVMF_OPT_HOST_IFACE) {
1554         char *iface = nctrl->opts->host_iface;
1555         sockptr_t optval = KERNEL_SOCKPTR(iface);
1556 
1557         ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE,
1558                       optval, strlen(iface));
1559         if (ret) {
1560             dev_err(nctrl->device,
1561               "failed to bind to interface %s queue %d err %d\n",
1562               iface, qid, ret);
1563             goto err_sock;
1564         }
1565     }
1566 
1567     queue->hdr_digest = nctrl->opts->hdr_digest;
1568     queue->data_digest = nctrl->opts->data_digest;
1569     if (queue->hdr_digest || queue->data_digest) {
1570         ret = nvme_tcp_alloc_crypto(queue);
1571         if (ret) {
1572             dev_err(nctrl->device,
1573                 "failed to allocate queue %d crypto\n", qid);
1574             goto err_sock;
1575         }
1576     }
1577 
1578     rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1579             nvme_tcp_hdgst_len(queue);
1580     queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1581     if (!queue->pdu) {
1582         ret = -ENOMEM;
1583         goto err_crypto;
1584     }
1585 
1586     dev_dbg(nctrl->device, "connecting queue %d\n",
1587             nvme_tcp_queue_id(queue));
1588 
1589     ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1590         sizeof(ctrl->addr), 0);
1591     if (ret) {
1592         dev_err(nctrl->device,
1593             "failed to connect socket: %d\n", ret);
1594         goto err_rcv_pdu;
1595     }
1596 
1597     ret = nvme_tcp_init_connection(queue);
1598     if (ret)
1599         goto err_init_connect;
1600 
1601     queue->rd_enabled = true;
1602     set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1603     nvme_tcp_init_recv_ctx(queue);
1604 
1605     write_lock_bh(&queue->sock->sk->sk_callback_lock);
1606     queue->sock->sk->sk_user_data = queue;
1607     queue->state_change = queue->sock->sk->sk_state_change;
1608     queue->data_ready = queue->sock->sk->sk_data_ready;
1609     queue->write_space = queue->sock->sk->sk_write_space;
1610     queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1611     queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1612     queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1613 #ifdef CONFIG_NET_RX_BUSY_POLL
1614     queue->sock->sk->sk_ll_usec = 1;
1615 #endif
1616     write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1617 
1618     return 0;
1619 
1620 err_init_connect:
1621     kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1622 err_rcv_pdu:
1623     kfree(queue->pdu);
1624 err_crypto:
1625     if (queue->hdr_digest || queue->data_digest)
1626         nvme_tcp_free_crypto(queue);
1627 err_sock:
1628     sock_release(queue->sock);
1629     queue->sock = NULL;
1630 err_destroy_mutex:
1631     mutex_destroy(&queue->send_mutex);
1632     mutex_destroy(&queue->queue_lock);
1633     return ret;
1634 }
1635 
1636 static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
1637 {
1638     struct socket *sock = queue->sock;
1639 
1640     write_lock_bh(&sock->sk->sk_callback_lock);
1641     sock->sk->sk_user_data  = NULL;
1642     sock->sk->sk_data_ready = queue->data_ready;
1643     sock->sk->sk_state_change = queue->state_change;
1644     sock->sk->sk_write_space  = queue->write_space;
1645     write_unlock_bh(&sock->sk->sk_callback_lock);
1646 }
1647 
1648 static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1649 {
1650     kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1651     nvme_tcp_restore_sock_calls(queue);
1652     cancel_work_sync(&queue->io_work);
1653 }
1654 
1655 static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1656 {
1657     struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1658     struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1659 
1660     if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1661         return;
1662 
1663     mutex_lock(&queue->queue_lock);
1664     if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1665         __nvme_tcp_stop_queue(queue);
1666     mutex_unlock(&queue->queue_lock);
1667 }
1668 
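     /*
      * Send the fabrics Connect command for the admin queue (idx 0) or an
      * I/O queue and mark the queue LIVE on success; on failure an
      * allocated queue is stopped again.
      */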
1669 static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1670 {
1671     struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1672     int ret;
1673 
1674     if (idx)
1675         ret = nvmf_connect_io_queue(nctrl, idx);
1676     else
1677         ret = nvmf_connect_admin_queue(nctrl);
1678 
1679     if (!ret) {
1680         set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
1681     } else {
1682         if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
1683             __nvme_tcp_stop_queue(&ctrl->queues[idx]);
1684         dev_err(nctrl->device,
1685             "failed to connect queue: %d ret=%d\n", idx, ret);
1686     }
1687     return ret;
1688 }
1689 
1690 static int nvme_tcp_alloc_admin_tag_set(struct nvme_ctrl *nctrl)
1691 {
1692     struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1693     struct blk_mq_tag_set *set = &ctrl->admin_tag_set;
1694     int ret;
1695 
1696     memset(set, 0, sizeof(*set));
1697     set->ops = &nvme_tcp_admin_mq_ops;
1698     set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1699     set->reserved_tags = NVMF_RESERVED_TAGS;
1700     set->numa_node = nctrl->numa_node;
1701     set->flags = BLK_MQ_F_BLOCKING;
1702     set->cmd_size = sizeof(struct nvme_tcp_request);
1703     set->driver_data = ctrl;
1704     set->nr_hw_queues = 1;
1705     set->timeout = NVME_ADMIN_TIMEOUT;
1706     ret = blk_mq_alloc_tag_set(set);
1707     if (!ret)
1708         nctrl->admin_tagset = set;
1709     return ret;
1710 }
1711 
1712 static int nvme_tcp_alloc_tag_set(struct nvme_ctrl *nctrl)
1713 {
1714     struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1715     struct blk_mq_tag_set *set = &ctrl->tag_set;
1716     int ret;
1717 
1718     memset(set, 0, sizeof(*set));
1719     set->ops = &nvme_tcp_mq_ops;
1720     set->queue_depth = nctrl->sqsize + 1;
1721     set->reserved_tags = NVMF_RESERVED_TAGS;
1722     set->numa_node = nctrl->numa_node;
1723     set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
1724     set->cmd_size = sizeof(struct nvme_tcp_request);
1725     set->driver_data = ctrl;
1726     set->nr_hw_queues = nctrl->queue_count - 1;
1727     set->timeout = NVME_IO_TIMEOUT;
1728     set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
1729     ret = blk_mq_alloc_tag_set(set);
1730     if (!ret)
1731         nctrl->tagset = set;
1732     return ret;
1733 }
1734 
1735 static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1736 {
1737     if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1738         cancel_work_sync(&ctrl->async_event_work);
1739         nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1740         to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1741     }
1742 
1743     nvme_tcp_free_queue(ctrl, 0);
1744 }
1745 
1746 static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1747 {
1748     int i;
1749 
1750     for (i = 1; i < ctrl->queue_count; i++)
1751         nvme_tcp_free_queue(ctrl, i);
1752 }
1753 
1754 static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1755 {
1756     int i;
1757 
1758     for (i = 1; i < ctrl->queue_count; i++)
1759         nvme_tcp_stop_queue(ctrl, i);
1760 }
1761 
1762 static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
1763 {
1764     int i, ret;
1765 
1766     for (i = 1; i < ctrl->queue_count; i++) {
1767         ret = nvme_tcp_start_queue(ctrl, i);
1768         if (ret)
1769             goto out_stop_queues;
1770     }
1771 
1772     return 0;
1773 
1774 out_stop_queues:
1775     for (i--; i >= 1; i--)
1776         nvme_tcp_stop_queue(ctrl, i);
1777     return ret;
1778 }
1779 
1780 static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1781 {
1782     int ret;
1783 
1784     ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
1785     if (ret)
1786         return ret;
1787 
1788     ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1789     if (ret)
1790         goto out_free_queue;
1791 
1792     return 0;
1793 
1794 out_free_queue:
1795     nvme_tcp_free_queue(ctrl, 0);
1796     return ret;
1797 }
1798 
1799 static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1800 {
1801     int i, ret;
1802 
1803     for (i = 1; i < ctrl->queue_count; i++) {
1804         ret = nvme_tcp_alloc_queue(ctrl, i, ctrl->sqsize + 1);
1805         if (ret)
1806             goto out_free_queues;
1807     }
1808 
1809     return 0;
1810 
1811 out_free_queues:
1812     for (i--; i >= 1; i--)
1813         nvme_tcp_free_queue(ctrl, i);
1814 
1815     return ret;
1816 }
1817 
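     /*
      * Number of I/O queues to request from the controller: the sum of the
      * default, write and poll queue counts, each capped at the number of
      * online CPUs.
      */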
1818 static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
1819 {
1820     unsigned int nr_io_queues;
1821 
1822     nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
1823     nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
1824     nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
1825 
1826     return nr_io_queues;
1827 }
1828 
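     /*
      * Distribute the I/O queues granted by the controller across the
      * HCTX_TYPE_DEFAULT, HCTX_TYPE_READ and HCTX_TYPE_POLL classes
      * according to the connect options.
      */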
1829 static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
1830         unsigned int nr_io_queues)
1831 {
1832     struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1833     struct nvmf_ctrl_options *opts = nctrl->opts;
1834 
1835     if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
1836         /*
1837          * separate read/write queues
1838          * hand out dedicated default queues only after we have
1839          * sufficient read queues.
1840          */
1841         ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
1842         nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
1843         ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1844             min(opts->nr_write_queues, nr_io_queues);
1845         nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1846     } else {
1847         /*
1848          * shared read/write queues
1849          * either no write queues were requested, or we don't have
1850          * sufficient queue count to have dedicated default queues.
1851          */
1852         ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1853             min(opts->nr_io_queues, nr_io_queues);
1854         nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1855     }
1856 
1857     if (opts->nr_poll_queues && nr_io_queues) {
1858         /* map dedicated poll queues only if we have queues left */
1859         ctrl->io_queues[HCTX_TYPE_POLL] =
1860             min(opts->nr_poll_queues, nr_io_queues);
1861     }
1862 }
1863 
1864 static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1865 {
1866     unsigned int nr_io_queues;
1867     int ret;
1868 
1869     nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
1870     ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1871     if (ret)
1872         return ret;
1873 
1874     if (nr_io_queues == 0) {
1875         dev_err(ctrl->device,
1876             "unable to set any I/O queues\n");
1877         return -ENOMEM;
1878     }
1879 
1880     ctrl->queue_count = nr_io_queues + 1;
1881     dev_info(ctrl->device,
1882         "creating %d I/O queues.\n", nr_io_queues);
1883 
1884     nvme_tcp_set_io_queues(ctrl, nr_io_queues);
1885 
1886     return __nvme_tcp_alloc_io_queues(ctrl);
1887 }
1888 
1889 static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
1890 {
1891     nvme_tcp_stop_io_queues(ctrl);
1892     if (remove) {
1893         blk_mq_destroy_queue(ctrl->connect_q);
1894         blk_mq_free_tag_set(ctrl->tagset);
1895     }
1896     nvme_tcp_free_io_queues(ctrl);
1897 }
1898 
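     /*
      * Allocate and start the I/O queues.  For a new controller this also
      * creates the I/O tag set and connect_q; on reconnect the queues are
      * unquiesced and nr_hw_queues is updated once the queue freeze
      * completes.
      */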
1899 static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1900 {
1901     int ret;
1902 
1903     ret = nvme_tcp_alloc_io_queues(ctrl);
1904     if (ret)
1905         return ret;
1906 
1907     if (new) {
1908         ret = nvme_tcp_alloc_tag_set(ctrl);
1909         if (ret)
1910             goto out_free_io_queues;
1911 
1912         ret = nvme_ctrl_init_connect_q(ctrl);
1913         if (ret)
1914             goto out_free_tag_set;
1915     }
1916 
1917     ret = nvme_tcp_start_io_queues(ctrl);
1918     if (ret)
1919         goto out_cleanup_connect_q;
1920 
1921     if (!new) {
1922         nvme_start_queues(ctrl);
1923         if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
1924             /*
1925              * If we timed out waiting for freeze we are likely to
1926              * be stuck.  Fail the controller initialization just
1927              * to be safe.
1928              */
1929             ret = -ENODEV;
1930             goto out_wait_freeze_timed_out;
1931         }
1932         blk_mq_update_nr_hw_queues(ctrl->tagset,
1933             ctrl->queue_count - 1);
1934         nvme_unfreeze(ctrl);
1935     }
1936 
1937     return 0;
1938 
1939 out_wait_freeze_timed_out:
1940     nvme_stop_queues(ctrl);
1941     nvme_sync_io_queues(ctrl);
1942     nvme_tcp_stop_io_queues(ctrl);
1943 out_cleanup_connect_q:
1944     nvme_cancel_tagset(ctrl);
1945     if (new)
1946         blk_mq_destroy_queue(ctrl->connect_q);
1947 out_free_tag_set:
1948     if (new)
1949         blk_mq_free_tag_set(ctrl->tagset);
1950 out_free_io_queues:
1951     nvme_tcp_free_io_queues(ctrl);
1952     return ret;
1953 }
1954 
1955 static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
1956 {
1957     nvme_tcp_stop_queue(ctrl, 0);
1958     if (remove) {
1959         blk_mq_destroy_queue(ctrl->admin_q);
1960         blk_mq_destroy_queue(ctrl->fabrics_q);
1961         blk_mq_free_tag_set(ctrl->admin_tagset);
1962     }
1963     nvme_tcp_free_admin_queue(ctrl);
1964 }
1965 
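     /*
      * Allocate, connect and enable the admin queue.  For a new controller
      * the admin tag set and the fabrics_q/admin_q request queues are
      * created here as well.
      */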
1966 static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
1967 {
1968     int error;
1969 
1970     error = nvme_tcp_alloc_admin_queue(ctrl);
1971     if (error)
1972         return error;
1973 
1974     if (new) {
1975         error = nvme_tcp_alloc_admin_tag_set(ctrl);
1976         if (error)
1977             goto out_free_queue;
1978 
1979         ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
1980         if (IS_ERR(ctrl->fabrics_q)) {
1981             error = PTR_ERR(ctrl->fabrics_q);
1982             goto out_free_tagset;
1983         }
1984 
1985         ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
1986         if (IS_ERR(ctrl->admin_q)) {
1987             error = PTR_ERR(ctrl->admin_q);
1988             goto out_cleanup_fabrics_q;
1989         }
1990     }
1991 
1992     error = nvme_tcp_start_queue(ctrl, 0);
1993     if (error)
1994         goto out_cleanup_queue;
1995 
1996     error = nvme_enable_ctrl(ctrl);
1997     if (error)
1998         goto out_stop_queue;
1999 
2000     nvme_start_admin_queue(ctrl);
2001 
2002     error = nvme_init_ctrl_finish(ctrl);
2003     if (error)
2004         goto out_quiesce_queue;
2005 
2006     return 0;
2007 
2008 out_quiesce_queue:
2009     nvme_stop_admin_queue(ctrl);
2010     blk_sync_queue(ctrl->admin_q);
2011 out_stop_queue:
2012     nvme_tcp_stop_queue(ctrl, 0);
2013     nvme_cancel_admin_tagset(ctrl);
2014 out_cleanup_queue:
2015     if (new)
2016         blk_mq_destroy_queue(ctrl->admin_q);
2017 out_cleanup_fabrics_q:
2018     if (new)
2019         blk_mq_destroy_queue(ctrl->fabrics_q);
2020 out_free_tagset:
2021     if (new)
2022         blk_mq_free_tag_set(ctrl->admin_tagset);
2023 out_free_queue:
2024     nvme_tcp_free_admin_queue(ctrl);
2025     return error;
2026 }
2027 
2028 static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
2029         bool remove)
2030 {
2031     nvme_stop_admin_queue(ctrl);
2032     blk_sync_queue(ctrl->admin_q);
2033     nvme_tcp_stop_queue(ctrl, 0);
2034     nvme_cancel_admin_tagset(ctrl);
2035     if (remove)
2036         nvme_start_admin_queue(ctrl);
2037     nvme_tcp_destroy_admin_queue(ctrl, remove);
2038 }
2039 
2040 static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
2041         bool remove)
2042 {
2043     if (ctrl->queue_count <= 1)
2044         return;
2045     nvme_stop_admin_queue(ctrl);
2046     nvme_start_freeze(ctrl);
2047     nvme_stop_queues(ctrl);
2048     nvme_sync_io_queues(ctrl);
2049     nvme_tcp_stop_io_queues(ctrl);
2050     nvme_cancel_tagset(ctrl);
2051     if (remove)
2052         nvme_start_queues(ctrl);
2053     nvme_tcp_destroy_io_queues(ctrl, remove);
2054 }
2055 
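     /*
      * Called after a failed setup or a dropped connection while in the
      * CONNECTING state: either schedule another reconnect attempt after
      * reconnect_delay or, when nvmf_should_reconnect() says the retry
      * budget is exhausted, delete the controller.
      */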
2056 static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
2057 {
2058     /* If we are resetting/deleting then do nothing */
2059     if (ctrl->state != NVME_CTRL_CONNECTING) {
2060         WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
2061             ctrl->state == NVME_CTRL_LIVE);
2062         return;
2063     }
2064 
2065     if (nvmf_should_reconnect(ctrl)) {
2066         dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
2067             ctrl->opts->reconnect_delay);
2068         queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
2069                 ctrl->opts->reconnect_delay * HZ);
2070     } else {
2071         dev_info(ctrl->device, "Removing controller...\n");
2072         nvme_delete_ctrl(ctrl);
2073     }
2074 }
2075 
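     /*
      * Bring up (or re-establish) a controller: configure the admin queue,
      * validate controller capabilities (no ICDOFF, SGL support, queue
      * size limits), configure the I/O queues and move the controller to
      * LIVE.
      */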
2076 static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
2077 {
2078     struct nvmf_ctrl_options *opts = ctrl->opts;
2079     int ret;
2080 
2081     ret = nvme_tcp_configure_admin_queue(ctrl, new);
2082     if (ret)
2083         return ret;
2084 
2085     if (ctrl->icdoff) {
2086         ret = -EOPNOTSUPP;
2087         dev_err(ctrl->device, "icdoff is not supported!\n");
2088         goto destroy_admin;
2089     }
2090 
2091     if (!nvme_ctrl_sgl_supported(ctrl)) {
2092         ret = -EOPNOTSUPP;
2093         dev_err(ctrl->device, "Mandatory sgls are not supported!\n");
2094         goto destroy_admin;
2095     }
2096 
2097     if (opts->queue_size > ctrl->sqsize + 1)
2098         dev_warn(ctrl->device,
2099             "queue_size %zu > ctrl sqsize %u, clamping down\n",
2100             opts->queue_size, ctrl->sqsize + 1);
2101 
2102     if (ctrl->sqsize + 1 > ctrl->maxcmd) {
2103         dev_warn(ctrl->device,
2104             "sqsize %u > ctrl maxcmd %u, clamping down\n",
2105             ctrl->sqsize + 1, ctrl->maxcmd);
2106         ctrl->sqsize = ctrl->maxcmd - 1;
2107     }
2108 
2109     if (ctrl->queue_count > 1) {
2110         ret = nvme_tcp_configure_io_queues(ctrl, new);
2111         if (ret)
2112             goto destroy_admin;
2113     }
2114 
2115     if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
2116         /*
2117          * A state change failure is ok if we started controller
2118          * deletion, unless we are still creating a new controller,
2119          * in which case it would race with the teardown flow.
2120          */
2121         WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2122                  ctrl->state != NVME_CTRL_DELETING_NOIO);
2123         WARN_ON_ONCE(new);
2124         ret = -EINVAL;
2125         goto destroy_io;
2126     }
2127 
2128     nvme_start_ctrl(ctrl);
2129     return 0;
2130 
2131 destroy_io:
2132     if (ctrl->queue_count > 1) {
2133         nvme_stop_queues(ctrl);
2134         nvme_sync_io_queues(ctrl);
2135         nvme_tcp_stop_io_queues(ctrl);
2136         nvme_cancel_tagset(ctrl);
2137         nvme_tcp_destroy_io_queues(ctrl, new);
2138     }
2139 destroy_admin:
2140     nvme_stop_admin_queue(ctrl);
2141     blk_sync_queue(ctrl->admin_q);
2142     nvme_tcp_stop_queue(ctrl, 0);
2143     nvme_cancel_admin_tagset(ctrl);
2144     nvme_tcp_destroy_admin_queue(ctrl, new);
2145     return ret;
2146 }
2147 
2148 static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
2149 {
2150     struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
2151             struct nvme_tcp_ctrl, connect_work);
2152     struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2153 
2154     ++ctrl->nr_reconnects;
2155 
2156     if (nvme_tcp_setup_ctrl(ctrl, false))
2157         goto requeue;
2158 
2159     dev_info(ctrl->device, "Successfully reconnected (attempt %d)\n",
2160             ctrl->nr_reconnects);
2161 
2162     ctrl->nr_reconnects = 0;
2163 
2164     return;
2165 
2166 requeue:
2167     dev_info(ctrl->device, "Failed reconnect attempt %d\n",
2168             ctrl->nr_reconnects);
2169     nvme_tcp_reconnect_or_remove(ctrl);
2170 }
2171 
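     /*
      * Error recovery: tear down all queues while failing fast any pending
      * requests, move the controller to CONNECTING and then try to
      * reconnect or remove it.
      */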
2172 static void nvme_tcp_error_recovery_work(struct work_struct *work)
2173 {
2174     struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
2175                 struct nvme_tcp_ctrl, err_work);
2176     struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2177 
2178     nvme_auth_stop(ctrl);
2179     nvme_stop_keep_alive(ctrl);
2180     flush_work(&ctrl->async_event_work);
2181     nvme_tcp_teardown_io_queues(ctrl, false);
2182     /* unquiesce to fast-fail any pending requests */
2183     nvme_start_queues(ctrl);
2184     nvme_tcp_teardown_admin_queue(ctrl, false);
2185     nvme_start_admin_queue(ctrl);
2186 
2187     if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2188         /* state change failure is ok if we started ctrl delete */
2189         WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2190                  ctrl->state != NVME_CTRL_DELETING_NOIO);
2191         return;
2192     }
2193 
2194     nvme_tcp_reconnect_or_remove(ctrl);
2195 }
2196 
2197 static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
2198 {
2199     nvme_tcp_teardown_io_queues(ctrl, shutdown);
2200     nvme_stop_admin_queue(ctrl);
2201     if (shutdown)
2202         nvme_shutdown_ctrl(ctrl);
2203     else
2204         nvme_disable_ctrl(ctrl);
2205     nvme_tcp_teardown_admin_queue(ctrl, shutdown);
2206 }
2207 
2208 static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
2209 {
2210     nvme_tcp_teardown_ctrl(ctrl, true);
2211 }
2212 
2213 static void nvme_reset_ctrl_work(struct work_struct *work)
2214 {
2215     struct nvme_ctrl *ctrl =
2216         container_of(work, struct nvme_ctrl, reset_work);
2217 
2218     nvme_stop_ctrl(ctrl);
2219     nvme_tcp_teardown_ctrl(ctrl, false);
2220 
2221     if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2222         /* state change failure is ok if we started ctrl delete */
2223         WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2224                  ctrl->state != NVME_CTRL_DELETING_NOIO);
2225         return;
2226     }
2227 
2228     if (nvme_tcp_setup_ctrl(ctrl, false))
2229         goto out_fail;
2230 
2231     return;
2232 
2233 out_fail:
2234     ++ctrl->nr_reconnects;
2235     nvme_tcp_reconnect_or_remove(ctrl);
2236 }
2237 
2238 static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
2239 {
2240     cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
2241     cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
2242 }
2243 
2244 static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
2245 {
2246     struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
2247 
2248     if (list_empty(&ctrl->list))
2249         goto free_ctrl;
2250 
2251     mutex_lock(&nvme_tcp_ctrl_mutex);
2252     list_del(&ctrl->list);
2253     mutex_unlock(&nvme_tcp_ctrl_mutex);
2254 
2255     nvmf_free_options(nctrl->opts);
2256 free_ctrl:
2257     kfree(ctrl->queues);
2258     kfree(ctrl);
2259 }
2260 
2261 static void nvme_tcp_set_sg_null(struct nvme_command *c)
2262 {
2263     struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2264 
2265     sg->addr = 0;
2266     sg->length = 0;
2267     sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2268             NVME_SGL_FMT_TRANSPORT_A;
2269 }
2270 
2271 static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
2272         struct nvme_command *c, u32 data_len)
2273 {
2274     struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2275 
2276     sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
2277     sg->length = cpu_to_le32(data_len);
2278     sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
2279 }
2280 
2281 static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
2282         u32 data_len)
2283 {
2284     struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2285 
2286     sg->addr = 0;
2287     sg->length = cpu_to_le32(data_len);
2288     sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2289             NVME_SGL_FMT_TRANSPORT_A;
2290 }
2291 
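     /*
      * Build and queue the AER command PDU on the admin queue using the
      * pre-allocated async_req and the reserved NVME_AQ_BLK_MQ_DEPTH
      * command id.
      */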
2292 static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
2293 {
2294     struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
2295     struct nvme_tcp_queue *queue = &ctrl->queues[0];
2296     struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
2297     struct nvme_command *cmd = &pdu->cmd;
2298     u8 hdgst = nvme_tcp_hdgst_len(queue);
2299 
2300     memset(pdu, 0, sizeof(*pdu));
2301     pdu->hdr.type = nvme_tcp_cmd;
2302     if (queue->hdr_digest)
2303         pdu->hdr.flags |= NVME_TCP_F_HDGST;
2304     pdu->hdr.hlen = sizeof(*pdu);
2305     pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
2306 
2307     cmd->common.opcode = nvme_admin_async_event;
2308     cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
2309     cmd->common.flags |= NVME_CMD_SGL_METABUF;
2310     nvme_tcp_set_sg_null(cmd);
2311 
2312     ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
2313     ctrl->async_req.offset = 0;
2314     ctrl->async_req.curr_bio = NULL;
2315     ctrl->async_req.data_len = 0;
2316 
2317     nvme_tcp_queue_request(&ctrl->async_req, true, true);
2318 }
2319 
2320 static void nvme_tcp_complete_timed_out(struct request *rq)
2321 {
2322     struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2323     struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2324 
2325     nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
2326     nvmf_complete_timed_out_request(rq);
2327 }
2328 
2329 static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
2330 {
2331     struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2332     struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2333     struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2334 
2335     dev_warn(ctrl->device,
2336         "queue %d: timeout request %#x type %d\n",
2337         nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
2338 
2339     if (ctrl->state != NVME_CTRL_LIVE) {
2340         /*
2341          * If we are resetting, connecting or deleting, we should
2342          * complete the request immediately because it may block the
2343          * controller teardown or setup sequence:
2344          * - ctrl disable/shutdown fabrics requests
2345          * - connect requests
2346          * - initialization admin requests
2347          * - I/O requests that entered after unquiescing and
2348          *   the controller stopped responding
2349          *
2350          * All other requests should be cancelled by the error
2351          * recovery work, so it's fine that we fail it here.
2352          */
2353         nvme_tcp_complete_timed_out(rq);
2354         return BLK_EH_DONE;
2355     }
2356 
2357     /*
2358      * LIVE state should trigger the normal error recovery which will
2359      * handle completing this request.
2360      */
2361     nvme_tcp_error_recovery(ctrl);
2362     return BLK_EH_RESET_TIMER;
2363 }
2364 
2365 static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
2366             struct request *rq)
2367 {
2368     struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2369     struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2370     struct nvme_command *c = &pdu->cmd;
2371 
2372     c->common.flags |= NVME_CMD_SGL_METABUF;
2373 
2374     if (!blk_rq_nr_phys_segments(rq))
2375         nvme_tcp_set_sg_null(c);
2376     else if (rq_data_dir(rq) == WRITE &&
2377         req->data_len <= nvme_tcp_inline_data_size(req))
2378         nvme_tcp_set_sg_inline(queue, c, req->data_len);
2379     else
2380         nvme_tcp_set_sg_host_data(c, req->data_len);
2381 
2382     return 0;
2383 }
2384 
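     /*
      * Translate a block layer request into an NVMe/TCP command PDU: set
      * up the NVMe command, reset the per-request send state, size the
      * header (including digests) and describe the data as a NULL SGL,
      * inline in-capsule data or a transport SGL.
      */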
2385 static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
2386         struct request *rq)
2387 {
2388     struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2389     struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2390     struct nvme_tcp_queue *queue = req->queue;
2391     u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
2392     blk_status_t ret;
2393 
2394     ret = nvme_setup_cmd(ns, rq);
2395     if (ret)
2396         return ret;
2397 
2398     req->state = NVME_TCP_SEND_CMD_PDU;
2399     req->status = cpu_to_le16(NVME_SC_SUCCESS);
2400     req->offset = 0;
2401     req->data_sent = 0;
2402     req->pdu_len = 0;
2403     req->pdu_sent = 0;
2404     req->h2cdata_left = 0;
2405     req->data_len = blk_rq_nr_phys_segments(rq) ?
2406                 blk_rq_payload_bytes(rq) : 0;
2407     req->curr_bio = rq->bio;
2408     if (req->curr_bio && req->data_len)
2409         nvme_tcp_init_iter(req, rq_data_dir(rq));
2410 
2411     if (rq_data_dir(rq) == WRITE &&
2412         req->data_len <= nvme_tcp_inline_data_size(req))
2413         req->pdu_len = req->data_len;
2414 
2415     pdu->hdr.type = nvme_tcp_cmd;
2416     pdu->hdr.flags = 0;
2417     if (queue->hdr_digest)
2418         pdu->hdr.flags |= NVME_TCP_F_HDGST;
2419     if (queue->data_digest && req->pdu_len) {
2420         pdu->hdr.flags |= NVME_TCP_F_DDGST;
2421         ddgst = nvme_tcp_ddgst_len(queue);
2422     }
2423     pdu->hdr.hlen = sizeof(*pdu);
2424     pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2425     pdu->hdr.plen =
2426         cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2427 
2428     ret = nvme_tcp_map_data(queue, rq);
2429     if (unlikely(ret)) {
2430         nvme_cleanup_cmd(rq);
2431         dev_err(queue->ctrl->ctrl.device,
2432             "Failed to map data (%d)\n", ret);
2433         return ret;
2434     }
2435 
2436     return 0;
2437 }
2438 
2439 static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
2440 {
2441     struct nvme_tcp_queue *queue = hctx->driver_data;
2442 
2443     if (!llist_empty(&queue->req_list))
2444         queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
2445 }
2446 
2447 static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2448         const struct blk_mq_queue_data *bd)
2449 {
2450     struct nvme_ns *ns = hctx->queue->queuedata;
2451     struct nvme_tcp_queue *queue = hctx->driver_data;
2452     struct request *rq = bd->rq;
2453     struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2454     bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2455     blk_status_t ret;
2456 
2457     if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2458         return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
2459 
2460     ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2461     if (unlikely(ret))
2462         return ret;
2463 
2464     blk_mq_start_request(rq);
2465 
2466     nvme_tcp_queue_request(req, true, bd->last);
2467 
2468     return BLK_STS_OK;
2469 }
2470 
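     /*
      * Map blk-mq hardware contexts onto the controller's TCP queues:
      * default and read contexts either get dedicated queue ranges or
      * share the same queues, and poll contexts (if any) are mapped after
      * them.
      */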
2471 static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
2472 {
2473     struct nvme_tcp_ctrl *ctrl = set->driver_data;
2474     struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
2475 
2476     if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
2477         /* separate read/write queues */
2478         set->map[HCTX_TYPE_DEFAULT].nr_queues =
2479             ctrl->io_queues[HCTX_TYPE_DEFAULT];
2480         set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2481         set->map[HCTX_TYPE_READ].nr_queues =
2482             ctrl->io_queues[HCTX_TYPE_READ];
2483         set->map[HCTX_TYPE_READ].queue_offset =
2484             ctrl->io_queues[HCTX_TYPE_DEFAULT];
2485     } else {
2486         /* shared read/write queues */
2487         set->map[HCTX_TYPE_DEFAULT].nr_queues =
2488             ctrl->io_queues[HCTX_TYPE_DEFAULT];
2489         set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2490         set->map[HCTX_TYPE_READ].nr_queues =
2491             ctrl->io_queues[HCTX_TYPE_DEFAULT];
2492         set->map[HCTX_TYPE_READ].queue_offset = 0;
2493     }
2494     blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
2495     blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
2496 
2497     if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
2498         /* map dedicated poll queues only if we have queues left */
2499         set->map[HCTX_TYPE_POLL].nr_queues =
2500                 ctrl->io_queues[HCTX_TYPE_POLL];
2501         set->map[HCTX_TYPE_POLL].queue_offset =
2502             ctrl->io_queues[HCTX_TYPE_DEFAULT] +
2503             ctrl->io_queues[HCTX_TYPE_READ];
2504         blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
2505     }
2506 
2507     dev_info(ctrl->ctrl.device,
2508         "mapped %d/%d/%d default/read/poll queues.\n",
2509         ctrl->io_queues[HCTX_TYPE_DEFAULT],
2510         ctrl->io_queues[HCTX_TYPE_READ],
2511         ctrl->io_queues[HCTX_TYPE_POLL]);
2512 
2513     return 0;
2514 }
2515 
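     /*
      * blk-mq poll callback: busy-poll the socket when possible, then
      * receive and complete any pending responses directly; returns the
      * number of completions processed.
      */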
2516 static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
2517 {
2518     struct nvme_tcp_queue *queue = hctx->driver_data;
2519     struct sock *sk = queue->sock->sk;
2520 
2521     if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
2522         return 0;
2523 
2524     set_bit(NVME_TCP_Q_POLLING, &queue->flags);
2525     if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
2526         sk_busy_loop(sk, true);
2527     nvme_tcp_try_recv(queue);
2528     clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
2529     return queue->nr_cqe;
2530 }
2531 
2532 static const struct blk_mq_ops nvme_tcp_mq_ops = {
2533     .queue_rq   = nvme_tcp_queue_rq,
2534     .commit_rqs = nvme_tcp_commit_rqs,
2535     .complete   = nvme_complete_rq,
2536     .init_request   = nvme_tcp_init_request,
2537     .exit_request   = nvme_tcp_exit_request,
2538     .init_hctx  = nvme_tcp_init_hctx,
2539     .timeout    = nvme_tcp_timeout,
2540     .map_queues = nvme_tcp_map_queues,
2541     .poll       = nvme_tcp_poll,
2542 };
2543 
2544 static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2545     .queue_rq   = nvme_tcp_queue_rq,
2546     .complete   = nvme_complete_rq,
2547     .init_request   = nvme_tcp_init_request,
2548     .exit_request   = nvme_tcp_exit_request,
2549     .init_hctx  = nvme_tcp_init_admin_hctx,
2550     .timeout    = nvme_tcp_timeout,
2551 };
2552 
2553 static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2554     .name           = "tcp",
2555     .module         = THIS_MODULE,
2556     .flags          = NVME_F_FABRICS,
2557     .reg_read32     = nvmf_reg_read32,
2558     .reg_read64     = nvmf_reg_read64,
2559     .reg_write32        = nvmf_reg_write32,
2560     .free_ctrl      = nvme_tcp_free_ctrl,
2561     .submit_async_event = nvme_tcp_submit_async_event,
2562     .delete_ctrl        = nvme_tcp_delete_ctrl,
2563     .get_address        = nvmf_get_address,
2564     .stop_ctrl      = nvme_tcp_stop_ctrl,
2565 };
2566 
2567 static bool
2568 nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2569 {
2570     struct nvme_tcp_ctrl *ctrl;
2571     bool found = false;
2572 
2573     mutex_lock(&nvme_tcp_ctrl_mutex);
2574     list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2575         found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2576         if (found)
2577             break;
2578     }
2579     mutex_unlock(&nvme_tcp_ctrl_mutex);
2580 
2581     return found;
2582 }
2583 
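     /*
      * Create a new TCP controller from the fabrics connect options,
      * typically reached from userspace via nvme-cli, e.g. (example
      * invocation, addresses are placeholders):
      *
      *   nvme connect -t tcp -a 192.168.0.10 -s 4420 -n <subsystem NQN>
      *
      * When no trsvcid is given, the NVMe/TCP discovery port is used as
      * the default.
      */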
2584 static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2585         struct nvmf_ctrl_options *opts)
2586 {
2587     struct nvme_tcp_ctrl *ctrl;
2588     int ret;
2589 
2590     ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2591     if (!ctrl)
2592         return ERR_PTR(-ENOMEM);
2593 
2594     INIT_LIST_HEAD(&ctrl->list);
2595     ctrl->ctrl.opts = opts;
2596     ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2597                 opts->nr_poll_queues + 1;
2598     ctrl->ctrl.sqsize = opts->queue_size - 1;
2599     ctrl->ctrl.kato = opts->kato;
2600 
2601     INIT_DELAYED_WORK(&ctrl->connect_work,
2602             nvme_tcp_reconnect_ctrl_work);
2603     INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2604     INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
2605 
2606     if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2607         opts->trsvcid =
2608             kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2609         if (!opts->trsvcid) {
2610             ret = -ENOMEM;
2611             goto out_free_ctrl;
2612         }
2613         opts->mask |= NVMF_OPT_TRSVCID;
2614     }
2615 
2616     ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2617             opts->traddr, opts->trsvcid, &ctrl->addr);
2618     if (ret) {
2619         pr_err("malformed address passed: %s:%s\n",
2620             opts->traddr, opts->trsvcid);
2621         goto out_free_ctrl;
2622     }
2623 
2624     if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2625         ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2626             opts->host_traddr, NULL, &ctrl->src_addr);
2627         if (ret) {
2628             pr_err("malformed src address passed: %s\n",
2629                    opts->host_traddr);
2630             goto out_free_ctrl;
2631         }
2632     }
2633 
2634     if (opts->mask & NVMF_OPT_HOST_IFACE) {
2635         if (!__dev_get_by_name(&init_net, opts->host_iface)) {
2636             pr_err("invalid interface passed: %s\n",
2637                    opts->host_iface);
2638             ret = -ENODEV;
2639             goto out_free_ctrl;
2640         }
2641     }
2642 
2643     if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
2644         ret = -EALREADY;
2645         goto out_free_ctrl;
2646     }
2647 
2648     ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2649                 GFP_KERNEL);
2650     if (!ctrl->queues) {
2651         ret = -ENOMEM;
2652         goto out_free_ctrl;
2653     }
2654 
2655     ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2656     if (ret)
2657         goto out_kfree_queues;
2658 
2659     if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2660         WARN_ON_ONCE(1);
2661         ret = -EINTR;
2662         goto out_uninit_ctrl;
2663     }
2664 
2665     ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2666     if (ret)
2667         goto out_uninit_ctrl;
2668 
2669     dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
2670         nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr);
2671 
2672     mutex_lock(&nvme_tcp_ctrl_mutex);
2673     list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2674     mutex_unlock(&nvme_tcp_ctrl_mutex);
2675 
2676     return &ctrl->ctrl;
2677 
2678 out_uninit_ctrl:
2679     nvme_uninit_ctrl(&ctrl->ctrl);
2680     nvme_put_ctrl(&ctrl->ctrl);
2681     if (ret > 0)
2682         ret = -EIO;
2683     return ERR_PTR(ret);
2684 out_kfree_queues:
2685     kfree(ctrl->queues);
2686 out_free_ctrl:
2687     kfree(ctrl);
2688     return ERR_PTR(ret);
2689 }
2690 
2691 static struct nvmf_transport_ops nvme_tcp_transport = {
2692     .name       = "tcp",
2693     .module     = THIS_MODULE,
2694     .required_opts  = NVMF_OPT_TRADDR,
2695     .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2696               NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2697               NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
2698               NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2699               NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE,
2700     .create_ctrl    = nvme_tcp_create_ctrl,
2701 };
2702 
2703 static int __init nvme_tcp_init_module(void)
2704 {
         int ret;

2705     nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
2706             WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2707     if (!nvme_tcp_wq)
2708         return -ENOMEM;
2709 
2710     ret = nvmf_register_transport(&nvme_tcp_transport);
         if (ret) {
             destroy_workqueue(nvme_tcp_wq);
             return ret;
         }
2711     return 0;
2712 }
2713 
2714 static void __exit nvme_tcp_cleanup_module(void)
2715 {
2716     struct nvme_tcp_ctrl *ctrl;
2717 
2718     nvmf_unregister_transport(&nvme_tcp_transport);
2719 
2720     mutex_lock(&nvme_tcp_ctrl_mutex);
2721     list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2722         nvme_delete_ctrl(&ctrl->ctrl);
2723     mutex_unlock(&nvme_tcp_ctrl_mutex);
2724     flush_workqueue(nvme_delete_wq);
2725 
2726     destroy_workqueue(nvme_tcp_wq);
2727 }
2728 
2729 module_init(nvme_tcp_init_module);
2730 module_exit(nvme_tcp_cleanup_module);
2731 
2732 MODULE_LICENSE("GPL v2");