Back to home page

OSCL-LXR

 
 

    


0001 /*
0002  * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
0003  *
0004  * This software is available to you under a choice of one of two
0005  * licenses.  You may choose to be licensed under the terms of the GNU
0006  * General Public License (GPL) Version 2, available from the file
0007  * COPYING in the main directory of this source tree, or the
0008  * OpenIB.org BSD license below:
0009  *
0010  *     Redistribution and use in source and binary forms, with or
0011  *     without modification, are permitted provided that the following
0012  *     conditions are met:
0013  *
0014  *      - Redistributions of source code must retain the above
0015  *        copyright notice, this list of conditions and the following
0016  *        disclaimer.
0017  *
0018  *      - Redistributions in binary form must reproduce the above
0019  *        copyright notice, this list of conditions and the following
0020  *        disclaimer in the documentation and/or other materials
0021  *        provided with the distribution.
0022  *
0023  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
0024  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
0025  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
0026  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
0027  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
0028  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
0029  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
0030  * SOFTWARE.
0031  *
0032  */
0033 #include <linux/kernel.h>
0034 #include <linux/in.h>
0035 #include <net/tcp.h>
0036 
0037 #include "rds.h"
0038 #include "tcp.h"
0039 
0040 void rds_tcp_state_change(struct sock *sk)
0041 {
0042     void (*state_change)(struct sock *sk);
0043     struct rds_conn_path *cp;
0044     struct rds_tcp_connection *tc;
0045 
0046     read_lock_bh(&sk->sk_callback_lock);
0047     cp = sk->sk_user_data;
0048     if (!cp) {
0049         state_change = sk->sk_state_change;
0050         goto out;
0051     }
0052     tc = cp->cp_transport_data;
0053     state_change = tc->t_orig_state_change;
0054 
0055     rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);
0056 
0057     switch (sk->sk_state) {
0058     /* ignore connecting sockets as they make progress */
0059     case TCP_SYN_SENT:
0060     case TCP_SYN_RECV:
0061         break;
0062     case TCP_ESTABLISHED:
0063         /* Force the peer to reconnect so that we have the
0064          * TCP ports going from <smaller-ip>.<transient> to
0065          * <larger-ip>.<RDS_TCP_PORT>. We avoid marking the
0066          * RDS connection as RDS_CONN_UP until the reconnect,
0067          * to avoid RDS datagram loss.
0068          */
0069         if (rds_addr_cmp(&cp->cp_conn->c_laddr,
0070                  &cp->cp_conn->c_faddr) >= 0 &&
0071             rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
0072                          RDS_CONN_ERROR)) {
0073             rds_conn_path_drop(cp, false);
0074         } else {
0075             rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
0076         }
0077         break;
0078     case TCP_CLOSE_WAIT:
0079     case TCP_CLOSE:
0080         rds_conn_path_drop(cp, false);
0081         break;
0082     default:
0083         break;
0084     }
0085 out:
0086     read_unlock_bh(&sk->sk_callback_lock);
0087     state_change(sk);
0088 }
0089 
0090 int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
0091 {
0092     struct socket *sock = NULL;
0093     struct sockaddr_in6 sin6;
0094     struct sockaddr_in sin;
0095     struct sockaddr *addr;
0096     int addrlen;
0097     bool isv6;
0098     int ret;
0099     struct rds_connection *conn = cp->cp_conn;
0100     struct rds_tcp_connection *tc = cp->cp_transport_data;
0101 
0102     /* for multipath rds,we only trigger the connection after
0103      * the handshake probe has determined the number of paths.
0104      */
0105     if (cp->cp_index > 0 && cp->cp_conn->c_npaths < 2)
0106         return -EAGAIN;
0107 
0108     mutex_lock(&tc->t_conn_path_lock);
0109 
0110     if (rds_conn_path_up(cp)) {
0111         mutex_unlock(&tc->t_conn_path_lock);
0112         return 0;
0113     }
0114     if (ipv6_addr_v4mapped(&conn->c_laddr)) {
0115         ret = sock_create_kern(rds_conn_net(conn), PF_INET,
0116                        SOCK_STREAM, IPPROTO_TCP, &sock);
0117         isv6 = false;
0118     } else {
0119         ret = sock_create_kern(rds_conn_net(conn), PF_INET6,
0120                        SOCK_STREAM, IPPROTO_TCP, &sock);
0121         isv6 = true;
0122     }
0123 
0124     if (ret < 0)
0125         goto out;
0126 
0127     if (!rds_tcp_tune(sock)) {
0128         ret = -EINVAL;
0129         goto out;
0130     }
0131 
0132     if (isv6) {
0133         sin6.sin6_family = AF_INET6;
0134         sin6.sin6_addr = conn->c_laddr;
0135         sin6.sin6_port = 0;
0136         sin6.sin6_flowinfo = 0;
0137         sin6.sin6_scope_id = conn->c_dev_if;
0138         addr = (struct sockaddr *)&sin6;
0139         addrlen = sizeof(sin6);
0140     } else {
0141         sin.sin_family = AF_INET;
0142         sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
0143         sin.sin_port = 0;
0144         addr = (struct sockaddr *)&sin;
0145         addrlen = sizeof(sin);
0146     }
0147 
0148     ret = sock->ops->bind(sock, addr, addrlen);
0149     if (ret) {
0150         rdsdebug("bind failed with %d at address %pI6c\n",
0151              ret, &conn->c_laddr);
0152         goto out;
0153     }
0154 
0155     if (isv6) {
0156         sin6.sin6_family = AF_INET6;
0157         sin6.sin6_addr = conn->c_faddr;
0158         sin6.sin6_port = htons(RDS_TCP_PORT);
0159         sin6.sin6_flowinfo = 0;
0160         sin6.sin6_scope_id = conn->c_dev_if;
0161         addr = (struct sockaddr *)&sin6;
0162         addrlen = sizeof(sin6);
0163     } else {
0164         sin.sin_family = AF_INET;
0165         sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
0166         sin.sin_port = htons(RDS_TCP_PORT);
0167         addr = (struct sockaddr *)&sin;
0168         addrlen = sizeof(sin);
0169     }
0170 
0171     /*
0172      * once we call connect() we can start getting callbacks and they
0173      * own the socket
0174      */
0175     rds_tcp_set_callbacks(sock, cp);
0176     ret = sock->ops->connect(sock, addr, addrlen, O_NONBLOCK);
0177 
0178     rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret);
0179     if (ret == -EINPROGRESS)
0180         ret = 0;
0181     if (ret == 0) {
0182         rds_tcp_keepalive(sock);
0183         sock = NULL;
0184     } else {
0185         rds_tcp_restore_callbacks(sock, cp->cp_transport_data);
0186     }
0187 
0188 out:
0189     mutex_unlock(&tc->t_conn_path_lock);
0190     if (sock)
0191         sock_release(sock);
0192     return ret;
0193 }
0194 
0195 /*
0196  * Before killing the tcp socket this needs to serialize with callbacks.  The
0197  * caller has already grabbed the sending sem so we're serialized with other
0198  * senders.
0199  *
0200  * TCP calls the callbacks with the sock lock so we hold it while we reset the
0201  * callbacks to those set by TCP.  Our callbacks won't execute again once we
0202  * hold the sock lock.
0203  */
0204 void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp)
0205 {
0206     struct rds_tcp_connection *tc = cp->cp_transport_data;
0207     struct socket *sock = tc->t_sock;
0208 
0209     rdsdebug("shutting down conn %p tc %p sock %p\n",
0210          cp->cp_conn, tc, sock);
0211 
0212     if (sock) {
0213         if (rds_destroy_pending(cp->cp_conn))
0214             sock_no_linger(sock->sk);
0215         sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
0216         lock_sock(sock->sk);
0217         rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */
0218 
0219         release_sock(sock->sk);
0220         sock_release(sock);
0221     }
0222 
0223     if (tc->t_tinc) {
0224         rds_inc_put(&tc->t_tinc->ti_inc);
0225         tc->t_tinc = NULL;
0226     }
0227     tc->t_tinc_hdr_rem = sizeof(struct rds_header);
0228     tc->t_tinc_data_rem = 0;
0229 }