0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033 #include <linux/slab.h>
0034 #include <linux/types.h>
0035 #include <linux/rbtree.h>
0036 #include <linux/bitops.h>
0037 #include <linux/export.h>
0038
0039 #include "rds.h"
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081 static atomic_t rds_cong_generation = ATOMIC_INIT(0);
0082
0083
0084
0085
0086 static LIST_HEAD(rds_cong_monitor);
0087 static DEFINE_RWLOCK(rds_cong_monitor_lock);
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100
0101 static DEFINE_SPINLOCK(rds_cong_lock);
0102 static struct rb_root rds_cong_tree = RB_ROOT;
0103
0104 static struct rds_cong_map *rds_cong_tree_walk(const struct in6_addr *addr,
0105 struct rds_cong_map *insert)
0106 {
0107 struct rb_node **p = &rds_cong_tree.rb_node;
0108 struct rb_node *parent = NULL;
0109 struct rds_cong_map *map;
0110
0111 while (*p) {
0112 int diff;
0113
0114 parent = *p;
0115 map = rb_entry(parent, struct rds_cong_map, m_rb_node);
0116
0117 diff = rds_addr_cmp(addr, &map->m_addr);
0118 if (diff < 0)
0119 p = &(*p)->rb_left;
0120 else if (diff > 0)
0121 p = &(*p)->rb_right;
0122 else
0123 return map;
0124 }
0125
0126 if (insert) {
0127 rb_link_node(&insert->m_rb_node, parent, p);
0128 rb_insert_color(&insert->m_rb_node, &rds_cong_tree);
0129 }
0130 return NULL;
0131 }
0132
0133
0134
0135
0136
0137
0138 static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr)
0139 {
0140 struct rds_cong_map *map;
0141 struct rds_cong_map *ret = NULL;
0142 unsigned long zp;
0143 unsigned long i;
0144 unsigned long flags;
0145
0146 map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
0147 if (!map)
0148 return NULL;
0149
0150 map->m_addr = *addr;
0151 init_waitqueue_head(&map->m_waitq);
0152 INIT_LIST_HEAD(&map->m_conn_list);
0153
0154 for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
0155 zp = get_zeroed_page(GFP_KERNEL);
0156 if (zp == 0)
0157 goto out;
0158 map->m_page_addrs[i] = zp;
0159 }
0160
0161 spin_lock_irqsave(&rds_cong_lock, flags);
0162 ret = rds_cong_tree_walk(addr, map);
0163 spin_unlock_irqrestore(&rds_cong_lock, flags);
0164
0165 if (!ret) {
0166 ret = map;
0167 map = NULL;
0168 }
0169
0170 out:
0171 if (map) {
0172 for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
0173 free_page(map->m_page_addrs[i]);
0174 kfree(map);
0175 }
0176
0177 rdsdebug("map %p for addr %pI6c\n", ret, addr);
0178
0179 return ret;
0180 }
0181
0182
0183
0184
0185
0186 void rds_cong_add_conn(struct rds_connection *conn)
0187 {
0188 unsigned long flags;
0189
0190 rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong);
0191 spin_lock_irqsave(&rds_cong_lock, flags);
0192 list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list);
0193 spin_unlock_irqrestore(&rds_cong_lock, flags);
0194 }
0195
0196 void rds_cong_remove_conn(struct rds_connection *conn)
0197 {
0198 unsigned long flags;
0199
0200 rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong);
0201 spin_lock_irqsave(&rds_cong_lock, flags);
0202 list_del_init(&conn->c_map_item);
0203 spin_unlock_irqrestore(&rds_cong_lock, flags);
0204 }
0205
0206 int rds_cong_get_maps(struct rds_connection *conn)
0207 {
0208 conn->c_lcong = rds_cong_from_addr(&conn->c_laddr);
0209 conn->c_fcong = rds_cong_from_addr(&conn->c_faddr);
0210
0211 if (!(conn->c_lcong && conn->c_fcong))
0212 return -ENOMEM;
0213
0214 return 0;
0215 }
0216
/*
 * @map's bitmap changed: ask every connection hanging off it to send
 * a congestion update to its peer.  Work is queued at most once per
 * connection (c_map_queued bit) until the send worker clears it.
 */
void rds_cong_queue_updates(struct rds_cong_map *map)
{
	struct rds_connection *conn;
	unsigned long flags;

	spin_lock_irqsave(&rds_cong_lock, flags);

	list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
		struct rds_conn_path *cp = &conn->c_path[0];

		/* rcu read section covers the rds_destroy_pending() check;
		 * NOTE(review): only path 0 is used here — presumably
		 * congestion updates always ride the first path; confirm.
		 */
		rcu_read_lock();
		if (!test_and_set_bit(0, &conn->c_map_queued) &&
		    !rds_destroy_pending(cp->cp_conn)) {
			rds_stats_inc(s_cong_update_queued);
			/* The bit is set (claiming the update) before the
			 * destroy check, and work is queued only when the
			 * connection is not being torn down — this ordering
			 * looks deliberate to avoid queueing work on a
			 * dying connection; do not reorder.
			 */
			queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
		}
		rcu_read_unlock();
	}

	spin_unlock_irqrestore(&rds_cong_lock, flags);
}
0252
0253 void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
0254 {
0255 rdsdebug("waking map %p for %pI4\n",
0256 map, &map->m_addr);
0257 rds_stats_inc(s_cong_update_received);
0258 atomic_inc(&rds_cong_generation);
0259 if (waitqueue_active(&map->m_waitq))
0260 wake_up(&map->m_waitq);
0261 if (waitqueue_active(&rds_poll_waitq))
0262 wake_up_all(&rds_poll_waitq);
0263
0264 if (portmask && !list_empty(&rds_cong_monitor)) {
0265 unsigned long flags;
0266 struct rds_sock *rs;
0267
0268 read_lock_irqsave(&rds_cong_monitor_lock, flags);
0269 list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) {
0270 spin_lock(&rs->rs_lock);
0271 rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
0272 rs->rs_cong_mask &= ~portmask;
0273 spin_unlock(&rs->rs_lock);
0274 if (rs->rs_cong_notify)
0275 rds_wake_sk_sleep(rs);
0276 }
0277 read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
0278 }
0279 }
0280 EXPORT_SYMBOL_GPL(rds_cong_map_updated);
0281
0282 int rds_cong_updated_since(unsigned long *recent)
0283 {
0284 unsigned long gen = atomic_read(&rds_cong_generation);
0285
0286 if (likely(*recent == gen))
0287 return 0;
0288 *recent = gen;
0289 return 1;
0290 }
0291
0292
0293
0294
0295
0296
0297
0298
0299 void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
0300 {
0301 unsigned long i;
0302 unsigned long off;
0303
0304 rdsdebug("setting congestion for %pI4:%u in map %p\n",
0305 &map->m_addr, ntohs(port), map);
0306
0307 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
0308 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
0309
0310 set_bit_le(off, (void *)map->m_page_addrs[i]);
0311 }
0312
0313 void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
0314 {
0315 unsigned long i;
0316 unsigned long off;
0317
0318 rdsdebug("clearing congestion for %pI4:%u in map %p\n",
0319 &map->m_addr, ntohs(port), map);
0320
0321 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
0322 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
0323
0324 clear_bit_le(off, (void *)map->m_page_addrs[i]);
0325 }
0326
0327 static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
0328 {
0329 unsigned long i;
0330 unsigned long off;
0331
0332 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
0333 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
0334
0335 return test_bit_le(off, (void *)map->m_page_addrs[i]);
0336 }
0337
0338 void rds_cong_add_socket(struct rds_sock *rs)
0339 {
0340 unsigned long flags;
0341
0342 write_lock_irqsave(&rds_cong_monitor_lock, flags);
0343 if (list_empty(&rs->rs_cong_list))
0344 list_add(&rs->rs_cong_list, &rds_cong_monitor);
0345 write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
0346 }
0347
/*
 * Unregister @rs from congestion monitoring, then clear the congestion
 * bit for its bound port (the port is going away) and push that update
 * to the map's connections.
 */
void rds_cong_remove_socket(struct rds_sock *rs)
{
	unsigned long flags;
	struct rds_cong_map *map;

	write_lock_irqsave(&rds_cong_monitor_lock, flags);
	list_del_init(&rs->rs_cong_list);
	write_unlock_irqrestore(&rds_cong_monitor_lock, flags);

	/* update congestion map for the now-closed port; lookup only,
	 * no insert (NULL second argument) */
	spin_lock_irqsave(&rds_cong_lock, flags);
	map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL);
	spin_unlock_irqrestore(&rds_cong_lock, flags);

	/* NOTE(review): map is used after dropping rds_cong_lock —
	 * presumably maps are only freed at module exit, so this is
	 * safe; confirm against rds_cong_exit()'s callers. */
	if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
		rds_cong_clear_bit(map, rs->rs_bound_port);
		rds_cong_queue_updates(map);
	}
}
0367
/*
 * Check whether @port on @map is congested.  Returns 0 immediately if
 * not.  Otherwise: in nonblocking mode, arm the monitor mask (if @rs is
 * a monitoring socket) and fail with -ENOBUFS; in blocking mode, sleep
 * interruptibly until the congestion bit clears.
 */
int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock,
		  struct rds_sock *rs)
{
	if (!rds_cong_test_bit(map, port))
		return 0;
	if (nonblock) {
		if (rs && rs->rs_cong_monitor) {
			unsigned long flags;

			/* arm this port in the socket's monitor mask so a
			 * later rds_cong_map_updated() will notify us */
			spin_lock_irqsave(&rs->rs_lock, flags);
			rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port));
			spin_unlock_irqrestore(&rs->rs_lock, flags);

			/* re-test AFTER arming: if the bit cleared in the
			 * window between the first test and the arm, we must
			 * not report congestion — ordering matters here */
			if (!rds_cong_test_bit(map, port))
				return 0;
		}
		rds_stats_inc(s_cong_send_error);
		return -ENOBUFS;
	}

	rds_stats_inc(s_cong_send_blocked);
	rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port));

	/* woken by rds_cong_map_updated(); returns -ERESTARTSYS on signal */
	return wait_event_interruptible(map->m_waitq,
					!rds_cong_test_bit(map, port));
}
0398
0399 void rds_cong_exit(void)
0400 {
0401 struct rb_node *node;
0402 struct rds_cong_map *map;
0403 unsigned long i;
0404
0405 while ((node = rb_first(&rds_cong_tree))) {
0406 map = rb_entry(node, struct rds_cong_map, m_rb_node);
0407 rdsdebug("freeing map %p\n", map);
0408 rb_erase(&map->m_rb_node, &rds_cong_tree);
0409 for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
0410 free_page(map->m_page_addrs[i]);
0411 kfree(map);
0412 }
0413 }
0414
0415
0416
0417
0418 struct rds_message *rds_cong_update_alloc(struct rds_connection *conn)
0419 {
0420 struct rds_cong_map *map = conn->c_lcong;
0421 struct rds_message *rm;
0422
0423 rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES);
0424 if (!IS_ERR(rm))
0425 rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP;
0426
0427 return rm;
0428 }