diff --git a/Documentation/networking/ipvs-sysctl.rst b/Documentation/networking/ipvs-sysctl.rst index 3fb5fa142eef..a556439f8be7 100644 --- a/Documentation/networking/ipvs-sysctl.rst +++ b/Documentation/networking/ipvs-sysctl.rst @@ -29,6 +29,33 @@ backup_only - BOOLEAN If set, disable the director function while the server is in backup mode to avoid packet loops for DR/TUN methods. +conn_lfactor - INTEGER + Possible values: -8 (larger table) .. 8 (smaller table) + + Default: -4 + + Controls the sizing of the connection hash table based on the + load factor (number of connections per table buckets): + + 2^conn_lfactor = nodes / buckets + + As result, the table grows if load increases and shrinks when + load decreases in the range of 2^8 - 2^conn_tab_bits (module + parameter). + The value is a shift count where negative values select + buckets = (connection hash nodes << -value) while positive + values select buckets = (connection hash nodes >> value). The + negative values reduce the collisions and reduce the time for + lookups but increase the table size. Positive values will + tolerate load above 100% when using smaller table is + preferred with the cost of more collisions. If using NAT + connections consider decreasing the value with one because + they add two nodes in the hash table. + + Example: + -4: grow if load goes above 6% (buckets = nodes * 16) + 2: grow if load goes above 400% (buckets = nodes / 4) + conn_reuse_mode - INTEGER 1 - default @@ -219,6 +246,16 @@ secure_tcp - INTEGER The value definition is the same as that of drop_entry and drop_packet. +svc_lfactor - INTEGER + Possible values: -8 (larger table) .. 8 (smaller table) + + Default: -3 + + Controls the sizing of the service hash table based on the + load factor (number of services per table buckets). The table + will grow and shrink in the range of 2^4 - 2^20. + See conn_lfactor for explanation. + sync_threshold - vector of 2 INTEGERs: sync_threshold, sync_period default 3 50 diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index 8d65ffbf57de..b39417ad955e 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -16,9 +16,6 @@ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp; #ifdef CONFIG_NF_CT_PROTO_SCTP extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp; #endif -#ifdef CONFIG_NF_CT_PROTO_UDPLITE -extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite; -#endif #ifdef CONFIG_NF_CT_PROTO_GRE extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre; #endif diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index cd5020835a6d..fde2427ceb8f 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -107,11 +107,6 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); -int nf_conntrack_udplite_packet(struct nf_conn *ct, - struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state); int nf_conntrack_tcp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, @@ -139,8 +134,6 @@ void nf_conntrack_icmpv6_init_net(struct net *net); /* Existing built-in generic protocol */ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic; -#define MAX_NF_CT_PROTO IPPROTO_UDPLITE - const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto); /* Generic netlink helpers */ diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c index da69a27e8332..bbb684f9964c 100644 --- a/net/ipv6/netfilter/ip6t_eui64.c +++ b/net/ipv6/netfilter/ip6t_eui64.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -21,8 +22,10 @@ eui64_mt6(const struct sk_buff *skb, struct xt_action_param *par) { unsigned char eui64[8]; - if (!(skb_mac_header(skb) >= skb->head && - skb_mac_header(skb) + ETH_HLEN <= skb->data)) { + if (!skb->dev || skb->dev->type != ARPHRD_ETHER) + return false; + + if (!skb_mac_header_was_set(skb) || skb_mac_header_len(skb) < ETH_HLEN) { par->hotdrop = true; return false; } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index f3ea0cb26f36..682c675125fc 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -209,17 +209,6 @@ config NF_CT_PROTO_SCTP If unsure, say Y. -config NF_CT_PROTO_UDPLITE - bool 'UDP-Lite protocol connection tracking support' - depends on NETFILTER_ADVANCED - default y - help - With this option enabled, the layer 3 independent connection - tracking code will be able to do state tracking on UDP-Lite - connections. - - If unsure, say Y. - config NF_CONNTRACK_AMANDA tristate "Amanda backup protocol support" depends on NETFILTER_ADVANCED diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 2c625e0f49ec..752f59ef8744 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -220,8 +221,8 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, return -IPSET_ERR_BITMAP_RANGE; /* Backward compatibility: we don't check the second flag */ - if (skb_mac_header(skb) < skb->head || - (skb_mac_header(skb) + ETH_HLEN) > skb->data) + if (!skb->dev || skb->dev->type != ARPHRD_ETHER || + !skb_mac_header_was_set(skb) || skb_mac_header_len(skb) < ETH_HLEN) return -EINVAL; e.id = ip_to_id(map, ip); diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c index 467c59a83c0a..b9a2681e2488 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmac.c +++ b/net/netfilter/ipset/ip_set_hash_ipmac.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -89,8 +90,8 @@ hash_ipmac4_kadt(struct ip_set *set, const struct sk_buff *skb, struct hash_ipmac4_elem e = { .ip = 0, { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - if (skb_mac_header(skb) < skb->head || - (skb_mac_header(skb) + ETH_HLEN) > skb->data) + if (!skb->dev || skb->dev->type != ARPHRD_ETHER || + !skb_mac_header_was_set(skb) || skb_mac_header_len(skb) < ETH_HLEN) return -EINVAL; if (opt->flags & IPSET_DIM_TWO_SRC) @@ -205,8 +206,8 @@ hash_ipmac6_kadt(struct ip_set *set, const struct sk_buff *skb, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - if (skb_mac_header(skb) < skb->head || - (skb_mac_header(skb) + ETH_HLEN) > skb->data) + if (!skb->dev || skb->dev->type != ARPHRD_ETHER || + !skb_mac_header_was_set(skb) || skb_mac_header_len(skb) < ETH_HLEN) return -EINVAL; if (opt->flags & IPSET_DIM_TWO_SRC) diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 718814730acf..41a122591fe2 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -77,8 +78,8 @@ hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb, struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); - if (skb_mac_header(skb) < skb->head || - (skb_mac_header(skb) + ETH_HLEN) > skb->data) + if (!skb->dev || skb->dev->type != ARPHRD_ETHER || + !skb_mac_header_was_set(skb) || skb_mac_header_len(skb) < ETH_HLEN) return -EINVAL; if (opt->flags & IPSET_DIM_ONE_SRC) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index a1f070cb76c3..6632daa87ded 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -281,6 +281,20 @@ unlock: mutex_unlock(&ipvs->est_mutex); } +static int get_conn_tab_size(struct netns_ipvs *ipvs) +{ + const struct ip_vs_rht *t; + int size = 0; + + rcu_read_lock(); + t = rcu_dereference(ipvs->conn_tab); + if (t) + size = t->size; + rcu_read_unlock(); + + return size; +} + int ip_vs_use_count_inc(void) { @@ -2431,6 +2445,60 @@ static int ipvs_proc_run_estimation(const struct ctl_table *table, int write, return ret; } +static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val = *valp; + int ret; + + struct ctl_table tmp_table = { + .data = &val, + .maxlen = sizeof(int), + }; + + ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (write && ret >= 0) { + if (val < -8 || val > 8) { + ret = -EINVAL; + } else { + *valp = val; + if (rcu_access_pointer(ipvs->conn_tab)) + mod_delayed_work(system_unbound_wq, + &ipvs->conn_resize_work, 0); + } + } + return ret; +} + +static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val = *valp; + int ret; + + struct ctl_table tmp_table = { + .data = &val, + .maxlen = sizeof(int), + }; + + ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (write && ret >= 0) { + if (val < -8 || val > 8) { + ret = -EINVAL; + } else { + *valp = val; + if (rcu_access_pointer(ipvs->svc_table)) + mod_delayed_work(system_unbound_wq, + &ipvs->svc_resize_work, 0); + } + } + return ret; +} + /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) * Do not change order or insert new entries without @@ -2619,6 +2687,18 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = ipvs_proc_est_nice, }, + { + .procname = "conn_lfactor", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ipvs_proc_conn_lfactor, + }, + { + .procname = "svc_lfactor", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ipvs_proc_svc_lfactor, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -2741,10 +2821,13 @@ static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) static int ip_vs_info_seq_show(struct seq_file *seq, void *v) { + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + if (v == SEQ_START_TOKEN) { seq_printf(seq, "IP Virtual Server version %d.%d.%d (size=%d)\n", - NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); + NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs)); seq_puts(seq, "Prot LocalAddress:Port Scheduler Flags\n"); seq_puts(seq, @@ -2907,6 +2990,144 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) return 0; } + +static int ip_vs_status_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_single_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + unsigned int resched_score = 0; + struct ip_vs_conn_hnode *hn; + struct hlist_bl_head *head; + struct ip_vs_service *svc; + struct ip_vs_rht *t, *pt; + struct hlist_bl_node *e; + int old_gen, new_gen; + u32 counts[8]; + u32 bucket; + int count; + u32 sum1; + u32 sum; + int i; + + rcu_read_lock(); + + t = rcu_dereference(ipvs->conn_tab); + + seq_printf(seq, "Conns:\t%d\n", atomic_read(&ipvs->conn_count)); + seq_printf(seq, "Conn buckets:\t%d (%d bits, lfactor %d)\n", + t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0); + + if (!atomic_read(&ipvs->conn_count)) + goto after_conns; + old_gen = atomic_read(&ipvs->conn_tab_changes); + +repeat_conn: + smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ + memset(counts, 0, sizeof(counts)); + ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { + for (bucket = 0; bucket < t->size; bucket++) { + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + + count = 0; + resched_score++; + ip_vs_rht_walk_bucket_rcu(t, bucket, head) { + count = 0; + hlist_bl_for_each_entry_rcu(hn, e, head, node) + count++; + } + resched_score += count; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + new_gen = atomic_read(&ipvs->conn_tab_changes); + /* New table installed ? */ + if (old_gen != new_gen) { + old_gen = new_gen; + goto repeat_conn; + } + } + counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++; + } + } + for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) + sum += counts[i]; + sum1 = sum - counts[0]; + seq_printf(seq, "Conn buckets empty:\t%u (%lu%%)\n", + counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U)); + for (i = 1; i < ARRAY_SIZE(counts); i++) { + if (!counts[i]) + continue; + seq_printf(seq, "Conn buckets len-%d:\t%u (%lu%%)\n", + i, counts[i], + (unsigned long)counts[i] * 100 / max(sum1, 1U)); + } + +after_conns: + t = rcu_dereference(ipvs->svc_table); + + count = ip_vs_get_num_services(ipvs); + seq_printf(seq, "Services:\t%d\n", count); + seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n", + t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0); + + if (!count) + goto after_svc; + old_gen = atomic_read(&ipvs->svc_table_changes); + +repeat_svc: + smp_rmb(); /* ipvs->svc_table and svc_table_changes */ + memset(counts, 0, sizeof(counts)); + ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, pt) { + for (bucket = 0; bucket < t->size; bucket++) { + DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); + + count = 0; + resched_score++; + ip_vs_rht_walk_bucket_rcu(t, bucket, head) { + count = 0; + hlist_bl_for_each_entry_rcu(svc, e, head, + s_list) + count++; + } + resched_score += count; + if (resched_score >= 100) { + resched_score = 0; + cond_resched_rcu(); + new_gen = atomic_read(&ipvs->svc_table_changes); + /* New table installed ? */ + if (old_gen != new_gen) { + old_gen = new_gen; + goto repeat_svc; + } + } + counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++; + } + } + for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) + sum += counts[i]; + sum1 = sum - counts[0]; + seq_printf(seq, "Service buckets empty:\t%u (%lu%%)\n", + counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U)); + for (i = 1; i < ARRAY_SIZE(counts); i++) { + if (!counts[i]) + continue; + seq_printf(seq, "Service buckets len-%d:\t%u (%lu%%)\n", + i, counts[i], + (unsigned long)counts[i] * 100 / max(sum1, 1U)); + } + +after_svc: + seq_printf(seq, "Stats thread slots:\t%d (max %lu)\n", + ipvs->est_kt_count, ipvs->est_max_threads); + seq_printf(seq, "Stats chain max len:\t%d\n", ipvs->est_chain_max); + seq_printf(seq, "Stats thread ests:\t%d\n", + ipvs->est_chain_max * IPVS_EST_CHAIN_FACTOR * + IPVS_EST_NTICKS); + + rcu_read_unlock(); + return 0; +} + #endif /* @@ -3425,7 +3646,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) char buf[64]; sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", - NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); + NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs)); if (copy_to_user(user, buf, strlen(buf)+1) != 0) { ret = -EFAULT; goto out; @@ -3437,8 +3658,9 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IP_VS_SO_GET_INFO: { struct ip_vs_getinfo info; + info.version = IP_VS_VERSION_CODE; - info.size = ip_vs_conn_tab_size; + info.size = get_conn_tab_size(ipvs); info.num_services = atomic_read(&ipvs->num_services[IP_VS_AF_INET]); if (copy_to_user(user, &info, sizeof(info)) != 0) @@ -4447,7 +4669,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE) || nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, - ip_vs_conn_tab_size)) + get_conn_tab_size(ipvs))) goto nla_put_failure; break; } @@ -4697,6 +4919,16 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_nice; + if (unpriv) + tbl[idx].mode = 0444; + tbl[idx].extra2 = ipvs; + tbl[idx++].data = &ipvs->sysctl_conn_lfactor; + + if (unpriv) + tbl[idx].mode = 0444; + tbl[idx].extra2 = ipvs; + tbl[idx++].data = &ipvs->sysctl_svc_lfactor; + #ifdef CONFIG_IP_VS_DEBUG /* Global sysctls must be ro in non-init netns */ if (!net_eq(net, &init_net)) @@ -4807,6 +5039,9 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) ipvs->net->proc_net, ip_vs_stats_percpu_show, NULL)) goto err_percpu; + if (!proc_create_net_single("ip_vs_status", 0, ipvs->net->proc_net, + ip_vs_status_show, NULL)) + goto err_status; #endif ret = ip_vs_control_net_init_sysctl(ipvs); @@ -4817,6 +5052,9 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) err: #ifdef CONFIG_PROC_FS + remove_proc_entry("ip_vs_status", ipvs->net->proc_net); + +err_status: remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); err_percpu: @@ -4842,6 +5080,7 @@ void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) ip_vs_control_net_cleanup_sysctl(ipvs); cancel_delayed_work_sync(&ipvs->est_reload_work); #ifdef CONFIG_PROC_FS + remove_proc_entry("ip_vs_status", ipvs->net->proc_net); remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); remove_proc_entry("ip_vs", ipvs->net->proc_net); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 27ce5fda8993..b08189226320 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -323,9 +323,6 @@ nf_ct_get_tuple(const struct sk_buff *skb, #endif case IPPROTO_TCP: case IPPROTO_UDP: -#ifdef CONFIG_NF_CT_PROTO_UDPLITE - case IPPROTO_UDPLITE: -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP case IPPROTO_SCTP: #endif @@ -1987,11 +1984,6 @@ static int nf_conntrack_handle_packet(struct nf_conn *ct, case IPPROTO_ICMPV6: return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state); #endif -#ifdef CONFIG_NF_CT_PROTO_UDPLITE - case IPPROTO_UDPLITE: - return nf_conntrack_udplite_packet(ct, skb, dataoff, - ctinfo, state); -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP case IPPROTO_SCTP: return nf_conntrack_sctp_packet(ct, skb, dataoff, diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index bc1d96686b9c..50ddd3d613e1 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -103,9 +103,6 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto) #ifdef CONFIG_NF_CT_PROTO_SCTP case IPPROTO_SCTP: return &nf_conntrack_l4proto_sctp; #endif -#ifdef CONFIG_NF_CT_PROTO_UDPLITE - case IPPROTO_UDPLITE: return &nf_conntrack_l4proto_udplite; -#endif #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: return &nf_conntrack_l4proto_gre; #endif diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 0030fbe8885c..cc9b7e5e1935 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -129,91 +129,6 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, return NF_ACCEPT; } -#ifdef CONFIG_NF_CT_PROTO_UDPLITE -static void udplite_error_log(const struct sk_buff *skb, - const struct nf_hook_state *state, - const char *msg) -{ - nf_l4proto_log_invalid(skb, state, IPPROTO_UDPLITE, "%s", msg); -} - -static bool udplite_error(struct sk_buff *skb, - unsigned int dataoff, - const struct nf_hook_state *state) -{ - unsigned int udplen = skb->len - dataoff; - const struct udphdr *hdr; - struct udphdr _hdr; - unsigned int cscov; - - /* Header is too small? */ - hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); - if (!hdr) { - udplite_error_log(skb, state, "short packet"); - return true; - } - - cscov = ntohs(hdr->len); - if (cscov == 0) { - cscov = udplen; - } else if (cscov < sizeof(*hdr) || cscov > udplen) { - udplite_error_log(skb, state, "invalid checksum coverage"); - return true; - } - - /* UDPLITE mandates checksums */ - if (!hdr->check) { - udplite_error_log(skb, state, "checksum missing"); - return true; - } - - /* Checksum invalid? Ignore. */ - if (state->hook == NF_INET_PRE_ROUTING && - state->net->ct.sysctl_checksum && - nf_checksum_partial(skb, state->hook, dataoff, cscov, IPPROTO_UDP, - state->pf)) { - udplite_error_log(skb, state, "bad checksum"); - return true; - } - - return false; -} - -/* Returns verdict for packet, and may modify conntracktype */ -int nf_conntrack_udplite_packet(struct nf_conn *ct, - struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state) -{ - unsigned int *timeouts; - - if (udplite_error(skb, dataoff, state)) - return -NF_ACCEPT; - - timeouts = nf_ct_timeout_lookup(ct); - if (!timeouts) - timeouts = udp_get_timeouts(nf_ct_net(ct)); - - /* If we've seen traffic both ways, this is some kind of UDP - stream. Extend timeout. */ - if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - nf_ct_refresh_acct(ct, ctinfo, skb, - timeouts[UDP_CT_REPLIED]); - - if (unlikely((ct->status & IPS_NAT_CLASH))) - return NF_ACCEPT; - - /* Also, more likely to be important, and not a probe */ - if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) - nf_conntrack_event_cache(IPCT_ASSURED, ct); - } else { - nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]); - } - return NF_ACCEPT; -} -#endif - #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include @@ -299,26 +214,3 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp = }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; - -#ifdef CONFIG_NF_CT_PROTO_UDPLITE -const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite = -{ - .l4proto = IPPROTO_UDPLITE, - .allow_clash = true, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, - .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, - .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, - .nla_policy = nf_ct_port_nla_policy, -#endif -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - .ctnl_timeout = { - .nlattr_to_obj = udp_timeout_nlattr_to_obj, - .obj_to_nlattr = udp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_UDP_MAX, - .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, - .nla_policy = udp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ -}; -#endif diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 207b240b14e5..be2953c7d702 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -61,7 +61,6 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, ntohs(tuple->src.u.tcp.port), ntohs(tuple->dst.u.tcp.port)); break; - case IPPROTO_UDPLITE: case IPPROTO_UDP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.udp.port), @@ -277,7 +276,6 @@ static const char* l4proto_name(u16 proto) case IPPROTO_UDP: return "udp"; case IPPROTO_GRE: return "gre"; case IPPROTO_SCTP: return "sctp"; - case IPPROTO_UDPLITE: return "udplite"; case IPPROTO_ICMPV6: return "icmpv6"; } diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index 0507d67cad27..7a8952b049d1 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -78,7 +78,10 @@ dump_arp_packet(struct nf_log_buf *m, else logflags = NF_LOG_DEFAULT_MASK; - if (logflags & NF_LOG_MACDECODE) { + if ((logflags & NF_LOG_MACDECODE) && + skb->dev && skb->dev->type == ARPHRD_ETHER && + skb_mac_header_was_set(skb) && + skb_mac_header_len(skb) >= ETH_HLEN) { nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); nf_log_dump_vlan(m, skb); @@ -797,6 +800,9 @@ static void dump_mac_header(struct nf_log_buf *m, switch (dev->type) { case ARPHRD_ETHER: + if (!skb_mac_header_was_set(skb) || skb_mac_header_len(skb) < ETH_HLEN) + return; + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); nf_log_dump_vlan(m, skb); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 3b5434e4ec9c..83b2b5e9759a 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -68,7 +68,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb, fl4->daddr = t->dst.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || - t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_dport = t->dst.u.all; } @@ -79,7 +78,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb, fl4->saddr = t->src.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || - t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_sport = t->src.u.all; } @@ -99,7 +97,6 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb, fl6->daddr = t->dst.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || - t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_dport = t->dst.u.all; } @@ -110,7 +107,6 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb, fl6->saddr = t->src.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || - t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_sport = t->src.u.all; } @@ -415,7 +411,6 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, case IPPROTO_GRE: /* all fall though */ case IPPROTO_TCP: case IPPROTO_UDP: - case IPPROTO_UDPLITE: case IPPROTO_SCTP: if (maniptype == NF_NAT_MANIP_SRC) port = tuple->src.u.all; @@ -612,7 +607,6 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, goto find_free_id; #endif case IPPROTO_UDP: - case IPPROTO_UDPLITE: case IPPROTO_TCP: case IPPROTO_SCTP: if (maniptype == NF_NAT_MANIP_SRC) diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 97c0f841fc96..07f51fe75fbe 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -79,23 +79,6 @@ static bool udp_manip_pkt(struct sk_buff *skb, return true; } -static bool udplite_manip_pkt(struct sk_buff *skb, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ -#ifdef CONFIG_NF_CT_PROTO_UDPLITE - struct udphdr *hdr; - - if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) - return false; - - hdr = (struct udphdr *)(skb->data + hdroff); - __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, true); -#endif - return true; -} - static bool sctp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, @@ -287,9 +270,6 @@ static bool l4proto_manip_pkt(struct sk_buff *skb, case IPPROTO_UDP: return udp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); - case IPPROTO_UDPLITE: - return udplite_manip_pkt(skb, iphdroff, hdroff, - tuple, maniptype); case IPPROTO_SCTP: return sctp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index fd8652aa7e88..dca6826af7de 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -457,7 +457,6 @@ static int cttimeout_default_get(struct sk_buff *skb, timeouts = nf_tcp_pernet(info->net)->timeouts; break; case IPPROTO_UDP: - case IPPROTO_UDPLITE: timeouts = nf_udp_pernet(info->net)->timeouts; break; case IPPROTO_ICMPV6: diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index b2c24cb919d4..2439cbbd5b26 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -401,7 +401,7 @@ nfulnl_timer(struct timer_list *t) static u32 nfulnl_get_bridge_size(const struct sk_buff *skb) { - u32 size = 0; + u32 mac_len, size = 0; if (!skb_mac_header_was_set(skb)) return 0; @@ -412,14 +412,17 @@ static u32 nfulnl_get_bridge_size(const struct sk_buff *skb) size += nla_total_size(sizeof(u16)); /* tag */ } - if (skb->network_header > skb->mac_header) - size += nla_total_size(skb->network_header - skb->mac_header); + mac_len = skb_mac_header_len(skb); + if (mac_len > 0) + size += nla_total_size(mac_len); return size; } static int nfulnl_put_bridge(struct nfulnl_instance *inst, const struct sk_buff *skb) { + u32 mac_len; + if (!skb_mac_header_was_set(skb)) return 0; @@ -437,12 +440,10 @@ static int nfulnl_put_bridge(struct nfulnl_instance *inst, const struct sk_buff nla_nest_end(inst->skb, nest); } - if (skb->mac_header < skb->network_header) { - int len = (int)(skb->network_header - skb->mac_header); - - if (nla_put(inst->skb, NFULA_L2HDR, len, skb_mac_header(skb))) - goto nla_put_failure; - } + mac_len = skb_mac_header_len(skb); + if (mac_len > 0 && + nla_put(inst->skb, NFULA_L2HDR, mac_len, skb_mac_header(skb))) + goto nla_put_failure; return 0; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index c7ee6f6ff725..58304fd1f70f 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -579,6 +579,7 @@ static u32 nfqnl_get_bridge_size(struct nf_queue_entry *entry) { struct sk_buff *entskb = entry->skb; u32 nlalen = 0; + u32 mac_len; if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb)) return 0; @@ -587,9 +588,9 @@ static u32 nfqnl_get_bridge_size(struct nf_queue_entry *entry) nlalen += nla_total_size(nla_total_size(sizeof(__be16)) + nla_total_size(sizeof(__be16))); - if (entskb->network_header > entskb->mac_header) - nlalen += nla_total_size((entskb->network_header - - entskb->mac_header)); + mac_len = skb_mac_header_len(entskb); + if (mac_len > 0) + nlalen += nla_total_size(mac_len); return nlalen; } @@ -597,6 +598,7 @@ static u32 nfqnl_get_bridge_size(struct nf_queue_entry *entry) static int nfqnl_put_bridge(struct nf_queue_entry *entry, struct sk_buff *skb) { struct sk_buff *entskb = entry->skb; + u32 mac_len; if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb)) return 0; @@ -615,12 +617,10 @@ static int nfqnl_put_bridge(struct nf_queue_entry *entry, struct sk_buff *skb) nla_nest_end(skb, nest); } - if (entskb->mac_header < entskb->network_header) { - int len = (int)(entskb->network_header - entskb->mac_header); - - if (nla_put(skb, NFQA_L2HDR, len, skb_mac_header(entskb))) - goto nla_put_failure; - } + mac_len = skb_mac_header_len(entskb); + if (mac_len > 0 && + nla_put(skb, NFQA_L2HDR, mac_len, skb_mac_header(entskb))) + goto nla_put_failure; return 0; @@ -1004,13 +1004,13 @@ nf_queue_entry_dup(struct nf_queue_entry *e) static void nf_bridge_adjust_skb_data(struct sk_buff *skb) { if (nf_bridge_info_get(skb)) - __skb_push(skb, skb->network_header - skb->mac_header); + __skb_push(skb, skb_mac_header_len(skb)); } static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) { if (nf_bridge_info_get(skb)) - __skb_pull(skb, skb->network_header - skb->mac_header); + __skb_pull(skb, skb_mac_header_len(skb)); } #else #define nf_bridge_adjust_skb_data(s) do {} while (0) @@ -1469,8 +1469,7 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry, } if (nfqa[NFQA_L2HDR]) { - int mac_header_len = entry->skb->network_header - - entry->skb->mac_header; + u32 mac_header_len = skb_mac_header_len(entry->skb); if (mac_header_len != nla_len(nfqa[NFQA_L2HDR])) return -EINVAL; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 425525b90ac9..60ee8d932fcb 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -1252,7 +1252,6 @@ static int nft_ct_expect_obj_init(const struct nft_ctx *ctx, switch (priv->l4proto) { case IPPROTO_TCP: case IPPROTO_UDP: - case IPPROTO_UDPLITE: case IPPROTO_DCCP: case IPPROTO_SCTP: break; diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index ad48dcd45abe..4bce36c3a6a0 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -116,6 +116,11 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, goto out; } iph = ip_hdr(skb); + if (iph->ttl <= 1) { + verdict = NF_DROP; + goto out; + } + ip_decrease_ttl(iph); neigh_table = NEIGH_ARP_TABLE; break; @@ -132,6 +137,11 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, goto out; } ip6h = ipv6_hdr(skb); + if (ip6h->hop_limit <= 1) { + verdict = NF_DROP; + goto out; + } + ip6h->hop_limit--; neigh_table = NEIGH_ND_TABLE; break; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index b39017c80548..9f837fb5ceb4 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -819,13 +819,17 @@ EXPORT_SYMBOL_GPL(xt_compat_match_to_user); /* non-compat version may have padding after verdict */ struct compat_xt_standard_target { - struct compat_xt_entry_target t; - compat_uint_t verdict; + /* Must be last as it ends in a flexible-array member. */ + TRAILING_OVERLAP(struct compat_xt_entry_target, t, data, + compat_uint_t verdict; + ); }; struct compat_xt_error_target { - struct compat_xt_entry_target t; - char errorname[XT_FUNCTION_MAXNAMELEN]; + /* Must be last as it ends in a flexible-array member. */ + TRAILING_OVERLAP(struct compat_xt_entry_target, t, data, + char errorname[XT_FUNCTION_MAXNAMELEN]; + ); }; int xt_compat_check_entry_offsets(const void *base, const char *elems, diff --git a/net/netfilter/xt_hl.c b/net/netfilter/xt_hl.c index c1a70f8f0441..4a12a757ecbf 100644 --- a/net/netfilter/xt_hl.c +++ b/net/netfilter/xt_hl.c @@ -6,6 +6,7 @@ * Hop Limit matching module * (C) 2001-2002 Maciej Soltysiak */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include @@ -22,6 +23,18 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_ttl"); MODULE_ALIAS("ip6t_hl"); +static int ttl_mt_check(const struct xt_mtchk_param *par) +{ + const struct ipt_ttl_info *info = par->matchinfo; + + if (info->mode > IPT_TTL_GT) { + pr_err("Unknown TTL match mode: %d\n", info->mode); + return -EINVAL; + } + + return 0; +} + static bool ttl_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct ipt_ttl_info *info = par->matchinfo; @@ -41,6 +54,18 @@ static bool ttl_mt(const struct sk_buff *skb, struct xt_action_param *par) return false; } +static int hl_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_hl_info *info = par->matchinfo; + + if (info->mode > IP6T_HL_GT) { + pr_err("Unknown Hop Limit match mode: %d\n", info->mode); + return -EINVAL; + } + + return 0; +} + static bool hl_mt6(const struct sk_buff *skb, struct xt_action_param *par) { const struct ip6t_hl_info *info = par->matchinfo; @@ -65,6 +90,7 @@ static struct xt_match hl_mt_reg[] __read_mostly = { .name = "ttl", .revision = 0, .family = NFPROTO_IPV4, + .checkentry = ttl_mt_check, .match = ttl_mt, .matchsize = sizeof(struct ipt_ttl_info), .me = THIS_MODULE, @@ -73,6 +99,7 @@ static struct xt_match hl_mt_reg[] __read_mostly = { .name = "hl", .revision = 0, .family = NFPROTO_IPV6, + .checkentry = hl_mt6_check, .match = hl_mt6, .matchsize = sizeof(struct ip6t_hl_info), .me = THIS_MODULE, diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c index 81649da57ba5..4798cd2ca26e 100644 --- a/net/netfilter/xt_mac.c +++ b/net/netfilter/xt_mac.c @@ -29,9 +29,7 @@ static bool mac_mt(const struct sk_buff *skb, struct xt_action_param *par) if (skb->dev == NULL || skb->dev->type != ARPHRD_ETHER) return false; - if (skb_mac_header(skb) < skb->head) - return false; - if (skb_mac_header(skb) + ETH_HLEN > skb->data) + if (!skb_mac_header_was_set(skb) || skb_mac_header_len(skb) < ETH_HLEN) return false; ret = ether_addr_equal(eth_hdr(skb)->h_source, info->srcaddr); ret ^= info->invert; diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index 343e65f377d4..53997771013f 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -107,6 +107,28 @@ static int physdev_mt_check(const struct xt_mtchk_param *par) return -EINVAL; } +#define X(memb) strnlen(info->memb, sizeof(info->memb)) >= sizeof(info->memb) + if (info->bitmask & XT_PHYSDEV_OP_IN) { + if (info->physindev[0] == '\0') + return -EINVAL; + if (X(physindev)) + return -ENAMETOOLONG; + } + + if (info->bitmask & XT_PHYSDEV_OP_OUT) { + if (info->physoutdev[0] == '\0') + return -EINVAL; + + if (X(physoutdev)) + return -ENAMETOOLONG; + } + + if (X(in_mask)) + return -ENAMETOOLONG; + if (X(out_mask)) + return -ENAMETOOLONG; +#undef X + if (!brnf_probed) { brnf_probed = true; request_module("br_netfilter"); diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 76e01f292aaf..811e53bee408 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -168,52 +168,41 @@ static int socket_mt_enable_defrag(struct net *net, int family) static int socket_mt_v1_check(const struct xt_mtchk_param *par) { const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; - int err; - - err = socket_mt_enable_defrag(par->net, par->family); - if (err) - return err; if (info->flags & ~XT_SOCKET_FLAGS_V1) { pr_info_ratelimited("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1); return -EINVAL; } - return 0; + + return socket_mt_enable_defrag(par->net, par->family); } static int socket_mt_v2_check(const struct xt_mtchk_param *par) { const struct xt_socket_mtinfo2 *info = (struct xt_socket_mtinfo2 *) par->matchinfo; - int err; - - err = socket_mt_enable_defrag(par->net, par->family); - if (err) - return err; if (info->flags & ~XT_SOCKET_FLAGS_V2) { pr_info_ratelimited("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2); return -EINVAL; } - return 0; + + return socket_mt_enable_defrag(par->net, par->family); } static int socket_mt_v3_check(const struct xt_mtchk_param *par) { const struct xt_socket_mtinfo3 *info = (struct xt_socket_mtinfo3 *)par->matchinfo; - int err; - err = socket_mt_enable_defrag(par->net, par->family); - if (err) - return err; if (info->flags & ~XT_SOCKET_FLAGS_V3) { pr_info_ratelimited("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V3); return -EINVAL; } - return 0; + + return socket_mt_enable_defrag(par->net, par->family); } static void socket_mt_destroy(const struct xt_mtdtor_param *par)