From 7ce875e5ecb8562fd44040f69bda96c999e38bbc Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 30 Nov 2014 22:22:33 -0500 Subject: ipv4: warn once on passing AF_INET6 socket to ip_recv_error One line change, in response to catching an occurrence of this bug. See also fix f4713a3dfad0 ("net-timestamp: make tcp_recvmsg call ...") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index b782657..59eba6c 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -414,6 +414,8 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) int err; int copied; + WARN_ON_ONCE(sk->sk_family == AF_INET6); + err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (skb == NULL) -- cgit v0.10.2 From 829ae9d611651467fe6cd7be834bd33ca6b28dfe Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 30 Nov 2014 22:22:34 -0500 Subject: net-timestamp: allow reading recv cmsg on errqueue with origin tstamp Allow reading of timestamps and cmsg at the same time on all relevant socket families. One use is to correlate timestamps with egress device, by asking for cmsg IP_PKTINFO. on AF_INET sockets, call the relevant function (ip_cmsg_recv). To avoid changing legacy expectations, only do so if the caller sets a new timestamping flag SOF_TIMESTAMPING_OPT_CMSG. on AF_INET6 sockets, IPV6_PKTINFO and all other recv cmsg are already returned for all origins. only change is to set ifindex, which is not initialized for all error origins. In both cases, only generate the pktinfo message if an ifindex is known. This is not the case for ACK timestamps. The difference between the protocol families is probably a historical accident as a result of the different conditions for generating cmsg in the relevant ip(v6)_recv_error function: ipv4: if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { ipv6: if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { At one time, this was the same test bar for the ICMP/ICMP6 distinction. This is no longer true. Signed-off-by: Willem de Bruijn ---- Changes v1 -> v2 large rewrite - integrate with existing pktinfo cmsg generation code - on ipv4: only send with new flag, to maintain legacy behavior - on ipv6: send at most a single pktinfo cmsg - on ipv6: initialize fields if not yet initialized The recv cmsg interfaces are also relevant to the discussion of whether looping packet headers is problematic. For v6, cmsgs that identify many headers are already returned. This patch expands that to v4. If it sounds reasonable, I will follow with patches 1. request timestamps without payload with SOF_TIMESTAMPING_OPT_TSONLY (http://patchwork.ozlabs.org/patch/366967/) 2. sysctl to conditionally drop all timestamps that have payload or cmsg from users without CAP_NET_RAW. Signed-off-by: David S. Miller diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index 1d6d02d..b08e272 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt @@ -122,7 +122,7 @@ SOF_TIMESTAMPING_RAW_HARDWARE: 1.3.3 Timestamp Options -The interface supports one option +The interface supports the options SOF_TIMESTAMPING_OPT_ID: @@ -145,6 +145,16 @@ SOF_TIMESTAMPING_OPT_ID: stream sockets, it increments with every byte. +SOF_TIMESTAMPING_OPT_CMSG: + + Support recv() cmsg for all timestamped packets. Control messages + are already supported unconditionally on all packets with receive + timestamps and on IPv6 packets with transmit timestamp. This option + extends them to IPv4 packets with transmit timestamp. One use case + is to correlate packets with their egress device, by enabling socket + option IP_PKTINFO simultaneously. + + 1.4 Bytestream Timestamps The SO_TIMESTAMPING interface supports timestamping of bytes in a diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index ff35402..edbc888 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -23,8 +23,9 @@ enum { SOF_TIMESTAMPING_OPT_ID = (1<<7), SOF_TIMESTAMPING_TX_SCHED = (1<<8), SOF_TIMESTAMPING_TX_ACK = (1<<9), + SOF_TIMESTAMPING_OPT_CMSG = (1<<10), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_TX_ACK, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_CMSG, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 59eba6c..640f26c 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -399,6 +399,22 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf kfree_skb(skb); } +static bool ipv4_pktinfo_prepare_errqueue(const struct sock *sk, + const struct sk_buff *skb, + int ee_origin) +{ + struct in_pktinfo *info = PKTINFO_SKB_CB(skb); + + if ((ee_origin != SO_EE_ORIGIN_TIMESTAMPING) || + (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) || + (!skb->dev)) + return false; + + info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr; + info->ipi_ifindex = skb->dev->ifindex; + return true; +} + /* * Handle MSG_ERRQUEUE */ @@ -446,7 +462,9 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; sin->sin_family = AF_UNSPEC; - if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { + + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin)) { struct inet_sock *inet = inet_sk(sk); sin->sin_family = AF_INET; @@ -1051,7 +1069,7 @@ e_inval: } /** - * ipv4_pktinfo_prepare - transfert some info from rtable to skb + * ipv4_pktinfo_prepare - transfer some info from rtable to skb * @sk: socket * @skb: buffer * diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index cc11396..2464a00 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -325,6 +325,16 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) kfree_skb(skb); } +static void ip6_datagram_prepare_pktinfo_errqueue(struct sk_buff *skb) +{ + int ifindex = skb->dev ? skb->dev->ifindex : -1; + + if (skb->protocol == htons(ETH_P_IPV6)) + IP6CB(skb)->iif = ifindex; + else + PKTINFO_SKB_CB(skb)->ipi_ifindex = ifindex; +} + /* * Handle MSG_ERRQUEUE */ @@ -388,8 +398,12 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = 0; - if (np->rxopt.all) + if (np->rxopt.all) { + if (serr->ee.ee_origin != SO_EE_ORIGIN_ICMP && + serr->ee.ee_origin != SO_EE_ORIGIN_ICMP6) + ip6_datagram_prepare_pktinfo_errqueue(skb); ip6_datagram_recv_common_ctl(sk, msg, skb); + } if (skb->protocol == htons(ETH_P_IPV6)) { sin->sin6_addr = ipv6_hdr(skb)->saddr; if (np->rxopt.all) @@ -491,7 +505,10 @@ void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg, ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, &src_info.ipi6_addr); } - put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); + + if (src_info.ipi6_ifindex >= 0) + put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, + sizeof(src_info), &src_info); } } -- cgit v0.10.2 From cbd3aad5ce66f5a266a185aa37e0eb9be9ba4154 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 30 Nov 2014 22:22:35 -0500 Subject: net-timestamp: expand documentation and test Documentation: expand explanation of timestamp counter Test: new: flag -I requests and prints PKTINFO new: flag -x prints payload (possibly truncated) fix: remove pretty print that breaks common flag '-l 1' Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index b08e272..a5c784c 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt @@ -130,19 +130,26 @@ SOF_TIMESTAMPING_OPT_ID: have multiple concurrent timestamping requests outstanding. Packets can be reordered in the transmit path, for instance in the packet scheduler. In that case timestamps will be queued onto the error - queue out of order from the original send() calls. This option - embeds a counter that is incremented at send() time, to order - timestamps within a flow. + queue out of order from the original send() calls. It is not always + possible to uniquely match timestamps to the original send() calls + based on timestamp order or payload inspection alone, then. + + This option associates each packet at send() with a unique + identifier and returns that along with the timestamp. The identifier + is derived from a per-socket u32 counter (that wraps). For datagram + sockets, the counter increments with each sent packet. For stream + sockets, it increments with every byte. + + The counter starts at zero. It is initialized the first time that + the socket option is enabled. It is reset each time the option is + enabled after having been disabled. Resetting the counter does not + change the identifiers of existing packets in the system. This option is implemented only for transmit timestamps. There, the timestamp is always looped along with a struct sock_extended_err. The option modifies field ee_data to pass an id that is unique among all possibly concurrently outstanding timestamp requests for - that socket. In practice, it is a monotonically increasing u32 - (that wraps). - - In datagram sockets, the counter increments on each send call. In - stream sockets, it increments with every byte. + that socket. SOF_TIMESTAMPING_OPT_CMSG: diff --git a/Documentation/networking/timestamping/txtimestamp.c b/Documentation/networking/timestamping/txtimestamp.c index b32fc2a..876f71c 100644 --- a/Documentation/networking/timestamping/txtimestamp.c +++ b/Documentation/networking/timestamping/txtimestamp.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,14 @@ #include #include +/* ugly hack to work around netinet/in.h and linux/ipv6.h conflicts */ +#ifndef in6_pktinfo +struct in6_pktinfo { + struct in6_addr ipi6_addr; + int ipi6_ifindex; +}; +#endif + /* command line parameters */ static int cfg_proto = SOCK_STREAM; static int cfg_ipproto = IPPROTO_TCP; @@ -65,6 +74,8 @@ static int cfg_num_pkts = 4; static int do_ipv4 = 1; static int do_ipv6 = 1; static int cfg_payload_len = 10; +static bool cfg_show_payload; +static bool cfg_do_pktinfo; static uint16_t dest_port = 9000; static struct sockaddr_in daddr; @@ -131,6 +142,30 @@ static void print_timestamp(struct scm_timestamping *tss, int tstype, __print_timestamp(tsname, &tss->ts[0], tskey, payload_len); } +/* TODO: convert to check_and_print payload once API is stable */ +static void print_payload(char *data, int len) +{ + int i; + + if (len > 70) + len = 70; + + fprintf(stderr, "payload: "); + for (i = 0; i < len; i++) + fprintf(stderr, "%02hhx ", data[i]); + fprintf(stderr, "\n"); +} + +static void print_pktinfo(int family, int ifindex, void *saddr, void *daddr) +{ + char sa[INET6_ADDRSTRLEN], da[INET6_ADDRSTRLEN]; + + fprintf(stderr, " pktinfo: ifindex=%u src=%s dst=%s\n", + ifindex, + saddr ? inet_ntop(family, saddr, sa, sizeof(sa)) : "unknown", + daddr ? inet_ntop(family, daddr, da, sizeof(da)) : "unknown"); +} + static void __poll(int fd) { struct pollfd pollfd; @@ -156,10 +191,9 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int payload_len) cm->cmsg_type == SCM_TIMESTAMPING) { tss = (void *) CMSG_DATA(cm); } else if ((cm->cmsg_level == SOL_IP && - cm->cmsg_type == IP_RECVERR) || - (cm->cmsg_level == SOL_IPV6 && - cm->cmsg_type == IPV6_RECVERR)) { - + cm->cmsg_type == IP_RECVERR) || + (cm->cmsg_level == SOL_IPV6 && + cm->cmsg_type == IPV6_RECVERR)) { serr = (void *) CMSG_DATA(cm); if (serr->ee_errno != ENOMSG || serr->ee_origin != SO_EE_ORIGIN_TIMESTAMPING) { @@ -168,6 +202,16 @@ static void __recv_errmsg_cmsg(struct msghdr *msg, int payload_len) serr->ee_origin); serr = NULL; } + } else if (cm->cmsg_level == SOL_IP && + cm->cmsg_type == IP_PKTINFO) { + struct in_pktinfo *info = (void *) CMSG_DATA(cm); + print_pktinfo(AF_INET, info->ipi_ifindex, + &info->ipi_spec_dst, &info->ipi_addr); + } else if (cm->cmsg_level == SOL_IPV6 && + cm->cmsg_type == IPV6_PKTINFO) { + struct in6_pktinfo *info6 = (void *) CMSG_DATA(cm); + print_pktinfo(AF_INET6, info6->ipi6_ifindex, + NULL, &info6->ipi6_addr); } else fprintf(stderr, "unknown cmsg %d,%d\n", cm->cmsg_level, cm->cmsg_type); @@ -206,7 +250,11 @@ static int recv_errmsg(int fd) if (ret == -1 && errno != EAGAIN) error(1, errno, "recvmsg"); - __recv_errmsg_cmsg(&msg, ret); + if (ret > 0) { + __recv_errmsg_cmsg(&msg, ret); + if (cfg_show_payload) + print_payload(data, cfg_payload_len); + } free(data); return ret == -1; @@ -215,9 +263,9 @@ static int recv_errmsg(int fd) static void do_test(int family, unsigned int opt) { char *buf; - int fd, i, val, total_len; + int fd, i, val = 1, total_len; - if (family == IPPROTO_IPV6 && cfg_proto != SOCK_STREAM) { + if (family == AF_INET6 && cfg_proto != SOCK_STREAM) { /* due to lack of checksum generation code */ fprintf(stderr, "test: skipping datagram over IPv6\n"); return; @@ -239,7 +287,6 @@ static void do_test(int family, unsigned int opt) error(1, errno, "socket"); if (cfg_proto == SOCK_STREAM) { - val = 1; if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*) &val, sizeof(val))) error(1, 0, "setsockopt no nagle"); @@ -253,7 +300,20 @@ static void do_test(int family, unsigned int opt) } } + if (cfg_do_pktinfo) { + if (family == AF_INET6) { + if (setsockopt(fd, SOL_IPV6, IPV6_RECVPKTINFO, + &val, sizeof(val))) + error(1, errno, "setsockopt pktinfo ipv6"); + } else { + if (setsockopt(fd, SOL_IP, IP_PKTINFO, + &val, sizeof(val))) + error(1, errno, "setsockopt pktinfo ipv4"); + } + } + opt |= SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_CMSG | SOF_TIMESTAMPING_OPT_ID; if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, (char *) &opt, sizeof(opt))) @@ -262,8 +322,6 @@ static void do_test(int family, unsigned int opt) for (i = 0; i < cfg_num_pkts; i++) { memset(&ts_prev, 0, sizeof(ts_prev)); memset(buf, 'a' + i, total_len); - buf[total_len - 2] = '\n'; - buf[total_len - 1] = '\0'; if (cfg_proto == SOCK_RAW) { struct udphdr *udph; @@ -324,11 +382,13 @@ static void __attribute__((noreturn)) usage(const char *filepath) " -4: only IPv4\n" " -6: only IPv6\n" " -h: show this message\n" + " -I: request PKTINFO\n" " -l N: send N bytes at a time\n" " -r: use raw\n" " -R: use raw (IP_HDRINCL)\n" " -p N: connect to port N\n" - " -u: use udp\n", + " -u: use udp\n" + " -x: show payload (up to 70 bytes)\n", filepath); exit(1); } @@ -338,7 +398,7 @@ static void parse_opt(int argc, char **argv) int proto_count = 0; char c; - while ((c = getopt(argc, argv, "46hl:p:rRu")) != -1) { + while ((c = getopt(argc, argv, "46hIl:p:rRux")) != -1) { switch (c) { case '4': do_ipv6 = 0; @@ -346,6 +406,9 @@ static void parse_opt(int argc, char **argv) case '6': do_ipv4 = 0; break; + case 'I': + cfg_do_pktinfo = true; + break; case 'r': proto_count++; cfg_proto = SOCK_RAW; @@ -367,6 +430,9 @@ static void parse_opt(int argc, char **argv) case 'p': dest_port = strtoul(optarg, NULL, 10); break; + case 'x': + cfg_show_payload = true; + break; case 'h': default: usage(argv[0]); -- cgit v0.10.2