Re: [PATCH 1/1] hw/net: Added basic IPv6 fragmentation. Fixed IPv6 payload length. Fixed CSO for IPv6.

On Fri, May 29, 2020 at 12:11 PM Jason Wang <jasowang@redhat.com> wrote:

On 2020/5/8 上午3:25, andrew@daynix.com wrote:
> From: Andrew Melnychenko <andrew@daynix.com>
>
> Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1708065
> Overall, there was an issue that big frames of IPv6 doesn't sent.
> With network backend with 'virtual header' - there was an issue
> in 'plen' field. Overall, during TSO, 'plen' would be changed,
> but with 'vheader' this field should be set to the size of the
> payload itself instead of '0'.
> For software offload - there is added basic IPv6 fragmentation.

Please introduce a separate patch to do this.

> Also fixed checksum offload for IPv6.

And another patch for this.

> The basic IPv6 fragmentation - adding 'frag' extension to
> the packet, overall shares some logic with IPv4. It works,
> but there are still issues with a combination of
> extensions - in the future, it would require refactoring
> work to implement workflow with IPv6 and extension.

Did you mean the headroom might not be enough?

> e1000e driver doesn't set the 'plen' field for IPv6 for big packets
> if TSO is enabled. "Jumbo option" isn't added yet, until
> qemu supports packets greater than 64K.
>
> Signed-off-by: Andrew Melnychenko <andrew@daynix.com>
> ---
> hw/net/net_tx_pkt.c | 54 ++++++++++++++++++++++++---
> hw/net/net_tx_pkt.h | 7 ++++
> include/net/eth.h | 15 ++++++--
> net/eth.c | 89 ++++++++++++++++++++++++++++++++++++++++++---
> 4 files changed, 151 insertions(+), 14 deletions(-)
>
> diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c
> index 162f802dd7..895effecb9 100644
> --- a/hw/net/net_tx_pkt.c
> +++ b/hw/net/net_tx_pkt.c
> @@ -468,8 +468,8 @@ static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt)
> /* num of iovec without vhdr */
> uint32_t iov_len = pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - 1;
> uint16_t csl;
> - struct ip_header *iphdr;
> size_t csum_offset = pkt->virt_hdr.csum_start + pkt->virt_hdr.csum_offset;
> + uint16_t l3_proto = eth_get_l3_proto(iov, 1, iov->iov_len);
>
> /* Put zero to checksum field */
> iov_from_buf(iov, iov_len, csum_offset, &csum, sizeof csum);
> @@ -477,9 +477,18 @@ static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt)
> /* Calculate L4 TCP/UDP checksum */
> csl = pkt->payload_len;
>
> + csum_cntr = 0;
> + cso = 0;
> /* add pseudo header to csum */
> - iphdr = pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_base;
> - csum_cntr = eth_calc_ip4_pseudo_hdr_csum(iphdr, csl, &cso);
> + if (l3_proto == ETH_P_IP) {
> + csum_cntr = eth_calc_ip4_pseudo_hdr_csum(
> + pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_base,
> + csl, &cso);
> + } else if (l3_proto == ETH_P_IPV6) {
> + csum_cntr = eth_calc_ip6_pseudo_hdr_csum(
> + pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_base,
> + csl, pkt->l4proto, &cso);
> + }
>
> /* data checksum */
> csum_cntr +=
> @@ -580,10 +589,11 @@ static bool net_tx_pkt_do_sw_fragmentation(struct NetTxPkt *pkt,
>
> more_frags = (fragment_offset + fragment_len < pkt->payload_len);
>
> - eth_setup_ip4_fragmentation(l2_iov_base, l2_iov_len, l3_iov_base,
> - l3_iov_len, fragment_len, fragment_offset, more_frags);
> + eth_setup_ip_fragmentation(l2_iov_base, l2_iov_len, l3_iov_base,
> + &l3_iov_len, ETH_MAX_IP_DGRAM_LEN,
> + fragment_len, fragment_offset, more_frags);
>
> - eth_fix_ip4_checksum(l3_iov_base, l3_iov_len);
> + fragment[NET_TX_PKT_FRAGMENT_L3_HDR_POS].iov_len = l3_iov_len;
>
> net_tx_pkt_sendv(pkt, nc, fragment, dst_idx);
>
> @@ -617,6 +627,7 @@ bool net_tx_pkt_send(struct NetTxPkt *pkt, NetClientState *nc)
>
> if (pkt->has_virt_hdr ||
> pkt->virt_hdr.gso_type == VIRTIO_NET_HDR_GSO_NONE) {
> + net_tx_pkt_fix_ip6_payload_len(pkt);
> net_tx_pkt_sendv(pkt, nc, pkt->vec,
> pkt->payload_frags + NET_TX_PKT_PL_START_FRAG);
> return true;
> @@ -635,3 +646,34 @@ bool net_tx_pkt_send_loopback(struct NetTxPkt *pkt, NetClientState *nc)
>
> return res;
> }
> +
> +void net_tx_pkt_fix_ip6_payload_len(struct NetTxPkt *pkt)
> +{
> + /*
> + * If ipv6 payload length field is 0 - then there should be Hop-by-Hop
> + * option for packets greater than 65,535.
> + * For packets with payload less than 65,535: fix 'plen' field.
> + * For now, qemu drops every packet with size greater 64K
> + * (see net_tx_pkt_send()) so, there is no reason to add jumbo option to ip6
> + * hop-by-hop extension if it's missed
> + */
> +
> + struct iovec *l2 = &pkt->vec[NET_TX_PKT_L2HDR_FRAG];
> + if (eth_get_l3_proto(l2, 1, l2->iov_len) == ETH_P_IPV6) {
> + struct ip6_header *ip6 = (struct ip6_header *) pkt->l3_hdr;
> + /*
> + * TODO: if qemu would support >64K packets - add jumbo option check
> + * something like that:
> + * 'if (ip6->ip6_plen == 0 && !has_jumbo_option(ip6)) {'
> + */
> + if (ip6->ip6_plen == 0) {
> + if (pkt->payload_len <= ETH_MAX_IP_DGRAM_LEN) {
> + ip6->ip6_plen = htons(pkt->payload_len);
> + }
> + /*
> + * TODO: if qemu would support >64K packets
> + * add jumbo option for packets greater then 65,535 bytes
> + */
> + }
> + }
> +}
> diff --git a/hw/net/net_tx_pkt.h b/hw/net/net_tx_pkt.h
> index 212ecc62fc..912d56ef13 100644
> --- a/hw/net/net_tx_pkt.h
> +++ b/hw/net/net_tx_pkt.h
> @@ -187,4 +187,11 @@ bool net_tx_pkt_parse(struct NetTxPkt *pkt);
> */
> bool net_tx_pkt_has_fragments(struct NetTxPkt *pkt);
>
> +/**
> + * Fix IPv6 'plen' field.
> + *

This description needs to be more verbose.

> + * @pkt packet
> + */
> +void net_tx_pkt_fix_ip6_payload_len(struct NetTxPkt *pkt);
> +
> #endif
> diff --git a/include/net/eth.h b/include/net/eth.h
> index 7f45c678e7..05c75ac9fc 100644
> --- a/include/net/eth.h
> +++ b/include/net/eth.h
> @@ -139,6 +139,14 @@ struct ip6_ext_hdr_routing {
> uint8_t rsvd[4];
> };
>
> +struct ip6_ext_hdr_fragment {
> + uint8_t nxt;
> + uint8_t res0;
> + uint16_t off;
> + uint32_t id;
> +};
> +
> +
> struct ip6_option_hdr {
> #define IP6_OPT_PAD1 (0x00)
> #define IP6_OPT_HOME (0xC9)
> @@ -186,6 +194,7 @@ struct tcp_hdr {
>
> #define ip6_nxt ip6_ctlun.ip6_un1.ip6_un1_nxt
> #define ip6_ecn_acc ip6_ctlun.ip6_un3.ip6_un3_ecn
> +#define ip6_plen ip6_ctlun.ip6_un1.ip6_un1_plen
>
> #define PKT_GET_ETH_HDR(p) \
> ((struct eth_header *)(p))
> @@ -398,9 +407,9 @@ void eth_get_protocols(const struct iovec *iov, int iovcnt,
> eth_ip4_hdr_info *ip4hdr_info,
> eth_l4_hdr_info *l4hdr_info);
>
> -void eth_setup_ip4_fragmentation(const void *l2hdr, size_t l2hdr_len,
> - void *l3hdr, size_t l3hdr_len,
> - size_t l3payload_len,
> +void eth_setup_ip_fragmentation(const void *l2hdr, size_t l2hdr_len,
> + void *l3hdr, size_t *l3hdr_len,
> + size_t l3hdr_max_len, size_t l3payload_len,
> size_t frag_offset, bool more_frags);
>
> void
> diff --git a/net/eth.c b/net/eth.c
> index 0c1d413ee2..21ec5dc33d 100644
> --- a/net/eth.c
> +++ b/net/eth.c
> @@ -314,10 +314,62 @@ eth_strip_vlan_ex(const struct iovec *iov, int iovcnt, size_t iovoff,
> return 0;
> }
>
> +static bool eth_is_ip6_extension_header_type(uint8_t hdr_type);
> +
> +static void *eth_ip6_find_ext(struct ip6_header *ip6, uint8_t ext_type)
> +{
> + uint8_t curr_ext_hdr_type = ip6->ip6_nxt;
> + struct ip6_ext_hdr *ext_hdr = (struct ip6_ext_hdr *)(ip6 + 1);
> + for (; eth_is_ip6_extension_header_type(curr_ext_hdr_type);) {
> + if (curr_ext_hdr_type == ext_type) {
> + return ext_hdr;
> + }
> + curr_ext_hdr_type = ext_hdr->ip6r_nxt;
> + ext_hdr = (struct ip6_ext_hdr *)(((uint8_t *)ext_hdr)
> + + (ext_hdr->ip6r_len + 1) * IP6_EXT_GRANULARITY);
> + }
> +
> + return NULL;
> +}
> +
> +/*
> + * To add an extension - there is should be
> + * enough memory 'behind' the ip6 header.
> + */
> +static void *eth_ip6_add_ext_nonsafe(struct ip6_header *ip6, uint8_t ext_type)
> +{

What does the "nonsafe" suffix mean here?

> + uint8_t curr_ext_hdr_type = ip6->ip6_nxt;
> + struct ip6_ext_hdr *ext_hdr = (struct ip6_ext_hdr *)(ip6 + 1);
> + struct ip6_ext_hdr *ext_hdr_prev = NULL;
> +
> + if (!eth_is_ip6_extension_header_type(curr_ext_hdr_type)) {
> + ext_hdr->ip6r_nxt = ip6->ip6_nxt;
> + ip6->ip6_nxt = ext_type;
> + return ext_hdr;
> + }
> +
> + ext_hdr_prev = ext_hdr;
> + curr_ext_hdr_type = ext_hdr->ip6r_nxt;
> + ext_hdr = (struct ip6_ext_hdr *)(((uint8_t *)ext_hdr)
> + + (ext_hdr->ip6r_len + 1) * IP6_EXT_GRANULARITY);
> +
> + for (; eth_is_ip6_extension_header_type(curr_ext_hdr_type);) {
> + ext_hdr_prev = ext_hdr;
> + curr_ext_hdr_type = ext_hdr->ip6r_nxt;
> + ext_hdr = (struct ip6_ext_hdr *)(((uint8_t *)ext_hdr)
> + + (ext_hdr->ip6r_len + 1) * IP6_EXT_GRANULARITY);
> + }
> +
> + ext_hdr->ip6r_nxt = ext_hdr_prev->ip6r_nxt;
> + ext_hdr_prev->ip6r_nxt = ext_type;
> +
> + return ext_hdr;
> +}
> +
> void
> -eth_setup_ip4_fragmentation(const void *l2hdr, size_t l2hdr_len,
> - void *l3hdr, size_t l3hdr_len,
> - size_t l3payload_len,
> +eth_setup_ip_fragmentation(const void *l2hdr, size_t l2hdr_len,
> + void *l3hdr, size_t *l3hdr_len,
> + size_t l3hdr_max_len, size_t l3payload_len,
> size_t frag_offset, bool more_frags)
> {
> const struct iovec l2vec = {
> @@ -325,7 +377,9 @@ eth_setup_ip4_fragmentation(const void *l2hdr, size_t l2hdr_len,
> .iov_len = l2hdr_len
> };
>
> - if (eth_get_l3_proto(&l2vec, 1, l2hdr_len) == ETH_P_IP) {
> + uint16_t l3_proto = eth_get_l3_proto(&l2vec, 1, l2hdr_len);
> +
> + if (l3_proto == ETH_P_IP) {
> uint16_t orig_flags;
> struct ip_header *iphdr = (struct ip_header *) l3hdr;
> uint16_t frag_off_units = frag_offset / IP_FRAG_UNIT_SIZE;
> @@ -337,7 +391,32 @@ eth_setup_ip4_fragmentation(const void *l2hdr, size_t l2hdr_len,
> orig_flags = be16_to_cpu(iphdr->ip_off) & ~(IP_OFFMASK|IP_MF);
> new_ip_off = frag_off_units | orig_flags | (more_frags ? IP_MF : 0);
> iphdr->ip_off = cpu_to_be16(new_ip_off);
> - iphdr->ip_len = cpu_to_be16(l3payload_len + l3hdr_len);
> + iphdr->ip_len = cpu_to_be16(l3payload_len + *l3hdr_len);
> +
> + eth_fix_ip4_checksum(l3hdr, *l3hdr_len);
> + } else if (l3_proto == ETH_P_IPV6) {
> + struct ip6_header *ip6 = (struct ip6_header *) l3hdr;
> +
> + struct ip6_ext_hdr_fragment *frag_ext = NULL;
> +
> + /* Find frag extension */
> + frag_ext = eth_ip6_find_ext(ip6, IP6_FRAGMENT);
> + if (frag_ext == NULL) {
> + /* No frag extension? Add one */
> + if (*l3hdr_len + sizeof(*frag_ext) > l3hdr_max_len) {
> + return; /* TODO: request to reallocate l3hdr */

Let's just implement the allocation here.

> + }
> + frag_ext = eth_ip6_add_ext_nonsafe(ip6, IP6_FRAGMENT);
> + *l3hdr_len += sizeof(*frag_ext);
> + static uint32_t s_id = 0x71656d75; /* 'qemu' */

Please introduce a macro with comment for this magic number.

Thanks

> + frag_ext->id = cpu_to_be32(s_id);
> + ++s_id;
> + }
> +
> + frag_ext->off = cpu_to_be16((frag_offset / IP_FRAG_UNIT_SIZE) << 3
> + | (uint16_t)!!more_frags);
> +
> + ip6->ip6_plen = cpu_to_be16(l3payload_len + *l3hdr_len - sizeof(*ip6));
> }
> }
>

From:	Andrew Melnichenko
Subject:	Re: [PATCH 1/1] hw/net: Added basic IPv6 fragmentation. Fixed IPv6 payload length. Fixed CSO for IPv6.
Date:	Mon, 1 Jun 2020 19:36:00 +0300