diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 89bcfe8..9298207 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -533,8 +533,10 @@ struct sk_buff {
 	 * layer. Please put your private variables there. If you
 	 * want to keep them across layers you have to do a skb_clone()
 	 * first. This is owned by whoever has the skb queued ATM.
+	 *
+	 * Enlarged (48 -> 56 bytes) to hold a pointer to an FEC structure.
 	 */
-	char			cb[48] __aligned(8);
+	char			cb[56] __aligned(8);

 	unsigned long		_skb_refdst;
 #ifdef CONFIG_XFRM
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4345d49..ccc0e91 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -79,6 +79,24 @@ struct tcp_sack_block {
 #define TCP_FACK_ENABLED  (1 << 1)   /*1 = FACK is enabled locally*/
 #define TCP_DSACK_SEEN    (1 << 2)   /*1 = DSACK was received from peer*/

+/* Flags transmitted in the first FEC option byte after magic bytes
+ * (except if option is used for negotiation) */
+#define TCP_FEC_RECOVERY_CWR		0x80	/* Recovery triggered CWR */
+#define TCP_FEC_RECOVERY_SUCCESSFUL	0x40	/* Local recovery done */
+#define TCP_FEC_RECOVERY_FAILED		0x20	/* Local recovery failed */
+#define TCP_FEC_ENCODED			0x10	/* Packet is FEC-encoded */
+
+struct tcp_fec {
+	u8	type;		/* Requested FEC type (negotiation only,
+				 * see net/tcp_fec.h for type defs) */
+	u32	enc_seq;	/* Sequence number of first encoded byte */
+	u32	enc_len;	/* Encoding length */
+	u32	lost_seq;	/* Sequence number of first lost byte */
+	u32	lost_len;	/* Loss length */
+	u8	flags;		/* See flag definitions above */
+	bool	saw_fec;	/* FEC option was retrieved from packet */
+};
+
 struct tcp_options_received {
 /*	PAWS/RTTM data	*/
 	long	ts_recent_stamp;/* Time we stored ts_recent (for aging) */
@@ -95,12 +113,14 @@ struct tcp_options_received {
 	u8	num_sacks;	/* Number of SACK blocks		*/
 	u16	user_mss;	/* mss requested by user in ioctl	*/
 	u16	mss_clamp;	/* Maximal mss, negotiated at connection setup */
+	struct tcp_fec fec;	/* FEC-related parameters */
 };

 static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
 {
 	rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
 	rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
+	memset(&(rx_opt->fec), 0, sizeof(struct tcp_fec));
 }

 /* This is the max number of SACKS that we'll generate and process. It's safe
@@ -327,6 +347,24 @@ struct tcp_sock {
 	 * socket. Used to retransmit SYNACKs etc.
 	 */
 	struct request_sock *fastopen_rsk;
+
+/* TCP FEC parameters
+ *	type		- negotiated FEC type to be used
+ *	next_seq	- next sequence which was not FEC-encoded before
+ *	lost_len	- bytes after rcv_nxt considered lost
+ *	flags		- see TCP_FEC_* flag definitions above
+ *	bytes_rcv_queue	- number of bytes stored in queued SKBs
+ *	rcv_queue	- clones of SKBs from the socket's receive queue,
+ *			  kept for FEC recovery
+ */
+	struct {
+		u8			type;
+		u32			next_seq;
+		u32			lost_len;
+		u8			flags;
+		u32			bytes_rcv_queue;
+		struct sk_buff_head	rcv_queue;
+	} fec;
 };

 enum tsq_flags {
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 06d0d0f..063aa59 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -138,6 +138,7 @@ struct inet_connection_sock {
 #define ICSK_TIME_PROBE0	3	/* Zero window probe timer */
 #define ICSK_TIME_EARLY_RETRANS 4	/* Early retransmit timer */
 #define ICSK_TIME_LOSS_PROBE	5	/* Tail loss probe timer */
+#define ICSK_TIME_FEC		6	/* FEC delayed send timer */

 static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
 {
@@ -228,7 +229,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
 	}

 	if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
-	    what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE) {
+	    what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE ||
+	    what == ICSK_TIME_FEC) {
 		icsk->icsk_pending = what;
 		icsk->icsk_timeout = jiffies + when;
 		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 610fa9e..1c1b4ba 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -63,6 +63,8 @@ struct request_sock {
 	struct sock			*sk;
 	u32				secid;
 	u32				peer_secid;
+	u8				fec_type;	/* FEC type from the SYN (see
+							 * net/tcp_fec.h) */
 };

 static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d59f206..f894889 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -182,6 +182,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
  * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
  */
 #define TCPOPT_FASTOPEN_MAGIC	0xF989
+#define TCPOPT_FEC_MAGIC	0xDC60

 /*
  * TCP option lengths
@@ -195,6 +196,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOLEN_FASTOPEN_BASE  2
 #define TCPOLEN_EXP_FASTOPEN_BASE  4

+/*
+ * !!! TCP FEC patch !!!
+ */
+#define TCPOLEN_EXP_FEC_BASE   4
+
 /* But this is what stacks really send out. */
 #define TCPOLEN_TSTAMP_ALIGNED		12
 #define TCPOLEN_WSCALE_ALIGNED		4
@@ -204,6 +210,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOLEN_SACK_PERBLOCK		8
 #define TCPOLEN_MD5SIG_ALIGNED		20
 #define TCPOLEN_MSS_ALIGNED		4
+#define TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED	8

 /* Flags in tp->nonagle */
 #define TCP_NAGLE_OFF		1	/* Nagle's algo is disabled */
@@ -230,6 +237,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define	TFO_SERVER_WO_SOCKOPT1	0x400
 #define	TFO_SERVER_WO_SOCKOPT2	0x800

+/*
+ * !!! TCP FEC patch !!!
+ */
+
+/* Maximum number of in-order bytes kept in the receiver's buffer for FEC
+ * recoveries. The sender will never send more than this in a single FEC
+ * packet. */
+#define FEC_RCV_QUEUE_LIMIT	16000
+
 extern struct inet_timewait_death_row tcp_death_row;

 /* sysctl variables for tcp */
@@ -274,6 +290,12 @@ extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
+
+/*
+ * !!! 
TCP FEC patch !!!
+ */
+extern int sysctl_tcp_fec;
+
 extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
 extern int sysctl_tcp_autocorking;
@@ -725,6 +747,7 @@ struct tcp_skb_cb {
 	__u8		ip_dsfield;	/* IPv4 tos or IPv6 dsfield	*/
 	/* 1 byte hole */
 	__u32		ack_seq;	/* Sequence number ACK'd	*/
+	struct tcp_fec	*fec;		/* FEC parameters */
 };

 #define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))
@@ -1131,6 +1154,11 @@ static inline void tcp_openreq_init(struct request_sock *req,
 	ireq->ecn_ok = 0;
 	ireq->ir_rmt_port = tcp_hdr(skb)->source;
 	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
+
+	/*
+	 * !!! TCP FEC patch !!!
+	 */
+	req->fec_type = rx_opt->fec.type;
 }

 extern void tcp_openreq_init_rwin(struct request_sock *req,
diff --git a/include/net/tcp_fec.h b/include/net/tcp_fec.h
new file mode 100644
index 0000000..38f2c40
--- /dev/null
+++ b/include/net/tcp_fec.h
@@ -0,0 +1,106 @@
+#ifndef _TCP_FEC_H
+#define _TCP_FEC_H
+
+#include <net/tcp.h>
+#include <asm/unaligned.h>
+
+/* FEC-encoding types (8 bits, internal) */
+#define TCP_FEC_TYPE_NONE		0	/* FEC disabled */
+#define TCP_FEC_TYPE_XOR_ALL		1	/* XOR every MSS length segment */
+#define TCP_FEC_TYPE_XOR_SKIP_1		2	/* XOR every other MSS length
+						 * segment */
+
+#define TCP_FEC_NUM_TYPES		3
+
+/* Delay transmission of FEC packets (delay defined in tcp_fec_arm_timer()) */
+#define TCP_FEC_DELAYED_SEND		1
+
+/*
+ * Returns true if FEC is enabled for the socket
+ */
+static inline bool tcp_fec_is_enabled(const struct tcp_sock *tp)
+{
+	return unlikely(tp->fec.type > 0);
+}
+
+/*
+ * Returns true if the current packet in the buffer is FEC-encoded
+ */
+static inline bool tcp_fec_is_encoded(const struct tcp_sock *tp)
+{
+	return unlikely((tp->rx_opt.fec.flags & TCP_FEC_ENCODED) &&
+			(tp->rx_opt.fec.saw_fec));
+}
+
+/*
+ * Decodes FEC parameters and stores them in the FEC struct
+ * @seq - sequence number of the packet
+ * @ack_seq - ACKed sequence number
+ * @is_syn - true, if option was attached to a packet with a SYN flag
+ * @ptr - points to the first byte of the FEC option after kind, length,
+ *	  and possible magic bytes
+ * @len - option length (without kind, length, magic bytes)
+ */
+int tcp_fec_decode_option(struct tcp_fec *fec, u32 seq, u32 ack_seq,
+			bool is_syn, const unsigned char *ptr,
+			unsigned int len);
+
+/*
+ * Encodes FEC parameters to wire format (kind, length, magic bytes, value)
+ * @ptr - the option is written here; the pointer is advanced past it
+ * Returns the encoded option length in bytes (including padding)
+ */
+int tcp_fec_encode_option(struct tcp_sock *tp, struct tcp_fec *fec,
+			__be32 **ptr);
+
+/*
+ * Processes the current packet in the buffer (treated as FEC packet)
+ */
+int tcp_fec_process(struct sock *sk, struct sk_buff *skb);
+
+/*
+ * Checks the received options for loss indicators and acts upon them.
+ * In particular, the function handles window reduction requests and processes
+ * tail loss indicators.
+ * Returns: 1, if the ACK carries a recovery/loss flag - 0, otherwise
+ */
+int tcp_fec_check_ack(struct sock *sk, u32 ack_seq);
+
+/*
+ * Since data in the socket's receive queue can get consumed by other parties
+ * we need to keep extra references to these SKBs until they are no longer
+ * required for possible future recoveries.
+ * @skb - buffer which is moved to the receive queue
+ */
+int tcp_fec_update_queue(struct sock *sk, struct sk_buff *skb);
+
+/*
+ * Disables FEC for this connection (includes clearing references
+ * to buffers in receive queue)
+ */
+void tcp_fec_disable(struct sock *sk);
+
+/* Arms the timer for a delayed FEC transmission if there is
+ * no earlier timeout defined (i.e. retransmission timeout)
+ */
+void tcp_fec_arm_timer(struct sock *sk);
+
+/* The FEC timer fired. Force an FEC transmission for the
+ * last unencoded burst. Rearm the RTO timer (which was switched
+ * out when setting the FEC timer). Set a new FEC timer if there
+ * is pending unencoded data.
+ */
+void tcp_fec_timer(struct sock *sk);
+
+/* If FEC packet transmissions are delayed, set a timer
+ * (if not already set), otherwise invoke the FEC mechanism
+ * immediately
+ */
+int tcp_fec_invoke(struct sock *sk);
+
+/* Invoke the FEC mechanism set for the connection;
+ * Creates and sends out FEC packets
+ */
+int tcp_fec_invoke_nodelay(struct sock *sk);
+
+#endif
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 3b97183..d679733 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -113,6 +113,11 @@ enum {
 #define TCP_TIMESTAMP		24
 #define TCP_NOTSENT_LOWAT	25	/* limit number of unsent bytes in write queue */

+/*
+ * !!! TCP FEC patch !!!
+ */
+#define TCP_FEC			79	/* Forward error correction */
+
 struct tcp_repair_opt {
 	__u32	opt_code;
 	__u32	opt_val;
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 518c04e..6aa32ca 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -6,7 +6,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     ip_input.o ip_fragment.o ip_forward.o ip_options.o \
 	     ip_output.o ip_sockglue.o inet_hashtables.o \
 	     inet_timewait_sock.o inet_connection_sock.o \
-	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
+	     tcp.o tcp_fec.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
 	     tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
 	     tcp_offload.o datagram.o raw.o udp.o udplite.o \
 	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 0d438fb..9cfa3d3 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -183,7 +183,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,

 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE ||
+	    icsk->icsk_pending == ICSK_TIME_FEC) {
 		r->idiag_timer = 1;
 		r->idiag_retrans = icsk->icsk_retransmits;
 		r->idiag_expires = 
EXPIRES_IN_MS(icsk->icsk_timeout);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index eeb17b3..9c58530 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -28,6 +28,7 @@

 static int zero;
 static int one = 1;
+static int two = 2;
 static int four = 4;
 static int gso_max_segs = GSO_MAX_SEGS;
 static int tcp_retr1_max = 255;
@@ -810,6 +811,15 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one
 	},
+	{
+		.procname	= "tcp_fec",
+		.data		= &sysctl_tcp_fec,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax, /* enforces extra1/2 */
+		.extra1		= &zero,
+		.extra2		= &two,
+	},
 	{ }
 };

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b8ff562..1a2dab5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -276,6 +276,8 @@
 #include <net/ip.h>
 #include <net/sock.h>

+#include <net/tcp_fec.h>
+
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 #include <net/busy_poll.h>
@@ -2565,6 +2567,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		else
 			tp->tsoffset = val - tcp_time_stamp;
 		break;
+	case TCP_FEC:
+		if (sysctl_tcp_fec && val >= 0 && val < TCP_FEC_NUM_TYPES)
+			tp->fec.type = val;
+		else
+			err = -EINVAL;
+		break;
 	case TCP_NOTSENT_LOWAT:
 		tp->notsent_lowat = val;
 		sk->sk_write_space(sk);
@@ -2792,6 +2800,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp + tp->tsoffset;
 		break;
+	case TCP_FEC:
+		val = tp->fec.type;
+		break;
 	case TCP_NOTSENT_LOWAT:
 		val = tp->notsent_lowat;
 		break;
diff --git a/net/ipv4/tcp_fec.c b/net/ipv4/tcp_fec.c
new file mode 100644
index 0000000..53711cb
--- /dev/null
+++ b/net/ipv4/tcp_fec.c
@@ -0,0 +1,1253 @@
+#include <net/tcp_fec.h>
+
+/* Codes for incoming FEC packet processing */
+#define FEC_NO_LOSS		1
+#define FEC_LOSS_UNRECOVERED	2
+#define FEC_LOSS_RECOVERED	3
+
+/* Receiver routines */
+static int tcp_fec_process_xor(struct sock *sk, const struct sk_buff *skb,
+			unsigned int block_skip);
+static int tcp_fec_recover(struct sock *sk, const struct sk_buff *skb,
+
unsigned char *data, u32 seq, int len);
+static void tcp_fec_send_ack(struct sock *sk, const struct sk_buff *skb,
+			int recovery_status);
+static void tcp_fec_reduce_window(struct sock *sk);
+static void tcp_fec_mark_skbs_lost(struct sock *sk);
+static bool tcp_fec_update_decoded_option(struct sk_buff *skb);
+static struct sk_buff *tcp_fec_make_decoded_pkt(struct sock *sk,
+			const struct sk_buff *skb, unsigned char *dec_data,
+			u32 seq, unsigned int len);
+
+/* Sender routines */
+static int tcp_fec_create(struct sock *sk, struct sk_buff_head *list);
+static int tcp_fec_create_xor(struct sock *sk, struct sk_buff_head *list,
+			unsigned int first_seq, unsigned int block_len,
+			unsigned int block_skip,
+			unsigned int max_encoded_per_pkt);
+static struct sk_buff *tcp_fec_make_encoded_pkt(struct sock *sk,
+			struct tcp_fec *fec, unsigned char *enc_data,
+			u32 seq);
+static int tcp_fec_xmit_all(struct sock *sk, struct sk_buff_head *list);
+static int tcp_fec_xmit(struct sock *sk, struct sk_buff *skb);
+
+/* Buffer access routine */
+static unsigned int tcp_fec_get_next_block(struct sock *sk,
+			struct sk_buff **skb, struct sk_buff_head *queue,
+			u32 seq, unsigned int block_len,
+			unsigned char *block);
+
+/* Have to define this signature here since the actual function was static
+ * and tcp_output.c has no corresponding header file
+ */
+extern int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+			gfp_t gfp_mask);
+
+/* Decodes FEC parameters and stores them in the FEC struct
+ * @seq - sequence number of the packet
+ * @ack_seq - ACKed sequence number
+ * @is_syn - true, if option was attached to a packet with a SYN flag
+ * @ptr - first byte of the FEC option after kind, length and magic bytes
+ * @len - option length (without kind, length, magic bytes)
+ * Returns 0 on success, -EINVAL for a malformed option
+ */
+int tcp_fec_decode_option(struct tcp_fec *fec, u32 seq, u32 ack_seq,
+			bool is_syn, const unsigned char *ptr,
+			unsigned int len)
+{
+	/* reset / initialize option values which should be evaluated
+	 * with EVERY incoming packet
+	 */
+	fec->flags = 0;
+	fec->saw_fec = true;
+
+	if (len == 1) {
+		/* Short option: flags which need a length field are only
+		 * valid in the long form; strip them so that stale
+		 * enc_* / lost_* values are never acted upon
+		 */
+		u8 val = *ptr;
+		if (is_syn)
+			fec->type = val;	/* Negotiation */
+		else
+			fec->flags = val & ~(TCP_FEC_ENCODED |
+					     TCP_FEC_RECOVERY_FAILED);
+
+		return 0;
+	}
+
+	if (len == 4) {
+		/* Long option: <Flags:8, Length:24> */
+		u32 val = get_unaligned_be32(ptr);
+		fec->flags = val >> 24;
+
+		if (fec->flags & TCP_FEC_ENCODED) {
+			fec->enc_seq = seq;
+			fec->enc_len = val & 0xFFFFFF;
+		} else if (fec->flags & TCP_FEC_RECOVERY_FAILED) {
+			fec->lost_seq = ack_seq;
+			fec->lost_len = val & 0xFFFFFF;
+		} else {
+			return -EINVAL;
+		}
+
+		return 0;
+	}
+
+	/* Invalid option length */
+	return -EINVAL;
+}
+
+/* Encodes FEC parameters to wire format
+ * @fec - option values; merged with (and consuming) the one-shot flags in tp
+ * @ptr - option is written here, pointer is advanced by two 32-bit words
+ * Returns the length of the encoded option (always 8, including padding)
+ */
+int tcp_fec_encode_option(struct tcp_sock *tp, struct tcp_fec *fec,
+			__be32 **ptr)
+{
+	int len;
+
+	fec->flags |= tp->fec.flags;
+	fec->lost_len = tp->fec.lost_len;
+	tp->fec.flags &= ~TCP_FEC_RECOVERY_CWR;
+	tp->fec.flags &= ~TCP_FEC_RECOVERY_FAILED;
+
+	/* Encode fixed option part (option kind, length, and magic bytes) */
+	if (fec->flags & (TCP_FEC_ENCODED | TCP_FEC_RECOVERY_FAILED))
+		len = 4 + TCPOLEN_EXP_FEC_BASE;	/* Long option */
+	else
+		len = 1 + TCPOLEN_EXP_FEC_BASE;	/* Short option */
+
+	**ptr = htonl(((u32)TCPOPT_EXP << 24) | (len << 16) | TCPOPT_FEC_MAGIC);
+	(*ptr)++;
+
+	if ((fec->flags & TCP_FEC_ENCODED) &&
+	    (fec->flags & TCP_FEC_RECOVERY_FAILED)) {
+		/* TODO Special case: need to separate loss indication
+		 * from encoding or make option 12 bytes long
+		 * This can only happen if a node receives and sends FEC
+		 * data
+		 */
+		fec->flags &= ~TCP_FEC_RECOVERY_FAILED;
+	}
+
+	if (fec->flags & TCP_FEC_ENCODED) {
+		/* FEC-encoded packets carry:
+		 * <Flags:8, Encoding length:24> (length masked to 24 bits)
+		 */
+		**ptr = htonl(((u32)fec->flags << 24) |
+			      (fec->enc_len & 0xFFFFFF));
+		(*ptr)++;
+		return 8;
+	} else if (fec->flags & TCP_FEC_RECOVERY_FAILED) {
+		/* Packets with failed recovery indication carry:
+		 * <Flags:8, Bytes after ACKed seq lost:24> (masked as above)
+		 */
+		**ptr = htonl(((u32)fec->flags << 24) |
+			      (fec->lost_len & 0xFFFFFF));
+		(*ptr)++;
+		return 8;
+	} else if (fec->type) {
+		/* Negotiation packets carry: <Encoding type:8> */
+		**ptr = htonl(((u32)fec->type << 24) |
+			      (TCPOPT_NOP << 16) |
+			      (TCPOPT_NOP << 8) |
+			      TCPOPT_NOP);
+		(*ptr)++;
+		return 8;
+	} else {
+		/* All other packets carry: <Flags:8> */
+		**ptr = htonl(((u32)fec->flags << 24) |
+			      (TCPOPT_NOP << 16) |
+			      (TCPOPT_NOP << 8) |
+			      TCPOPT_NOP);
+		(*ptr)++;
+		return 8;
+	}
+}
+
+/* Processes the current packet in the buffer, treated as an FEC packet
+ * (assumes that options were already processed)
+ * Returns 0 if the packet was handled, a negative value otherwise
+ */
+int tcp_fec_process(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp;
+	int recovery_status, err;
+	u32 end_seq;
+
+	tp = tcp_sk(sk);
+	recovery_status = 0;
+
+	/* drop packet if packet is not encoded */
+	if (!(tp->rx_opt.fec.flags & TCP_FEC_ENCODED))
+		return -1;
+
+	/* check if all encoded packets were already received */
+	end_seq = tp->rx_opt.fec.enc_seq + tp->rx_opt.fec.enc_len;
+	if (!after(end_seq, tp->rcv_nxt)) {
+		tcp_fec_send_ack(sk, skb, FEC_NO_LOSS);
+		return 0;
+	}
+
+	/* linearize the SKB (for easier payload access) */
+	err = skb_linearize(skb);
+	if (err)
+		return err;
+
+	/* data recovery (disabled or unknown types cannot be decoded) */
+	switch (tp->fec.type) {
+	case TCP_FEC_TYPE_NONE:
+	default:
+		return -1;
+	case TCP_FEC_TYPE_XOR_ALL:
+		recovery_status = tcp_fec_process_xor(sk, skb, 0);
+		break;
+	case TCP_FEC_TYPE_XOR_SKIP_1:
+		recovery_status = tcp_fec_process_xor(sk, skb, 1);
+		break;
+	}
+
+	/* TODO error handling; -ENOMEM, etc. - disable FEC? */
+	if (recovery_status < 0)
+		return recovery_status;
+
+	/* Send an explicit ACK if recovery failed */
+	if (recovery_status == FEC_LOSS_UNRECOVERED)
+		tcp_fec_send_ack(sk, skb, recovery_status);
+
+	return 0;
+}
+
+/* Checks the received options for loss indicators and acts upon them.
+ * In particular, the function handles recovery flags (indicators for
+ * successful and failed recoveries, tail losses)
+ * Returns: 1, if ACK carries a recovery/loss flag - 0, otherwise
+ */
+int tcp_fec_check_ack(struct sock *sk, u32 ack_seq)
+{
+	struct tcp_sock *tp;
+
+	tp = tcp_sk(sk);
+
+	/* Clear local recovery indication (and ECN CWR demand)
+	 * if it was ACKED by the other node
+	 */
+	if (tp->rx_opt.fec.flags & TCP_FEC_RECOVERY_CWR) {
+		tp->fec.flags &= ~TCP_FEC_RECOVERY_SUCCESSFUL;
+		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+	}
+
+	/* Check for tail loss indicators
+	 * This happens when FEC was unable to recover the lost data and
+	 * thus only sends an ACK with the loss range back. Everything not
+	 * ACKed/SACKed now, is considered lost now.
+	 */
+	if (tp->rx_opt.fec.flags & TCP_FEC_RECOVERY_FAILED) {
+		tcp_fec_mark_skbs_lost(sk);
+		return 1;
+	}
+
+	/* Check if the remote endpoint successfully recovered data,
+	 * if so we trigger a window reduction
+	 */
+	if (tp->rx_opt.fec.flags & TCP_FEC_RECOVERY_SUCCESSFUL) {
+		/* Ignore flag if window was already reduced for the current
+		 * loss episode or if previous reduction was not signaled
+		 * yet (no outgoing packets)
+		 */
+		if (after(ack_seq, tp->high_seq) &&
+		    !(tp->fec.flags & TCP_FEC_RECOVERY_CWR)) {
+			tcp_fec_reduce_window(sk);
+			tp->fec.flags |= TCP_FEC_RECOVERY_CWR;
+		}
+
+		return 1;
+	}
+
+	return 0;
+}
+
+/* Since data in the socket's receive queue can get consumed by other parties
+ * we need to clone these SKBs until they are no longer required for possible
+ * future recoveries. This function is called after the TCP header has been
+ * removed from the SKB already. 
All parameters required for recovery are
+ * stored in the SKB's control buffer.
+ * @skb - buffer which is moved to the receive queue
+ * Returns 0 on success, negative code otherwise (the clone is freed then)
+ */
+int tcp_fec_update_queue(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp;
+	struct sk_buff *cskb;
+	u32 data_len;
+	int extra_bytes, err;
+	tp = tcp_sk(sk);
+
+	/* clone the SKB and add it to the FEC receive queue
+	 * (a simple extra reference to the SKB is not sufficient
+	 * since SKBs can only be queued on one list at a time)
+	 */
+	cskb = skb_clone(skb, GFP_ATOMIC);
+	if (cskb == NULL)
+		return -ENOMEM;
+
+	/* linearize the SKB (for easier payload access) */
+	err = skb_linearize(cskb);
+	if (err) {
+		kfree_skb(cskb);
+		return err;
+	}
+
+	data_len = skb->len;
+	if (!data_len) {
+		kfree_skb(cskb);
+		return 0;
+	}
+
+	skb_queue_tail(&tp->fec.rcv_queue, cskb);
+	tp->fec.bytes_rcv_queue += data_len;
+
+	/* check if we can dereference old SKBs (as long as we have enough
+	 * data for future recoveries)
+	 */
+	extra_bytes = tp->fec.bytes_rcv_queue - FEC_RCV_QUEUE_LIMIT;
+	while (extra_bytes > 0) {
+		cskb = skb_peek(&tp->fec.rcv_queue);
+		if (cskb == NULL)
+			return -EINVAL;
+
+		data_len = TCP_SKB_CB(cskb)->end_seq - TCP_SKB_CB(cskb)->seq;
+		if (data_len > extra_bytes) {
+			break;
+		} else {
+			extra_bytes -= data_len;
+			tp->fec.bytes_rcv_queue -= data_len;
+			skb_unlink(cskb, &tp->fec.rcv_queue);
+			kfree_skb(cskb);
+		}
+	}
+
+	return 0;
+}
+
+/* Disables FEC for this connection (includes clearing references
+ * to buffers in receive queue)
+ */
+void tcp_fec_disable(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tcp_fec_is_enabled(tp))
+		return;
+
+	tp->fec.type = 0;
+	tp->fec.bytes_rcv_queue = 0;
+	skb_queue_purge(&tp->fec.rcv_queue);
+}
+
+/* Processes the current packet in the buffer, treated as an FEC packet
+ * with XOR-encoded payload (assumes that options were already processed)
+ * Returns: negative code, if an error occurred;
+ *	positive code, otherwise (recovery status)
+ * @block_skip - Number of unencoded blocks between two encoded blocks
+ */
+static int tcp_fec_process_xor(struct sock *sk, const struct sk_buff *skb,
+			unsigned int block_skip)
+{
+	struct sk_buff *pskb;
+	struct tcp_sock *tp;
+	u32 next_seq, end_seq, rec_seq;
+	unsigned char *data, *block;
+	unsigned int i, offset, data_len, block_len, rec_len;
+	int ret;
+
+	pskb = NULL;
+	tp = tcp_sk(sk);
+	next_seq = tp->rx_opt.fec.enc_seq;
+	end_seq = next_seq + tp->rx_opt.fec.enc_len;
+	block_len = skb->len - tcp_hdrlen(skb);
+	offset = 0;
+
+	/* memory allocation for decoding / recovered SKB data
+	 * (first half: XOR accumulator, second half: scratch block)
+	 */
+	data = kmalloc(2 * block_len, GFP_ATOMIC);
+	if (data == NULL)
+		return -ENOMEM;
+
+	block = data + block_len;
+
+	/* copy FEC payload (skip TCP header) */
+	memcpy(data, skb->data + tcp_hdrlen(skb), block_len);
+
+	/* process in-sequence data */
+	while ((data_len = tcp_fec_get_next_block(sk, &pskb,
+				&tp->fec.rcv_queue, next_seq,
+				min(block_len, end_seq - next_seq),
+				block))) {
+		next_seq += data_len;
+
+		/* XOR with existing payload */
+		for (i = 0; i < data_len; i++)
+			data[i] ^= block[i];
+
+		/* we could not read a whole MSS block, which means we
+		 * reached the end of the queue or end of range which the
+		 * FEC packet covers
+		 */
+		if (data_len < block_len)
+			break;
+
+		/* skip unencoded blocks if there is more data encoded */
+		if (end_seq - next_seq > 0)
+			next_seq += block_len * block_skip;
+	}
+
+	/* check if all encoded bytes were already received */
+	if (next_seq == end_seq) {
+		kfree(data);
+		return FEC_NO_LOSS;
+	}
+
+	/* we always recover one whole MSS block (otherwise slicing
+	 * would introduce a lot of additional complexity here) and handle
+	 * cut out already received sequences later
+	 */
+	rec_seq = next_seq;
+	rec_len = min(block_len, end_seq - rec_seq);
+	offset = data_len;
+	if ((rec_seq + rec_len) == end_seq)
+		goto recover;
+
+	next_seq += block_len * (block_skip + 1);
+	pskb = NULL;
+
+	/* read a possibly partial (smaller than MSS) block to fill up the
+	 * previously unfilled block and achieve alignment again
+	 */
+	data_len = tcp_fec_get_next_block(sk, &pskb, &tp->out_of_order_queue,
+				next_seq, block_len - offset, block);
+
+	next_seq += data_len;
+
+	/* check if we could not read as much data as requested */
+	if ((next_seq != end_seq) && (data_len < (block_len - offset)))
+		goto clean;
+
+	/* XOR with existing payload */
+	for (i = 0; i < data_len; i++)
+		data[i+offset] ^= block[i];
+
+	/* skip unencoded blocks if there is more data encoded */
+	if (end_seq - next_seq > 0)
+		next_seq += block_len * block_skip;
+
+	/* read all necessary blocks to finish decoding */
+	while ((data_len = tcp_fec_get_next_block(sk, &pskb,
+				&tp->out_of_order_queue, next_seq,
+				min(block_len, end_seq - next_seq),
+				block))) {
+		next_seq += data_len;
+
+		/* XOR with existing payload */
+		for (i = 0; i < data_len; i++)
+			data[i] ^= block[i];
+
+		/* we could not read a whole MSS block, which means we reached
+		 * the end of the queue or end of range which the FEC packet
+		 * covers
+		 */
+		if (data_len < block_len)
+			break;
+
+		/* skip unencoded blocks if there is more data encoded */
+		if (end_seq - next_seq > 0)
+			next_seq += block_len * block_skip;
+	}
+
+	/* check if additional losses were observed (cannot recover) */
+	if (next_seq != end_seq)
+		goto clean;
+
+recover:
+	/* create and process recovered packets */
+	for (i = 0; i < rec_len; i++)
+		block[i] = data[(offset + i) % block_len];
+
+	if (block_skip && ((block_len - offset) < rec_len)) {
+		/* recover non-consecutive sequence ranges (only when
+		 * slicing is used)
+		 */
+		u32 second_seq;
+		unsigned int second_seq_len, first_seq_len;
+
+		first_seq_len = block_len - offset;
+		second_seq = rec_seq + first_seq_len + block_len * block_skip;
+		second_seq_len = rec_len - first_seq_len;
+
+		ret = tcp_fec_recover(sk, skb, block, rec_seq, first_seq_len);
+		if (ret >= 0) {
+			int second_ret = tcp_fec_recover(sk, skb,
+						block + first_seq_len,
+						second_seq, second_seq_len);
+			if (second_ret < 0 || !ret)
+				ret = second_ret;
+		}
+	} else {
+		ret = tcp_fec_recover(sk, skb, block, rec_seq, rec_len);
+	}
+
+	kfree(data);
+	return ret ? ret : FEC_LOSS_RECOVERED;
+
+clean:
+	kfree(data);
+	return FEC_LOSS_UNRECOVERED;
+}
+
+/* Create a recovered packet and forward it to the reception routine
+ * Returns 0 if data was delivered, FEC_NO_LOSS or negative code otherwise */
+static int tcp_fec_recover(struct sock *sk, const struct sk_buff *skb,
+			unsigned char *data, u32 seq, int len)
+{
+	struct sk_buff *rskb;
+	struct tcp_sock *tp;
+
+	tp = tcp_sk(sk);
+
+	/* Check if we received some tail of the recovered sequence already
+	 * by looking at the current SACK blocks (we don't want to recover
+	 * more data than necessary to prevent DSACKS)
+	 */
+	if (tcp_is_sack(tp)) {
+		int i;
+		for (i = 0; i < tp->rx_opt.num_sacks; i++) {
+			if (before(tp->selective_acks[i].start_seq,
+				   seq + len) &&
+			    !before(tp->selective_acks[i].end_seq,
+				    seq + len)) {
+				len = tp->selective_acks[i].start_seq - seq;
+				break;
+			}
+		}
+	}
+
+	/* We might have prematurely asked for a recovery in the case where the
+	 * whole recovery sequence is already covered by SACKs
+	 */
+	if (len <= 0)
+		return FEC_NO_LOSS;
+
+	/* Create decoded packet and forward to reception routine */
+	rskb = tcp_fec_make_decoded_pkt(sk, skb, data, seq, len);
+	if (rskb == NULL)
+		return -EINVAL;
+
+	/* Only now notify the remote node that recovery was successful */
+	tp->fec.flags |= TCP_FEC_RECOVERY_SUCCESSFUL;
+
+	tcp_rcv_established(sk, rskb, tcp_hdr(rskb), rskb->len);
+	return 0;
+}
+
+/* Sends an ACK for the FEC packet and encodes any congestion
+ * and/or recovery information
+ */
+static void tcp_fec_send_ack(struct sock *sk, const struct sk_buff *skb,
+			int recovery_status)
+{
+	struct tcp_sock *tp;
+	u32 end_seq;
+
+	tp = tcp_sk(sk);
+
+	/* Right now we only need an outgoing ACK if FEC recovery failed,
+	 * in all other cases ACKs are implicitly generated
+	 */
+	switch (recovery_status) {
+	case FEC_LOSS_UNRECOVERED:
+		end_seq = tp->rx_opt.fec.enc_seq + tp->rx_opt.fec.enc_len;
+		tp->fec.flags |= TCP_FEC_RECOVERY_FAILED;
+		tp->fec.lost_len = end_seq - tp->rcv_nxt;
+		tcp_send_ack(sk);
+		break;
+	}
+}
+
+/* Reduces the congestion window (similar to completed fast recovery)
+ * If the node is already in 
recovery mode, undo is disabled to enforce
+ * the window reduction upon completion
+ */
+static void tcp_fec_reduce_window(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_ca_state < TCP_CA_CWR) {
+		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+		if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) {
+			tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+			tp->snd_cwnd_stamp = tcp_time_stamp;
+		}
+
+		/* Any future window reduction requests are ignored until
+		 * snd_nxt is ACKed
+		 */
+		tp->high_seq = tp->snd_nxt;
+		tp->undo_marker = 0;
+	} else {
+		/* Socket is in some congestion mode and we only need to make
+		 * sure that window reduction is executed when recovery
+		 * is finished
+		 */
+		tp->undo_marker = 0;
+	}
+}
+
+/* The incoming ACK indicates a failed recovery.
+ * Mark all unacked SKBs in the loss range as lost.
+ * TODO With interleaved coding, we have the additional constraint
+ * that the SKBs in the loss range also must have been encoded by the
+ * triggering FEC packet, and for that we need to keep some info
+ * about FEC packets on the sender side
+ */
+static void tcp_fec_mark_skbs_lost(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	u32 start_seq, end_seq;
+
+	skb = tp->lost_skb_hint ? tp->lost_skb_hint : tcp_write_queue_head(sk);
+
+	/* All SKBs falling completely in the range are marked */
+	start_seq = tp->rx_opt.fec.lost_seq;
+	end_seq = tp->rx_opt.fec.lost_seq + tp->rx_opt.fec.lost_len;
+
+	tcp_for_write_queue_from(skb, sk) {
+		if (skb == tcp_send_head(sk))
+			break;
+
+		/* Past loss range */
+		if (!before(TCP_SKB_CB(skb)->seq, end_seq))
+			break;
+
+		/* SKB not (fully) within range */
+		if (before(TCP_SKB_CB(skb)->seq, start_seq) ||
+		    after(TCP_SKB_CB(skb)->end_seq, end_seq))
+			continue;
+
+		/* SKB already marked */
+		if (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))
+			continue;
+
+		/* Verify retransmit hint before marking
+		 * (see tcp_verify_retransmit_hint(),
+		 * copied since method defined static in tcp_input.c)
+		 */
+		if ((tp->retransmit_skb_hint == NULL) ||
+		    before(TCP_SKB_CB(skb)->seq,
+			   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+			tp->retransmit_skb_hint = skb;
+
+		if (!tp->lost_out ||
+		    after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
+			tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
+
+		/* Mark SKB as lost (see tcp_skb_mark_lost()) */
+		tp->lost_out += tcp_skb_pcount(skb);
+		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+	}
+
+	tcp_verify_left_out(tp);
+}
+
+/* Searches for the FEC option in the packet header and replaces
+ * the long option with a short one padded by NOPs.
+ * This is done to convert the option used by an encoded packet
+ * to the option used by a recovered packet.
+ * Returns true if an FEC option was found and rewritten.
+ */
+static bool tcp_fec_update_decoded_option(struct sk_buff *skb)
+{
+	struct tcphdr *th = tcp_hdr(skb);
+	unsigned char *ptr;
+	int length;
+
+	ptr = (unsigned char *) (th + 1);
+	length = (th->doff * 4) - sizeof(struct tcphdr);
+
+	while (length > 0) {
+		int opcode = *ptr++;
+		int opsize;
+
+		switch (opcode) {
+		case TCPOPT_EOL:
+			return false;
+		case TCPOPT_NOP:
+			length--;
+			continue;
+		default:
+			opsize = *ptr++;
+			if (opsize < 2 || opsize > length)
+				return false;
+
+			if (opcode == TCPOPT_EXP &&
+			    opsize >= TCPOLEN_EXP_FEC_BASE && length >= 8 &&
+			    get_unaligned_be16(ptr) == TCPOPT_FEC_MAGIC) {
+				/* Update FEC option (8 bytes, in bounds):
+				 * 1. Convert long option into short option
+				 * 2. Clear ENCODED flag (keep other flags)
+				 * 3. Replace option value (long option) by NOPs
+				 * Options may be unaligned, so no u32 stores.
+				 */
+				unsigned char *opt = ptr - 2;
+				put_unaligned_be32((get_unaligned_be32(opt) &
+					0xFF00FFFF) | 0x00050000, opt);
+				put_unaligned_be32((get_unaligned_be32(opt + 4)
+					& 0xEF000000) | 0x00010101, opt + 4);
+
+				return true;
+			}
+
+			ptr += opsize - 2;
+			length -= opsize;
+		}
+	}
+
+	return false;
+}
+
+/* Allocates an SKB for data we want to forward to reception routines
+ * (recovered data) by making a copy of the FEC SKB and replacing the data
+ * part, all other segments (options, etc.) are preserved
+ */
+static struct sk_buff *tcp_fec_make_decoded_pkt(struct sock *sk,
+				const struct sk_buff *skb,
+				unsigned char *dec_data,
+				u32 seq, unsigned int len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *nskb;
+
+	nskb = skb_copy(skb, GFP_ATOMIC);
+	if (nskb == NULL)
+		return NULL;
+
+	/* Update FEC option for the new packet */
+	if (!tcp_fec_update_decoded_option(nskb)) {
+		/* No (valid) FEC option in the header: the copy is
+		 * useless and must not be leaked */
+		kfree_skb(nskb);
+		return NULL;
+	}
+
+	/* check if we received some tail of the recovered sequence already
+	 * by looking at the current SACK blocks (we don't want to recover
+	 * more data than necessary to prevent DSACKS)
+	 */
+	if (tcp_is_sack(tp)) {
+		int i;
+		for (i = 0; i < tp->rx_opt.num_sacks; i++) {
+			if (before(tp->selective_acks[i].start_seq,
+				   seq + len) &&
+			    !before(tp->selective_acks[i].end_seq,
+				    seq + len)) {
+				len = tp->selective_acks[i].start_seq - seq;
+				break;
+			}
+		}
+	}
+
+	/* len is unsigned and may have wrapped above; never copy more
+	 * than the payload area of the copied packet can hold */
+	if (len > nskb->len - tcp_hdrlen(nskb)) {
+		kfree_skb(nskb);
+		return NULL;
+	}
+
+	/* trim data section to fit recovered sequence if necessary */
+	if (len < (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq))
+		skb_trim(nskb, len + tcp_hdrlen(nskb));
+
+	/* fix the sequence numbers */
+	tcp_hdr(nskb)->seq = htonl(seq);
+	tcp_hdr(nskb)->ack_seq = htonl(tp->snd_una);
+	TCP_SKB_CB(nskb)->seq = seq;
+	TCP_SKB_CB(nskb)->end_seq = seq + len;
+
+	/* replace SKB payload with recovered data */
+	memcpy(nskb->data + tcp_hdrlen(nskb), dec_data, len);
+
+	/* packets used for recovery had their checksums checked already */
+	nskb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	return nskb;
+}
+
+/* Gets the next byte block from an SKB queue (any SKB which is touched
+ * in this procedure will be linearized to simplify payload access)
+ * @skb - Points to SKB from which previous block was extracted (useful
+ *	for successive calls to this function, which avoids moving through
+ *	the whole queue again)
+ * @queue - SKB queue to read from (SKB has to point to an element on this
+ *	queue)
+ * @seq - Sequence number of first byte in the block
+ * @block_len
+ * @block
+ *
+ * Returns the bytes written to the block memory
+ */
+static unsigned int tcp_fec_get_next_block(struct sock *sk,
+			struct sk_buff **skb,
+			struct sk_buff_head *queue, u32 seq,
+			unsigned int block_len, unsigned char *block)
+{
+	unsigned int cur_len, offset, num_bytes;
+	int err;
+	u32 end_seq;
+
+	cur_len = 0;
+
+	/* Get first SKB of the write queue and specify next sequence to
+	 * encode
+	 */
+	if (*skb 
== NULL) {+ *skb = skb_peek(queue);+ if (*skb == NULL)+ return 0;+ }++ /* move to SKB which stores the next sequence to encode */+ while (*skb) {+ /* If we observe an RST/SYN, we stop here to avoid+ * handling corner cases+ */+ if (TCP_SKB_CB(*skb)->tcp_flags &+ (TCPHDR_RST |+ TCPHDR_SYN))+ return 0;+ if (!before(seq, TCP_SKB_CB(*skb)->seq) &&+ before(seq, TCP_SKB_CB(*skb)->end_seq))+ break;+ if (*skb == skb_peek_tail(queue)) {+ *skb = NULL;+ break;+ }++ *skb = skb_queue_next(queue, *skb);+ }++ if (*skb == NULL)+ return 0;++ /* copy bytes from SKBs (connected sequences) */+ while (*skb && (cur_len < block_len)) {+ err = skb_linearize(*skb);+ if (err)+ return err;++ /* Deal with the end seq number being incremented by+ * one if the FIN flag is set (we don't want to encode this)+ */+ end_seq = TCP_SKB_CB(*skb)->end_seq;+ if (TCP_SKB_CB(*skb)->tcp_flags & TCPHDR_FIN)+ end_seq--;++ if ((seq >= TCP_SKB_CB(*skb)->seq) && (seq < end_seq)) {+ /* Copy data depending on:+ * - remaining space in the block+ * - remaining data in the SKB+ */+ offset = seq - TCP_SKB_CB(*skb)->seq;+ num_bytes = min(block_len - cur_len,+ end_seq - seq);++ memcpy(block + cur_len, (*skb)->data + offset,+ num_bytes);+ cur_len += num_bytes;+ seq += num_bytes;+ }++ if (*skb == skb_peek_tail(queue) || cur_len >= block_len)+ break;++ *skb = skb_queue_next(queue, *skb);+ }++ return cur_len;+}++/* Arms the timer for a delayed FEC transmission if there is+ * no earlier timeout defined (i.e. 
retransmission timeout)
 *
 * The delay is a quarter of the smoothed RTT. Nothing is armed if the
 * connection is not established, if all sent data was encoded already,
 * or if the pending timeout would fire before the FEC timer.
 */
void tcp_fec_arm_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned long timeout;

	/* Only arm a timer if connection is established */
	if (sk->sk_state != TCP_ESTABLISHED)
		return;

	/* Forward next sequence to be encoded if unencoded data was acked */
	if (after(tp->snd_una, tp->fec.next_seq))
		tp->fec.next_seq = tp->snd_una;

	/* Don't arm the timer if there is no unencoded data left */
	if (!before(tp->fec.next_seq, tp->snd_nxt))
		return;

	/* TODO handle other timers which might be armed;
	 * EARLY_RETRANS? LOSS_PROBE?
	 */

	/* Compute timeout (currently 0.25 * RTT).
	 * srtt_us stores the smoothed RTT in microseconds, scaled by 8,
	 * whereas the timer interface works on jiffies, so the value has
	 * to be converted.
	 */
	timeout = usecs_to_jiffies((tp->srtt_us >> 3) >> 2);

	/* TODO subtract the delay between the transmission of the original
	 * packet and this call from the timeout (and send immediately via
	 * tcp_fec_invoke_nodelay() if the delay exceeds it); the delay is
	 * not tracked yet
	 */

	/* Do not replace a timeout occurring earlier
	 * (comparison has to survive a jiffies wrap)
	 */
	if (time_after_eq(jiffies + timeout, icsk->icsk_timeout))
		return;

	inet_csk_reset_xmit_timer(sk, ICSK_TIME_FEC, timeout, TCP_RTO_MAX);
}

/* The FEC timer fired. Force an FEC transmission for the
 * last unencoded burst. Rearm the RTO timer (which was switched
 * out when setting the FEC timer). 
Set a new FEC timer if there+ * is pending unencoded data.+ */+void tcp_fec_timer(struct sock *sk)+{+ struct inet_connection_sock *icsk;+ struct tcp_sock *tp;++ icsk = inet_csk(sk);+ tp = tcp_sk(sk);++ tcp_fec_invoke_nodelay(sk);++ icsk->icsk_pending = 0;+ tcp_rearm_rto(sk);++ tcp_fec_arm_timer(sk);+}++/* If FEC packet transmissions are delayed set a timer+ * (if not already set), otherwise invoke the FEC mechanism+ * immediately+ */+int tcp_fec_invoke(struct sock *sk)+{+ struct inet_connection_sock *icsk;+ struct tcp_sock *tp;++ icsk = inet_csk(sk);+ tp = tcp_sk(sk);++#ifndef TCP_FEC_DELAYED_SEND+ return tcp_fec_invoke_nodelay(sk);+#else+ /* Set the timer for sending an FEC packet if no FEC+ * timer is active yet+ */+ if (!icsk->icsk_pending || icsk->icsk_pending != ICSK_TIME_FEC)+ tcp_fec_arm_timer(sk);+#endif++ return 0;+}++/* Invokes the FEC mechanism set for the connection;+ * Creates and sends out FEC packets+ */+int tcp_fec_invoke_nodelay(struct sock *sk)+{+ int err;+ struct sk_buff_head *list;+ struct sk_buff *skb;+ struct tcp_fec *fec;++ list = kmalloc(sizeof(struct sk_buff_head), GFP_ATOMIC);+ if (list == NULL)+ return -ENOMEM;++ skb_queue_head_init(list);+ err = tcp_fec_create(sk, list);+ if (err)+ goto clean;++ err = tcp_fec_xmit_all(sk, list);+ if (err)+ goto clean;++clean:+ /* Purge all SKBs (purge FEC structs first) */+ skb = (struct sk_buff *) list;+ while (!skb_queue_is_last(list, skb)) {+ skb = skb_queue_next(list, skb);+ fec = TCP_SKB_CB(skb)->fec;+ if (fec != NULL) {+ kfree(fec);+ TCP_SKB_CB(skb)->fec = NULL;+ }+ }++ skb_queue_purge(list);+ kfree(list);++ /* TODO error handling; -ENOMEM, etc. - disable FEC? 
*/++ return err;+}++/* Creates one or more FEC packets (can depend on the FEC type used)+ * and puts them in a queue+ * @list: queue head+ */+static int tcp_fec_create(struct sock *sk, struct sk_buff_head *list)+{+ struct tcp_sock *tp;+ unsigned int first_seq, block_len;+ int err;++ tp = tcp_sk(sk);++ /* Update the pointer to the first byte to be encoded next+ * (this only matters when a packet was ACKed before it was+ * encoded)+ */+ if (after(tp->snd_una, tp->fec.next_seq))+ tp->fec.next_seq = tp->snd_una;++ first_seq = tp->fec.next_seq;+ block_len = tcp_current_mss(sk);++ switch (tp->fec.type) {+ case TCP_FEC_TYPE_NONE:+ return 0;+ case TCP_FEC_TYPE_XOR_ALL:+ return tcp_fec_create_xor(sk, list, first_seq,+ block_len, 0,+ FEC_RCV_QUEUE_LIMIT - block_len);+ case TCP_FEC_TYPE_XOR_SKIP_1:+ err = tcp_fec_create_xor(sk, list, first_seq, block_len, 1,+ FEC_RCV_QUEUE_LIMIT - block_len);+ if (err)+ return err;++ return tcp_fec_create_xor(sk, list, first_seq + block_len,+ block_len, 1,+ FEC_RCV_QUEUE_LIMIT - block_len);+ }++ return 0;+}++/* Creates FEC packet(s) using XOR encoding+ * (allocates memory for the FEC structs)+ * @first_seq - Sequence number of first byte to be encoded+ * @block_len - Block length (typically MSS)+ * @block_skip - Number of unencoded blocks between two encoded blocks+ * @max_encoded_per_pkt - maximum number of blocks encoded per packet+ * (0, if unlimited)+ */+static int tcp_fec_create_xor(struct sock *sk, struct sk_buff_head *list,+ unsigned int first_seq, unsigned int block_len,+ unsigned int block_skip,+ unsigned int max_encoded_per_pkt)+{+ struct tcp_sock *tp;+ struct sk_buff *skb, *fskb;+ struct tcp_fec *fec;+ unsigned int c_encoded; /* Number of currently encoded blocks+ not yet added to an FEC packet */+ unsigned int next_seq; /* Next byte to encode */+ unsigned int i;+ unsigned char *data, *block;+ u16 data_len;++ tp = tcp_sk(sk);+ skb = NULL;+ c_encoded = 0;+ next_seq = first_seq;++ /* memory allocation+ * data - used temporarily to 
obtain byte blocks and store the payload+ (is freed before returning; we need two blocks here to store+ the previously XORed data that has not been added to an FEC+ packet yet, and the new to-be XORed data extracted from one+ or more existing buffers)++ * fec - used to store the FEC parameters+ (is freed after the corresponding packet is forwarded to the+ transmission routine)+ */+ data = kmalloc(2 * block_len, GFP_ATOMIC);+ if (data == NULL)+ return -ENOMEM;++ fec = kmalloc(sizeof(struct tcp_fec), GFP_ATOMIC);+ if (fec == NULL) {+ kfree(data);+ return -ENOMEM;+ }++ memset(data, 0, 2 * block_len);+ memset(fec, 0, sizeof(struct tcp_fec));++ block = data + block_len;++ /* encode data blocks+ * XXX atomicity check?+ */+ fec->enc_seq = next_seq;+ while ((data_len = tcp_fec_get_next_block(sk, &skb,+ &sk->sk_write_queue, next_seq,+ min(block_len, tp->snd_nxt - next_seq),+ block))) {+ /* Check if we reached the encoding limit; then create packet+ * with current payload and add it to the queue+ */+ if (max_encoded_per_pkt > 0 &&+ c_encoded >= max_encoded_per_pkt) {+ fskb = tcp_fec_make_encoded_pkt(sk, fec, data,+ block_len);+ if (fskb == NULL) {+ kfree(data);+ kfree(fec);+ return -EINVAL;+ }++ skb_queue_tail(list, fskb);+ memset(data, 0, block_len);+ c_encoded = 0;++ /* memory allocation for the FEC struct of the next+ * packet+ */+ fec = kmalloc(sizeof(struct tcp_fec), GFP_ATOMIC);+ if (fec == NULL) {+ kfree(data);+ return -ENOMEM;+ }++ memset(fec, 0, sizeof(struct tcp_fec));+ fec->enc_seq = next_seq;+ }++ next_seq += data_len;+ fec->enc_len = next_seq - fec->enc_seq;++ /* encode block into existing payload (XOR) */+ for (i = 0; i < data_len; i++)+ data[i] ^= block[i];++ c_encoded++;++ /* skip over blocks which are not requested for encoding */+ next_seq += block_len * block_skip;+ }++ /* create final packet if some data was selected for encoding */+ if (c_encoded > 0) {+ fskb = tcp_fec_make_encoded_pkt(sk, fec, data, block_len);+ if (fskb == NULL) {+ kfree(data);+ 
kfree(fec);+ return -EINVAL;+ }++ skb_queue_tail(list, fskb);+ } else {+ kfree(fec);+ }++ tp->fec.next_seq = next_seq;+ kfree(data);++ return 0;+}++/* Allocates an SKB for data we want to send and assigns+ * the necessary options and fields+ */+static struct sk_buff *tcp_fec_make_encoded_pkt(struct sock *sk,+ struct tcp_fec *fec,+ unsigned char *enc_data,+ unsigned int len)+{+ struct sk_buff *skb;+ unsigned char *data;++ /* See tcp_make_synack(); 15 probably for tail pointer etc.? */+ len = min(len, fec->enc_len);+ skb = alloc_skb(MAX_TCP_HEADER + 15 + len, GFP_ATOMIC);+ if (skb == NULL)+ return NULL;++ /* Reserve space for headers */+ skb_reserve(skb, MAX_TCP_HEADER);++ /* Specify sequence number and FEC struct address in control buffer */+ fec->flags |= TCP_FEC_ENCODED;+ TCP_SKB_CB(skb)->seq = fec->enc_seq;+ TCP_SKB_CB(skb)->fec = fec;++ /* Enable ACK flag (required for all data packets) */+ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_ACK;++ /* Set GSO parameters */+ skb_shinfo(skb)->gso_segs = 1;+ skb_shinfo(skb)->gso_size = 0;+ skb_shinfo(skb)->gso_type = 0;++ /* Append payload to SKB */+ data = skb_put(skb, len);+ memcpy(data, enc_data, len);++ skb->ip_summed = CHECKSUM_PARTIAL;++ return skb;+}++/* Transmit all FEC packets in a list */+static int tcp_fec_xmit_all(struct sock *sk, struct sk_buff_head *list)+{+ struct sk_buff *skb;+ int err;++ if (list == NULL || skb_queue_empty(list))+ return 0;++ skb = (struct sk_buff *) list;+ while (!skb_queue_is_last(list, skb)) {+ skb = skb_queue_next(list, skb);+ err = tcp_fec_xmit(sk, skb);+ if (err)+ return err;+ }++ return 0;+}++/* Transmits an FEC packet */+static int tcp_fec_xmit(struct sock *sk, struct sk_buff *skb)+{+ /* TODO timers? 
no retransmissions, but want to deactivate FEC+ * if we never get any FEC ACKs back+ */+ return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);+}diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.cindex f83ddf9..b640461 100644--- a/net/ipv4/tcp_input.c+++ b/net/ipv4/tcp_input.c@@ -70,6 +70,7 @@ #include <linux/kernel.h> #include <net/dst.h> #include <net/tcp.h>+#include <net/tcp_fec.h> #include <net/inet_common.h> #include <linux/ipsec.h> #include <asm/unaligned.h>@@ -106,6 +107,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */+#define FLAG_FEC_CWR_REQUESTED 0x80 /* cwnd reduction requested */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */@@ -115,8 +117,9 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)-#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)+#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_FEC_CWR_REQUESTED) #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)+#define FLAG_CONGESTION (FLAG_ECE|FLAG_FEC_CWR_REQUESTED) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))@@ -2546,7 +2549,11 @@ void tcp_enter_cwr(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); tp->prior_ssthresh = 0;- if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {++ /*+ * !!! 
TCP FEC patch !!!+ */+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR && after(tp->snd_una, tp->high_seq)) { tp->undo_marker = 0; tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR);@@ -2968,6 +2975,12 @@ void tcp_rearm_rto(struct sock *sk) if (tp->fastopen_rsk) return;+ /* Don't rearm the timer if an FEC timer is active.+ * The FEC handler will rearm the timer once the event is handled.+ */+ if (icsk->icsk_pending == ICSK_TIME_FEC)+ return;+ if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else {@@ -3228,16 +3241,23 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) /* Decide wheather to run the increase function of congestion control. */ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) {+ const struct tcp_sock *tp = tcp_sk(sk); if (tcp_in_cwnd_reduction(sk)) return false;+ /*+ * !!! TCP FEC patch !!!+ */+ if ((flag & FLAG_CONGESTION) && !(tp->snd_cwnd < tp->snd_ssthresh))+ return false;+ /* If reordering is high then always grow cwnd whenever data is * delivered regardless of its ordering. Otherwise stay conservative * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/ * new SACK or ECE mark may first advance cwnd here and later reduce * cwnd in tcp_fastretrans_alert() based on more states. */- if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)+ if (tp->reordering > sysctl_tcp_reordering) return flag & FLAG_FORWARD_PROGRESS; return flag & FLAG_DATA_ACKED;@@ -3425,6 +3445,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) icsk->icsk_retransmits = 0; }+ /* Check if FEC expects and executes a window reduction */+ if (tcp_fec_is_enabled(tp) && tcp_fec_check_ack(sk, ack))+ flag |= FLAG_FEC_CWR_REQUESTED;+ prior_fackets = tp->fackets_out; /* ts_recent update must be made after we are sure that the packet@@ -3656,6 +3680,20 @@ void tcp_parse_options(const struct sk_buff *skb, break; case TCPOPT_EXP:+ /*+ * !!! 
TCP FEC patch !!!+ */+ if (sysctl_tcp_fec &&+ get_unaligned_be16(ptr) ==+ TCPOPT_FEC_MAGIC) {+ tcp_fec_decode_option(&(opt_rx->fec),+ ntohl(th->seq),+ ntohl(th->ack_seq), th->syn,+ ptr + 2,+ opsize - TCPOLEN_EXP_FEC_BASE);+ break;+ }+ /* Fast Open option shares code 254 using a * 16 bits magic number. */@@ -4173,6 +4211,12 @@ static void tcp_ofo_queue(struct sock *sk) tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);+ /*+ * !!! TCP FEC patch !!!+ */+ if (tcp_fec_is_enabled(tp))+ tcp_fec_update_queue(sk, skb);+ tail = skb_peek_tail(&sk->sk_receive_queue); eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;@@ -4410,6 +4454,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) goto out_of_window; /* Ok. In sequence. In window. */+ if (tcp_fec_is_enabled(tp))+ tcp_fec_update_queue(sk, skb);+ if (tp->ucopy.task == current && tp->copied_seq == tp->rcv_nxt && tp->ucopy.len && sock_owned_by_user(sk) && !tp->urg_data) {@@ -4715,6 +4762,12 @@ static int tcp_prune_queue(struct sock *sk) tp->copied_seq, tp->rcv_nxt); sk_mem_reclaim(sk);+ /* Disable FEC if it was enabled to prevent keeping data+ * in the receive queue longer than necessary+ */+ if (tcp_fec_is_enabled(tp))+ tcp_fec_disable(sk);+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0;@@ -4998,6 +5051,21 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, /* Reset is accepted even if it did not pass PAWS. 
*/ }+ /* Special processing if FEC is enabled */+ if (tcp_fec_is_enabled(tp)) {+ if (tcp_fec_is_encoded(tp)) {+ tcp_fec_process(sk, skb);+ goto discard;+ } else if (!tp->rx_opt.fec.saw_fec && th->ack &&+ sk->sk_state == TCP_LAST_ACK) {+ /* TODO Sometimes the FEC option is not appended to the+ * FIN-ACK packet; socket options cleared?+ */+ tcp_ack(sk, skb, FLAG_SLOWPATH);+ goto discard;+ }+ }+ /* Step 1: check sequence number */ if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { /* RFC793, page 37: "In all states except SYN-SENT, all reset@@ -5099,6 +5167,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ tp->rx_opt.saw_tstamp = 0;+ tp->rx_opt.fec.saw_fec = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_prediction is to be made@@ -5461,6 +5530,15 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tcp_is_sack(tp) && sysctl_tcp_fack) tcp_enable_fack(tp);+ /*+ * FEC negotiation+ * Disable FEC if both ends do not agree on the FEC type used+ */+ if (tp->fec.type != tp->rx_opt.fec.type) {+ tp->fec.type = 0;+ tp->rx_opt.fec.type = 0;+ }+ tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk);@@ -5735,6 +5813,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tcp_initialize_rcv_mss(sk); tcp_fast_path_on(tp);++ /* SYN requested FEC usage */+ if (tp->rx_opt.fec.type > 0)+ tp->fec.type = tp->rx_opt.fec.type;+ break; case TCP_FIN_WAIT1: {diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.cindex d886b60..5efbc2e 100644--- a/net/ipv4/tcp_ipv4.c+++ b/net/ipv4/tcp_ipv4.c@@ -73,6 +73,9 @@ #include <net/timewait_sock.h> #include <net/xfrm.h> #include <net/secure_seq.h>++#include <net/tcp_fec.h>+ #include <net/tcp_memcontrol.h> #include <net/busy_poll.h>@@ -212,6 +215,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;+ memset(&(tp->rx_opt.fec), 0, sizeof(struct tcp_fec));+ /* 
Socket identity is still unknown (sport may be zero). * However we set state to SYN-SENT and not releasing socket * lock select source port, enter ourselves into the hash tables and@@ -2270,7 +2275,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) if (icsk->icsk_pending == ICSK_TIME_RETRANS || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||- icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE ||+ icsk->icsk_pending == ICSK_TIME_FEC) { timer_active = 1; timer_expires = icsk->icsk_timeout; } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.cindex 927586e..f59faf9 100644--- a/net/ipv4/tcp_minisocks.c+++ b/net/ipv4/tcp_minisocks.c@@ -552,6 +552,15 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0;+ newtp->high_seq = newtp->snd_nxt;++ /* TCP FEC option */+ newtp->rx_opt.fec.type = sysctl_tcp_fec ? req->fec_type : 0;+ newtp->fec.type = newtp->fec.flags = 0;+ newtp->fec.next_seq = newtp->snd_nxt;+ newtp->fec.bytes_rcv_queue = 0;+ skb_queue_head_init(&newtp->fec.rcv_queue);+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } return newsk;diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.cindex ddd2a6f..7791899 100644--- a/net/ipv4/tcp_output.c+++ b/net/ipv4/tcp_output.c@@ -37,6 +37,7 @@ #define pr_fmt(fmt) "TCP: " fmt #include <net/tcp.h>+#include <net/tcp_fec.h> #include <linux/compiler.h> #include <linux/gfp.h>@@ -65,6 +66,12 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;+/*+ * !!! 
TCP FEC patch !!!+ */+int sysctl_tcp_fec __read_mostly;++ unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);@@ -422,6 +429,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) #define OPTION_FAST_OPEN_COOKIE (1 << 8)+#define OPTION_FEC (1 << 9) struct tcp_out_options { u16 options; /* bit field of OPTION_* */@@ -432,6 +440,7 @@ struct tcp_out_options { __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */+ struct tcp_fec fec; /* FEC parameters */ }; /* Write previously computed TCP options to the packet.@@ -540,6 +549,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, } ptr += (len + 3) >> 2; }++ if (unlikely(OPTION_FEC & options))+ tcp_fec_encode_option(tp, &(opts->fec), &ptr); } /* Compute TCP options for SYN packets. This is not the final@@ -607,6 +619,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } }+ /* Prepare for FEC negotation if requested */+ if (unlikely(tcp_fec_is_enabled(tp)) &&+ remaining >= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED) {+ opts->options |= OPTION_FEC;+ opts->fec.type = tp->fec.type;+ remaining -= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED;+ }+ return MAX_TCP_OPTION_SPACE - remaining; }@@ -671,6 +691,16 @@ static unsigned int tcp_synack_options(struct sock *sk, } }+ /* Handle request for FEC support from other side+ * (respond with same FEC option if FEC is locally supported)+ */+ if (sysctl_tcp_fec && unlikely(req->fec_type) &&+ remaining >= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED) {+ opts->options |= OPTION_FEC;+ opts->fec.type = req->fec_type;+ remaining -= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED;+ }+ return MAX_TCP_OPTION_SPACE - remaining; }@@ -681,6 +711,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb struct tcp_out_options 
*opts, struct tcp_md5sig_key **md5) {+ struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; struct tcp_sock *tp = tcp_sk(sk); unsigned int size = 0; unsigned int eff_sacks;@@ -715,6 +746,19 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; }+ /* Prepare option if connection has FEC enabled */+ if (tcp_fec_is_enabled(tp)) {+ opts->options |= OPTION_FEC;+ if (tcb && tcb->fec)+ opts->fec = *(tcb->fec);++ /* regardless of packet type we need 4 more bytes+ * including alignment+ */+ size += 4;+ size += TCPOLEN_EXP_FEC_BASE;+ }+ return size; }@@ -895,7 +939,7 @@ void tcp_wfree(struct sk_buff *skb) * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,+int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) { const struct inet_connection_sock *icsk = inet_csk(sk);@@ -2055,6 +2099,9 @@ repair: break; }+ if (tcp_fec_is_enabled(tp))+ tcp_fec_invoke(sk);+ if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts;@@ -3153,6 +3200,12 @@ int tcp_connect(struct sock *sk) */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq;++ /* Initialize FEC members */+ tp->fec.next_seq = tp->snd_nxt;+ tp->fec.bytes_rcv_queue = 0;+ skb_queue_head_init(&tp->fec.rcv_queue);+ TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. 
*/diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.cindex dceaacc..b78ea8f 100644--- a/net/ipv4/tcp_timer.c+++ b/net/ipv4/tcp_timer.c@@ -21,6 +21,7 @@ #include <linux/module.h> #include <linux/gfp.h> #include <net/tcp.h>+#include <net/tcp_fec.h> int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;@@ -478,7 +479,15 @@ out_reset_timer: if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) __sk_dst_reset(sk);-out:;+out:+ /* FEC will switch out the RTO timer if a delayed FEC transmission+ * should happen earlier than this. RTO timer will be switched in+ * once the FEC timer fired.+ * FEC transmissions during a loss episode require that the sysctl+ * value is >= 2.+ */+ if (tcp_fec_is_enabled(tp) && sysctl_tcp_fec >= 2)+ tcp_fec_arm_timer(sk); } void tcp_write_timer_handler(struct sock *sk)@@ -503,6 +512,9 @@ void tcp_write_timer_handler(struct sock *sk) case ICSK_TIME_LOSS_PROBE: tcp_send_loss_probe(sk); break;+ case ICSK_TIME_FEC:+ tcp_fec_timer(sk);+ break; case ICSK_TIME_RETRANS: icsk->icsk_pending = 0; tcp_retransmit_timer(sk);diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.cindex c5078c5..d5205c6 100644--- a/net/ipv6/tcp_ipv6.c+++ b/net/ipv6/tcp_ipv6.c@@ -288,6 +288,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);+ memset(&(tp->rx_opt.fec), 0, sizeof(struct tcp_fec));+ inet->inet_dport = usin->sin6_port; tcp_set_state(sk, TCP_SYN_SENT);