All pastes #3453984 Raw Edit

TCP FEC patch modified to compile

public unlisted c v1 · immutable
#3453984 ·published 2016-04-06 13:55 UTC
rendered paste body
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.hindex 89bcfe8..9298207 100644--- a/include/linux/skbuff.h+++ b/include/linux/skbuff.h@@ -533,8 +533,10 @@ struct sk_buff { 	 * layer. Please put your private variables there. If you 	 * want to keep them across layers you have to do a skb_clone() 	 * first. This is owned by whoever has the skb queued ATM.+	 *+	 * Increased the CB to hold pointer to an FEC structure. 	 */-	char			cb[48] __aligned(8);+	char			cb[56] __aligned(8);  	unsigned long		_skb_refdst; #ifdef CONFIG_XFRMdiff --git a/include/linux/tcp.h b/include/linux/tcp.hindex 4345d49..ccc0e91 100644--- a/include/linux/tcp.h+++ b/include/linux/tcp.h@@ -79,6 +79,24 @@ struct tcp_sack_block { #define TCP_FACK_ENABLED  (1 << 1)   /*1 = FACK is enabled locally*/ #define TCP_DSACK_SEEN    (1 << 2)   /*1 = DSACK was received from peer*/ +/* Flags transmitted in the first FEC option byte after magic bytes+ * (except if option is used for negotiation) */+#define TCP_FEC_RECOVERY_CWR		0x80	/* Recovery triggered CWR */+#define TCP_FEC_RECOVERY_SUCCESSFUL	0x40	/* Local recovery done	  */+#define TCP_FEC_RECOVERY_FAILED		0x20	/* Local recovery failed  */+#define TCP_FEC_ENCODED			0x10	/* Packet is FEC-encoded  */++struct tcp_fec {+	u8	type;		/* Requested FEC type (negotiation only,+				 * see net/tcp_fec.h for type defs)	 */+	u32	enc_seq;	/* Sequence number of first encoded byte */+	u32	enc_len;	/* Encoding length			 */+	u32	lost_seq;	/* Sequence number of first lost byte	 */+	u32	lost_len;	/* Loss length				 */+	u8	flags;		/* See flag definitions above		 */+	bool	saw_fec;	/* FEC option was retrieved from packet	 */+};+ struct tcp_options_received { /*	PAWS/RTTM data	*/ 	long	ts_recent_stamp;/* Time we stored ts_recent (for aging) */@@ -95,12 +113,14 @@ struct tcp_options_received { 	u8	num_sacks;	/* Number of SACK blocks		*/ 	u16	user_mss;	/* mss requested by user in ioctl	*/ 	u16	mss_clamp;	/* Maximal mss, negotiated at connection setup */+	struct tcp_fec 
fec;	/* FEC-related parameters		*/ };  static inline void tcp_clear_options(struct tcp_options_received *rx_opt) { 	rx_opt->tstamp_ok = rx_opt->sack_ok = 0; 	rx_opt->wscale_ok = rx_opt->snd_wscale = 0;+	memset(&(rx_opt->fec), 0, sizeof(struct tcp_fec)); }  /* This is the max number of SACKS that we'll generate and process. It's safe@@ -327,6 +347,24 @@ struct tcp_sock { 	 * socket. Used to retransmit SYNACKs etc. 	 */ 	struct request_sock *fastopen_rsk;++/* TCP FEC parameters+ *	type - negotiated FEC type to be used+ *	next_seq - next sequence which was not FEC-encoded before+ *	lost_len - bytes after rcv_nxt considered lost+ *	flags - see TCP_FEC_* flag definitions above+ *	bytes_rcv_queue - number of bytes stored in queued SKBs+ *	rcv_queue - copies from the socket's receive queue kept for+ *		FEC recovery+ */+	struct {+		u8 type;+		u32 next_seq;+		u32 lost_len;+		u8 flags;+		u32 bytes_rcv_queue;+		struct sk_buff_head rcv_queue;+	} fec; };  enum tsq_flags {diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.hindex 06d0d0f..063aa59 100644--- a/include/net/inet_connection_sock.h+++ b/include/net/inet_connection_sock.h@@ -138,6 +138,7 @@ struct inet_connection_sock { #define ICSK_TIME_PROBE0	3	/* Zero window probe timer */ #define ICSK_TIME_EARLY_RETRANS 4	/* Early retransmit timer */ #define ICSK_TIME_LOSS_PROBE	5	/* Tail loss probe timer */+#define ICSK_TIME_FEC		6	/* FEC delayed send timer */  static inline struct inet_connection_sock *inet_csk(const struct sock *sk) {@@ -228,7 +229,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, 	}  	if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||-	    what == ICSK_TIME_EARLY_RETRANS || what ==  ICSK_TIME_LOSS_PROBE) {+	    what == ICSK_TIME_EARLY_RETRANS || what ==  ICSK_TIME_LOSS_PROBE ||+	    what == ICSK_TIME_FEC) { 		icsk->icsk_pending = what; 		icsk->icsk_timeout = jiffies + when; 		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, 
icsk->icsk_timeout);diff --git a/include/net/request_sock.h b/include/net/request_sock.hindex 610fa9e..1c1b4ba 100644--- a/include/net/request_sock.h+++ b/include/net/request_sock.h@@ -63,6 +63,8 @@ struct request_sock { 	struct sock			*sk; 	u32				secid; 	u32				peer_secid;+	u8				fec_type; /* Encoding type (see+						   * net/tcp_fec.h) */ };  static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops)diff --git a/include/net/tcp.h b/include/net/tcp.hindex d59f206..f894889 100644--- a/include/net/tcp.h+++ b/include/net/tcp.h@@ -182,6 +182,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);  * experimental options. See draft-ietf-tcpm-experimental-options-00.txt  */ #define TCPOPT_FASTOPEN_MAGIC	0xF989+#define TCPOPT_FEC_MAGIC	0xDC60  /*  *     TCP option lengths@@ -195,6 +196,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_FASTOPEN_BASE  2 #define TCPOLEN_EXP_FASTOPEN_BASE  4 +/*+ *	!!! TCP FEC patch !!!+ */+#define TCPOLEN_EXP_FEC_BASE   4+ /* But this is what stacks really send out. */ #define TCPOLEN_TSTAMP_ALIGNED		12 #define TCPOLEN_WSCALE_ALIGNED		4@@ -204,6 +210,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_SACK_PERBLOCK		8 #define TCPOLEN_MD5SIG_ALIGNED		20 #define TCPOLEN_MSS_ALIGNED		4+#define TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED	8  /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF		1	/* Nagle's algo is disabled */@@ -230,6 +237,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define	TFO_SERVER_WO_SOCKOPT1	0x400 #define	TFO_SERVER_WO_SOCKOPT2	0x800 +/*+ *      !!! TCP FEC patch !!!+ */++/* Maximum number of in-order bytes kept in the receiver's buffer for FEC+ * recoveries. The sender will never send more than this in a single FEC+ * packet. 
*/+#define FEC_RCV_QUEUE_LIMIT    16000+ extern struct inet_timewait_death_row tcp_death_row;  /* sysctl variables for tcp */@@ -274,6 +290,12 @@ extern int sysctl_tcp_thin_dupack; extern int sysctl_tcp_early_retrans; extern int sysctl_tcp_limit_output_bytes; extern int sysctl_tcp_challenge_ack_limit;++/*+ *      !!! TCP FEC patch !!!+ */+extern int sysctl_tcp_fec;+ extern unsigned int sysctl_tcp_notsent_lowat; extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_autocorking;@@ -725,6 +747,7 @@ struct tcp_skb_cb { 	__u8		ip_dsfield;	/* IPv4 tos or IPv6 dsfield	*/ 	/* 1 byte hole */ 	__u32		ack_seq;	/* Sequence number ACK'd	*/+	struct tcp_fec	*fec;		/* FEC parameters		*/ };  #define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))@@ -1131,6 +1154,11 @@ static inline void tcp_openreq_init(struct request_sock *req, 	ireq->ecn_ok = 0; 	ireq->ir_rmt_port = tcp_hdr(skb)->source; 	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);+	+	/*+	 *      !!! TCP FEC patch !!!+	 */+	req->fec_type = rx_opt->fec.type; }  extern void tcp_openreq_init_rwin(struct request_sock *req,diff --git a/include/net/tcp_fec.h b/include/net/tcp_fec.hnew file mode 100644index 0000000..38f2c40--- /dev/null+++ b/include/net/tcp_fec.h@@ -0,0 +1,106 @@+#ifndef _TCP_FEC_H+#define _TCP_FEC_H++#include <net/tcp.h>+#include <asm/unaligned.h>++/* FEC-encoding types (8 bits, internal) */+#define TCP_FEC_TYPE_NONE		0 /* FEC disabled */+#define TCP_FEC_TYPE_XOR_ALL		1 /* XOR every MSS length segment */+#define TCP_FEC_TYPE_XOR_SKIP_1		2 /* XOR every other MSS length+					   * segment */++#define TCP_FEC_NUM_TYPES		3++/* Delay transmission of FEC packets (delay defined in tcp_fec_arm_timer()) */+#define TCP_FEC_DELAYED_SEND		1++/*+ * Returns true if FEC is enabled for the socket+ */+static inline bool tcp_fec_is_enabled(const struct tcp_sock *tp)+{+	return unlikely(tp->fec.type > 0);+}++/*+ * Returns true if the current packet in the buffer is FEC-encoded+ */+static inline bool tcp_fec_is_encoded(const 
struct tcp_sock *tp)+{+	return unlikely((tp->rx_opt.fec.flags & TCP_FEC_ENCODED) &&+			(tp->rx_opt.fec.saw_fec));+}++/*+ * Decodes FEC parameters and stores them in the FEC struct+ * @seq - sequence number of the packet+ * @ack_seq - ACKed sequence number+ * @is_syn - true, if option was attached to a packet with a SYN flag+ * @ptr - points to the first byte of the FEC option after kind, length,+ *	  and possible magic bytes+ * @len - option length (without kind, length, magic bytes)+ */+int tcp_fec_decode_option(struct tcp_fec *fec, u32 seq, u32 ack_seq,+			bool is_syn, const unsigned char *ptr,+			unsigned int len);++/*+ * Encodes FEC parameters to wire format+ * Pointer points to the location the option (kind, length, magic bytes,+ * value) is written to (pointer will be moved to first unoccupied byte)+ */+int tcp_fec_encode_option(struct tcp_sock *tp, struct tcp_fec *fec,+			__be32 **ptr);++/*+ * Processes the current packet in the buffer (treated as FEC packet)+ */+int tcp_fec_process(struct sock *sk, struct sk_buff *skb);++/*+ * Checks the received options for loss indicators and acts upon them.+ * In particular, the function handles window reduction requests and processes+ * tail loss indicators.+ * Returns: 1, if the ACK contains a recovery/loss indicator - 0, otherwise+ */+int tcp_fec_check_ack(struct sock *sk, u32 ack_seq);++/*+ * Since data in the socket's receive queue can get consumed by other parties+ * we need to keep extra references to these SKBs until they are no longer+ * required for possible future recoveries.+ * @skb - buffer which is moved to the receive queue+ */+int tcp_fec_update_queue(struct sock *sk, struct sk_buff *skb);++/*+ * Disables FEC for this connection (includes clearing references+ * to buffers in receive queue)+ */+void tcp_fec_disable(struct sock *sk);++/* Arms the timer for a delayed FEC transmission if there is+ * no earlier timeout defined (i.e. retransmission timeout)+ */+void tcp_fec_arm_timer(struct sock *sk);++/* The FEC timer fired. 
Force an FEC transmission for the+ * last unencoded burst. Rearm the RTO timer (which was switched+ * out when setting the FEC timer). Set a new FEC timer if there+ * is pending unencoded data.+ */+void tcp_fec_timer(struct sock *sk);++/* If FEC packets transmissions are delayed set a timer+ * (if not already set), otherwise invoke the FEC mechanism+ * immediately+ */+int tcp_fec_invoke(struct sock *sk);++/* Invoke the FEC mechanism set for the connection;+ * Create and sends out FEC packets+ */+int tcp_fec_invoke_nodelay(struct sock *sk);++#endifdiff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.hindex 3b97183..d679733 100644--- a/include/uapi/linux/tcp.h+++ b/include/uapi/linux/tcp.h@@ -113,6 +113,11 @@ enum { #define TCP_TIMESTAMP		24 #define TCP_NOTSENT_LOWAT	25	/* limit number of unsent bytes in write queue */ +/*+ *      !!! TCP FEC patch !!!+ */+#define TCP_FEC                        79      /* Forward error correction */+ struct tcp_repair_opt { 	__u32	opt_code; 	__u32	opt_val;diff --git a/net/ipv4/Makefile b/net/ipv4/Makefileindex 518c04e..6aa32ca 100644--- a/net/ipv4/Makefile+++ b/net/ipv4/Makefile@@ -6,7 +6,7 @@ obj-y     := route.o inetpeer.o protocol.o \ 	     ip_input.o ip_fragment.o ip_forward.o ip_options.o \ 	     ip_output.o ip_sockglue.o inet_hashtables.o \ 	     inet_timewait_sock.o inet_connection_sock.o \-	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \+	     tcp.o tcp_fec.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ 	     tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ 	     tcp_offload.o datagram.o raw.o udp.o udplite.o \ 	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.cindex 0d438fb..9cfa3d3 100644--- a/net/ipv4/inet_diag.c+++ b/net/ipv4/inet_diag.c@@ -183,7 +183,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,  	if (icsk->icsk_pending == ICSK_TIME_RETRANS || 	    icsk->icsk_pending == 
ICSK_TIME_EARLY_RETRANS ||
-	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE ||
+	    icsk->icsk_pending == ICSK_TIME_FEC) {
 		r->idiag_timer = 1;
 		r->idiag_retrans = icsk->icsk_retransmits;
 		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index eeb17b3..9c58530 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -28,6 +28,7 @@
 
 static int zero;
 static int one = 1;
+static int two = 2;
 static int four = 4;
 static int gso_max_segs = GSO_MAX_SEGS;
 static int tcp_retr1_max = 255;
@@ -810,6 +811,15 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one
 	},
+	{
+		.procname	= "tcp_fec",
+		.data		= &sysctl_tcp_fec,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax, /* enforces extra1/extra2 */
+		.extra1		= &zero,
+		.extra2		= &two,
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b8ff562..1a2dab5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -276,6 +276,8 @@
 #include <net/ip.h>
 #include <net/sock.h>
 
+#include <net/tcp_fec.h>
+
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 #include <net/busy_poll.h>
@@ -2565,6 +2567,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		else
 			tp->tsoffset = val - tcp_time_stamp;
 		break;
+	case TCP_FEC:
+		if (sysctl_tcp_fec && val >= 0 && val < TCP_FEC_NUM_TYPES)
+			tp->fec.type = val;
+		else
+			err = -EINVAL;
+		break;
 	case TCP_NOTSENT_LOWAT:
 		tp->notsent_lowat = val;
 		sk->sk_write_space(sk);
@@ -2792,6 +2800,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp + tp->tsoffset;
 		break;
+	case TCP_FEC:
+		val = tp->fec.type;
+		break;
 	case TCP_NOTSENT_LOWAT:
 		val = tp->notsent_lowat;
 		break;
diff --git a/net/ipv4/tcp_fec.c b/net/ipv4/tcp_fec.c
new file mode 100644
index 0000000..53711cb
--- /dev/null
+++ b/net/ipv4/tcp_fec.c
@@ -0,0 +1,1253 @@
+#include <net/tcp_fec.h>
+
+/* Codes for 
incoming FEC packet processing */+#define FEC_NO_LOSS		1+#define FEC_LOSS_UNRECOVERED	2+#define FEC_LOSS_RECOVERED	3++/* Receiver routines */+static int tcp_fec_process_xor(struct sock *sk, const struct sk_buff *skb,+			unsigned int block_skip);+static int tcp_fec_recover(struct sock *sk, const struct sk_buff *skb,+			unsigned char *data, u32 seq, int len);+static void tcp_fec_send_ack(struct sock *sk, const struct sk_buff *skb,+			int recovery_status);+static void tcp_fec_reduce_window(struct sock *sk);+static void tcp_fec_mark_skbs_lost(struct sock *sk);+static bool tcp_fec_update_decoded_option(struct sk_buff *skb);+static struct sk_buff *tcp_fec_make_decoded_pkt(struct sock *sk,+			const struct sk_buff *skb, unsigned char *dec_data,+			u32 seq, unsigned int len);++/* Sender routines */+static int tcp_fec_create(struct sock *sk, struct sk_buff_head *list);+static int tcp_fec_create_xor(struct sock *sk, struct sk_buff_head *list,+			unsigned int first_seq, unsigned int block_len,+			unsigned int block_skip,+			unsigned int max_encoded_per_pkt);+static struct sk_buff *tcp_fec_make_encoded_pkt(struct sock *sk,+			struct tcp_fec *fec, unsigned char *enc_data,+			u32 seq);+static int tcp_fec_xmit_all(struct sock *sk, struct sk_buff_head *list);+static int tcp_fec_xmit(struct sock *sk, struct sk_buff *skb);++/* Buffer access routine */+static unsigned int tcp_fec_get_next_block(struct sock *sk,+			struct sk_buff **skb, struct sk_buff_head *queue,+			u32 seq, unsigned int block_len,+			unsigned char *block);++/* Have to define this signature here since the actual function was static+ * and tcp_output.c has no corresponding header file+ */+extern int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,+			gfp_t gfp_mask);++/* Decodes FEC parameters and stores them in the FEC struct+ * @seq - sequence number of the packet+ * @ack_seq - ACKed sequence number+ * @is_syn - true, if option was attached to a packet with a SYN flag+ * @ptr - points to the first 
byte of the FEC option after kind, length,+ *	  and possible magic bytes+ * @len - option length (without kind, length, magic bytes)+ */+int  tcp_fec_decode_option(struct tcp_fec *fec, u32 seq, u32 ack_seq,+				bool is_syn, const unsigned char *ptr,+				unsigned int len)+{+	/* reset / initialize option values which should be evaluated+	 * with EVERY incoming packet+	 */+	fec->flags = 0;+	fec->saw_fec = 1;++	if (len == 1) {+		/* Short option */+		u8 val = *((u8 *) ptr);+		if (is_syn) {+			/* Negotiation */+			fec->type = val;+		} else {+			/* Regular packet */+			fec->flags = val;+		}++		return 0;+	}++	if (len == 4) {+		/* Long option */+		u32 val = get_unaligned_be32(ptr);+		fec->flags = val >> 24;++		if (fec->flags & TCP_FEC_ENCODED) {+			fec->enc_seq = seq;+			fec->enc_len = val & 0xFFFFFF;+		} else if (fec->flags & TCP_FEC_RECOVERY_FAILED) {+			fec->lost_seq = ack_seq;+			fec->lost_len = val & 0xFFFFFF;+		} else {+			return -EINVAL;+		}++		return 0;+	}++	/* Invalid option length */+	return -EINVAL;+}++/* Encodes FEC parameters to wire format+ * @ptr - Encoded option is written to this memory location (and the pointer+ *        is advanced to the next unoccupied byte, 4-byte aligned)+ * Returns the length of the encoded option (including alignment)+ */+int tcp_fec_encode_option(struct tcp_sock *tp, struct tcp_fec *fec,+			__be32 **ptr)+{+	int len;++	fec->flags |= tp->fec.flags;+	fec->lost_len = tp->fec.lost_len;+	tp->fec.flags &= ~TCP_FEC_RECOVERY_CWR;+	tp->fec.flags &= ~TCP_FEC_RECOVERY_FAILED;++	/* Encode fixed option part (option kind, length, and magic bytes) */+	if (fec->flags & (TCP_FEC_ENCODED | TCP_FEC_RECOVERY_FAILED))+		len = 4 + TCPOLEN_EXP_FEC_BASE; /* Long option */+	else+		len = 1 + TCPOLEN_EXP_FEC_BASE; /* Short option */++	**ptr = htonl((TCPOPT_EXP << 24) | (len << 16) | TCPOPT_FEC_MAGIC);+	(*ptr)++;++	if ((fec->flags & TCP_FEC_ENCODED) &&+	    (fec->flags & TCP_FEC_RECOVERY_FAILED)) {+		/* TODO Special case: need to separate loss indication+		 * 
from encoding or make option 12 bytes long+		 * This can only happen if a node receives and sends FEC+		 * data+		 */+		fec->flags &= ~TCP_FEC_RECOVERY_FAILED;+	}++	if (fec->flags & TCP_FEC_ENCODED) {+		/* FEC-encoded packets carry:+		 * <Flags:8, Encoding length:24>+		 */+		**ptr = htonl((fec->flags << 24) |+			      (fec->enc_len));+		(*ptr)++;+		return 8;+	} else if (fec->flags & TCP_FEC_RECOVERY_FAILED) {+		/* Packets with failed recovery indication carry:+		 * <Flags:8, Bytes after ACKed seq lost:24>+		 */+		**ptr = htonl((fec->flags << 24) |+			      (fec->lost_len));+		(*ptr)++;+		return 8;+	} else if (fec->type) {+		/* Negotiation packets carry: <Encoding type:8> */+		**ptr = htonl((fec->type << 24) |+			      (TCPOPT_NOP << 16) |+			      (TCPOPT_NOP << 8) |+			      TCPOPT_NOP);+		(*ptr)++;+		return 8;+	} else {+		/* All other packets carry: <Flags:8> */+		**ptr = htonl((fec->flags << 24) |+			      (TCPOPT_NOP << 16) |+			      (TCPOPT_NOP << 8) |+			      TCPOPT_NOP);+		(*ptr)++;+		return 8;+	}+}++/* Processes the current packet in the buffer, treated as an FEC packet+ * (assumes that options were already processed)+ */+int tcp_fec_process(struct sock *sk, struct sk_buff *skb)+{+	struct tcp_sock *tp;+	struct tcphdr *th;+	int recovery_status, err;+	u32 end_seq;++	tp = tcp_sk(sk);+	th = tcp_hdr(skb);+	recovery_status = 0;++	/* drop packet if packet is not encoded */+	if (!(tp->rx_opt.fec.flags & TCP_FEC_ENCODED))+		return -1;++	/* check if all encoded packets were already received */+	end_seq = tp->rx_opt.fec.enc_seq + tp->rx_opt.fec.enc_len;+	if (!after(end_seq, tp->rcv_nxt)) {+		tcp_fec_send_ack(sk, skb, FEC_NO_LOSS);+		return 0;+	}++	/* linearize the SKB (for easier payload access) */+	err = skb_linearize(skb);+	if (err)+		return err;++	/* data recovery */+	switch (tp->fec.type) {+	case TCP_FEC_TYPE_NONE:+		return -1;+	case TCP_FEC_TYPE_XOR_ALL:+		recovery_status = tcp_fec_process_xor(sk, skb, 0);+		break;+	case TCP_FEC_TYPE_XOR_SKIP_1:+		
recovery_status = tcp_fec_process_xor(sk, skb, 1);+		break;+	}++	/* TODO error handling; -ENOMEM, etc. - disable FEC? */+	if (recovery_status < 0)+		return recovery_status;++	/* Send an explicit ACK if recovery failed */+	if (recovery_status == FEC_LOSS_UNRECOVERED)+		tcp_fec_send_ack(sk, skb, recovery_status);++	return 0;+}++/* Checks the received options for loss indicators and acts upon them.+ * In particular, the function handles recovery flags (indicators for+ * successful and failed recoveries, tail losses)+ * Returns: 1, if ACK contains a loss indicator+ */+int tcp_fec_check_ack(struct sock *sk, u32 ack_seq)+{+	struct tcp_sock *tp;++	tp = tcp_sk(sk);++	/* Clear local recovery indication (and ECN CWR demand)+	 * if it was ACKED by the other node+	 */+	if (tp->rx_opt.fec.flags & TCP_FEC_RECOVERY_CWR) {+		tp->fec.flags &= ~TCP_FEC_RECOVERY_SUCCESSFUL;+		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;+	}++	/* Check for tail loss indicators+	 * This happens when FEC was unable to recover the lost data and+	 * thus only sends an ACK with the loss range back. Everything not+	 * ACKed/SACKed now, is considered lost now.+	 */+	if (tp->rx_opt.fec.flags & TCP_FEC_RECOVERY_FAILED) {+		tcp_fec_mark_skbs_lost(sk);+		return 1;+	}++	/* Check if the remote endpoint successfully recovered data,+	 * if so we trigger a window reduction+	 */+	if (tp->rx_opt.fec.flags & TCP_FEC_RECOVERY_SUCCESSFUL) {+		/* Ignore flag if window was already reduced for the current+		 * loss episode or if previous reduction was not signaled+		 * yet (no outgoing packets)+		 */+		if (after(ack_seq, tp->high_seq) &&+				!(tp->fec.flags & TCP_FEC_RECOVERY_CWR)) {+			tcp_fec_reduce_window(sk);+			tp->fec.flags |= TCP_FEC_RECOVERY_CWR;+		}++		return 1;+	}++	return 0;+}++/* Since data in the socket's receive queue can get consumed by other parties+ * we need to clone these SKBs until they are no longer required for possible+ * future recoveries. 
This function is called after the TCP header has been
+ * removed from the SKB already. All parameters required for recovery are
+ * stored in the SKB's control buffer.
+ * @skb - buffer which is moved to the receive queue
+ * Returns 0 on success (or empty payload), a negative error code otherwise
+ */
+int tcp_fec_update_queue(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp;
+	struct sk_buff *cskb;
+	u32 data_len;
+	int extra_bytes, err;
+	tp = tcp_sk(sk);
+
+	/* clone the SKB and add it to the FEC receive queue
+	 * (a simple extra reference to the SKB is not sufficient
+	 * since SKBs can only be queued on one list at a time)
+	 */
+	cskb = skb_clone(skb, GFP_ATOMIC);
+	if (cskb == NULL)
+		return -ENOMEM;
+
+	/* linearize the clone (for easier payload access); we own it until it
+	 * is queued, so free it on linearization failure or empty payload
+	 */
+	err = skb_linearize(cskb);
+	data_len = skb->len;
+	if (err || !data_len) {
+		kfree_skb(cskb);
+		return err;
+	}
+
+	skb_queue_tail(&tp->fec.rcv_queue, cskb);
+	tp->fec.bytes_rcv_queue += data_len;
+
+	/* check if we can dereference old SKBs (as long as we have enough
+	 * data for future recoveries)
+	 */
+	extra_bytes = tp->fec.bytes_rcv_queue - FEC_RCV_QUEUE_LIMIT;
+	while (extra_bytes > 0) {
+		cskb = skb_peek(&tp->fec.rcv_queue);
+		if (cskb == NULL)
+			return -EINVAL;
+
+		data_len = TCP_SKB_CB(cskb)->end_seq - TCP_SKB_CB(cskb)->seq;
+		if (data_len > extra_bytes) {
+			break;
+		} else {
+			extra_bytes -= data_len;
+			tp->fec.bytes_rcv_queue -= data_len;
+			skb_unlink(cskb, &tp->fec.rcv_queue);
+			kfree_skb(cskb);
+		}
+	}
+
+	return 0;
+}
+
+/* Disables FEC for this connection (includes clearing references
+ * to buffers in receive queue)
+ */
+void tcp_fec_disable(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tcp_fec_is_enabled(tp))
+		return;
+
+	tp->fec.type = 0;
+	tp->fec.bytes_rcv_queue = 0;
+	skb_queue_purge(&tp->fec.rcv_queue);
+}
+
+/* Processes the current packet in the buffer, treated as an FEC packet
+ * with XOR-encoded payload (assumes that options were already processed)
+ * Returns: negative code, if an error occurred;
+ *	positive code, otherwise (recovery 
status)+ * @block_skip - Number of unencoded blocks between two encoded blocks+ */+static int tcp_fec_process_xor(struct sock *sk, const struct sk_buff *skb,+			unsigned int block_skip)+{+	struct sk_buff *pskb;+	struct tcp_sock *tp;+	struct tcphdr *th;+	u32 next_seq, end_seq, rec_seq;+	unsigned char *data, *block;+	unsigned int i, offset, data_len, block_len, rec_len;+	bool seen_loss;+	int ret;++	pskb = NULL;+	tp = tcp_sk(sk);+	th = tcp_hdr(skb);+	next_seq = tp->rx_opt.fec.enc_seq;+	end_seq = next_seq + tp->rx_opt.fec.enc_len;+	block_len = skb->len - tcp_hdrlen(skb);+	seen_loss = false;+	offset = 0;++	/* memory allocation for decoding / recovered SKB data */+	data = kmalloc(2 * block_len, GFP_ATOMIC);+	if (data == NULL)+		return -ENOMEM;++	block = data + block_len;++	/* copy FEC payload (skip TCP header) */+	memcpy(data, skb->data + tcp_hdrlen(skb), block_len);++	/* process in-sequence data */+	while ((data_len = tcp_fec_get_next_block(sk, &pskb,+				&tp->fec.rcv_queue, next_seq,+				min(block_len, end_seq - next_seq),+				block))) {+		next_seq += data_len;++		/* XOR with existing payload */+		for (i = 0; i < data_len; i++)+			data[i] ^= block[i];++		/* we could no read a whole MSS block, which means we+		 * reached the end of the queue or end of range which the+		 * FEC packet covers+		 */+		if (data_len < block_len)+			break;++		/* skip unencoded blocks if there is more data encoded */+		if (end_seq - next_seq > 0)+			next_seq += block_len * block_skip;+	}++	/* check if all encoded bytes were already received */+	if (next_seq == end_seq) {+		kfree(data);+		return FEC_NO_LOSS;+	}++	/* we always recover one whole MSS block (otherwise slicing+	 * would introduce a lot of additional complexity here) and handle+	 * cut out already received sequences later+	 */+	rec_seq = next_seq;+	rec_len = min(block_len, end_seq - rec_seq);+	offset  = data_len;+	if ((rec_seq + rec_len) == end_seq)+		goto recover;++	next_seq += block_len * (block_skip + 1);+	pskb = NULL;++	/* read a 
possibly partial (smaller than MSS) block to fill up the+	 * previously unfilled block and achieve alignment again+	 */+	data_len = tcp_fec_get_next_block(sk, &pskb, &tp->out_of_order_queue,+				next_seq, block_len - offset, block);++	next_seq += data_len;++	/* check if we could not read as much data as requested */+	if ((next_seq != end_seq) && (data_len < (block_len - offset)))+		goto clean;++	/* XOR with existing payload */+	for (i = 0; i < data_len; i++)+		data[i+offset] ^= block[i];++	/* skip unencoded blocks if there is more data encoded */+	if (end_seq - next_seq > 0)+		next_seq += block_len * block_skip;++	/* read all necessary blocks to finish decoding */+	while ((data_len = tcp_fec_get_next_block(sk, &pskb,+				&tp->out_of_order_queue, next_seq,+				min(block_len, end_seq - next_seq),+				block))) {+		next_seq += data_len;++		/* XOR with existing payload */+		for (i = 0; i < data_len; i++)+			data[i] ^= block[i];++		/* we could not read a whole MSS block, which means we reached+		 * the end of the queue or end of range which the FEC packet+		 * covers+		 */+		if (data_len < block_len)+			break;++		/* skip unencoded blocks if there is more data encoded */+		if (end_seq - next_seq > 0)+			next_seq += block_len * block_skip;+	}++	/* check if additional losses were observed (cannot recover) */+	if (next_seq != end_seq)+		goto clean;++recover:+	/* create and process recovered packets */+	for (i = 0; i < rec_len; i++)+		block[i] = data[(offset + i) % block_len];++	if (block_skip && ((block_len - offset) < rec_len)) {+		/* recover non-consecutive sequence ranges (only when+		 * slicing is used)+		 */+		u32 second_seq;+		unsigned int second_seq_len, first_seq_len;++		first_seq_len = block_len - offset;+		second_seq = rec_seq + first_seq_len + block_len * block_skip;+		second_seq_len = rec_len - first_seq_len;++		ret = tcp_fec_recover(sk, skb, block, rec_seq, first_seq_len);+		if (ret >= 0) {+			int second_ret = tcp_fec_recover(sk, skb,+						block + 
first_seq_len,+						second_seq, second_seq_len);+			if (second_ret < 0 || !ret)+				ret = second_ret;+		}+	} else {+		ret = tcp_fec_recover(sk, skb, block, rec_seq, rec_len);+	}++	kfree(data);+	return ret ? ret : FEC_LOSS_RECOVERED;++clean:+        kfree(data);+        return FEC_LOSS_UNRECOVERED;+}++/* Create a recovered packet and forward it to the reception routine */+static int tcp_fec_recover(struct sock *sk, const struct sk_buff *skb,+		unsigned char *data, u32 seq, int len)+{+	struct sk_buff *rskb;+	struct tcp_sock *tp;++	tp = tcp_sk(sk);++	/* We will notify the remote node that recovery was successful */+	tp->fec.flags |= TCP_FEC_RECOVERY_SUCCESSFUL;++	/* Check if we received some tail of the recovered sequence already+	 * by looking at the current SACK blocks (we don't want to recover+	 * more data than necessary to prevent DSACKS)+	 */+	if (tcp_is_sack(tp)) {+		int i;+		for (i = 0; i < tp->rx_opt.num_sacks; i++) {+			if (before(tp->selective_acks[i].start_seq,+				   seq + len) &&+			   !before(tp->selective_acks[i].end_seq,+				   seq + len)) {+				len = tp->selective_acks[i].start_seq - seq;+				break;+			}+		}+	}++	/* We might have prematurely asked for a recovery in the case where the+	 * whole recovery sequence is already covered by SACKs+	 */+	if (len <= 0)+		return FEC_NO_LOSS;++	/* Create decoded packet and forward to reception routine */+	rskb = tcp_fec_make_decoded_pkt(sk, skb, data, seq, len);+	if (rskb == NULL)+		return -EINVAL;++	tcp_rcv_established(sk, rskb, tcp_hdr(rskb), rskb->len);+	return 0;+}++/* Sends an ACK for the FEC packet and encodes any congestion or+ * and/or recovery information+ */+static void tcp_fec_send_ack(struct sock *sk, const struct sk_buff *skb,+				int recovery_status)+{+	struct tcp_sock *tp;+	u32 end_seq;++	tp = tcp_sk(sk);++	/* Right now we only need an outgoing ACK if FEC recovery failed,+	 * in all other cases ACKs are implicitly generated+	 */+	switch (recovery_status) {+	case FEC_LOSS_UNRECOVERED:+		end_seq = 
tp->rx_opt.fec.enc_seq + tp->rx_opt.fec.enc_len;+		tp->fec.flags |= TCP_FEC_RECOVERY_FAILED;+		tp->fec.lost_len = end_seq - tp->rcv_nxt;+		tcp_send_ack(sk);+		break;+	}+}++/* Reduces the congestion window (similar to completed fast recovery)+ * If the node is already in recovery mode, undo is disabled to enforce+ * the window reduction upon completion+ */+static void tcp_fec_reduce_window(struct sock *sk)+{+	struct tcp_sock *tp;+	const struct inet_connection_sock *icsk;++	tp = tcp_sk(sk);+	icsk = inet_csk(sk);++	if (icsk->icsk_ca_state < TCP_CA_CWR) {+		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);+		if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) {+			tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);+			tp->snd_cwnd_stamp = tcp_time_stamp;+		}++		/* Any future window reduction requests are ignored until+		 * snd_nxt is ACKed+		 */+		tp->high_seq = tp->snd_nxt;+		tp->undo_marker = 0;+	} else {+		/* Socket is in some congestion mode and we only need to make+		 * sure that window reduction is executed when recovery+		 * is finished+		 */+		tp->undo_marker = 0;+	}+}++/* The incoming ACK indicates a failed recovery.+ * Mark all unacked SKBs in the loss range as lost.+ * TODO With interleaved coding, we have the additional constraint+ * that the SKBs in the loss range also must have been encoded the+ * triggering FEC packet, and for that we need to keep some info+ * about FEC packets on the sender side+ */+static void tcp_fec_mark_skbs_lost(struct sock *sk)+{+	struct tcp_sock *tp;+	struct sk_buff *skb;+	u32 start_seq, end_seq;++	tp = tcp_sk(sk);+	skb = tp->lost_skb_hint ? 
tp->lost_skb_hint : tcp_write_queue_head(sk);
+
+	/* All SKBs falling completely in the range are marked */
+	start_seq = tp->rx_opt.fec.lost_seq;
+	end_seq = tp->rx_opt.fec.lost_seq + tp->rx_opt.fec.lost_len;
+
+	tcp_for_write_queue_from(skb, sk) {
+		if (skb == tcp_send_head(sk))
+			break;
+
+		/* Past loss range */
+		if (!before(TCP_SKB_CB(skb)->seq, end_seq))
+			break;
+
+		/* SKB not (fully) within range */
+		if (before(TCP_SKB_CB(skb)->seq, start_seq) ||
+		    after(TCP_SKB_CB(skb)->end_seq, end_seq))
+			continue;
+
+		/* SKB already marked */
+		if (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))
+			continue;
+
+		/* Verify retransmit hint before marking
+		 * (see tcp_verify_retransmit_hint(),
+		 * copied since method defined static in tcp_input.c)
+		 */
+		if ((tp->retransmit_skb_hint == NULL) ||
+		    before(TCP_SKB_CB(skb)->seq,
+			   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+			tp->retransmit_skb_hint = skb;
+
+		if (!tp->lost_out ||
+		    after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
+			tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
+
+		/* Mark SKB as lost (see tcp_skb_mark_lost()) */
+		tp->lost_out += tcp_skb_pcount(skb);
+		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+	}
+
+	tcp_verify_left_out(tp);
+}
+
+/* Searches the TCP header of @skb for the FEC option and rewrites the
+ * long (encoded packet) form in place into the short form used by
+ * recovered packets, padded by NOPs.
+ * Returns true if the option was found and rewritten, false otherwise.
+ */
+static bool tcp_fec_update_decoded_option(struct sk_buff *skb)
+{
+	struct tcphdr *th;
+	unsigned char *ptr;
+	int length;
+
+	th = tcp_hdr(skb);
+	ptr = (unsigned char *) (th + 1);
+	length = (th->doff * 4) - sizeof(struct tcphdr);
+
+	while (length > 0) {
+		int opcode = *ptr++;
+		int opsize;
+
+		switch (opcode) {
+		case TCPOPT_EOL:
+			return 0;
+		case TCPOPT_NOP:
+			length--;
+			continue;
+		default:
+			opsize = *ptr++;
+			if (opsize < 2 || opsize > length)
+				return 0;
+
+			if (opcode == TCPOPT_EXP &&
+			    opsize >= TCPOLEN_EXP_FEC_BASE &&
+			    get_unaligned_be16(ptr) == TCPOPT_FEC_MAGIC) {
+				/* Update FEC option (may be unaligned):
+				 * 1. Convert long option into short option
+				 * 2. Clear ENCODED flag (keep other flags)
+				 * 3. Replace option value (long option) by NOPs
+				 */
+				unsigned char *opt = ptr - 2, *val = ptr + 2;
+
+				put_unaligned_be32((get_unaligned_be32(opt) &
+					0xFF00FFFF) | 0x00050000, opt);
+				put_unaligned_be32((get_unaligned_be32(val) &
+					0xEF000000) | 0x00010101, val);
+
+				return 1;
+			}
+
+			ptr += opsize - 2;
+			length -= opsize;
+		}
+	}
+
+	return 0;
+}
+
+/* Allocates an SKB for data we want to forward to reception routines
+ * (recovered data) by making a copy of the FEC SKB and replacing the data
+ * part, all other segments (options, etc.) are preserved
+ */
+static struct sk_buff *tcp_fec_make_decoded_pkt(struct sock *sk,
+				const struct sk_buff *skb,
+				unsigned char *dec_data,
+				u32 seq, unsigned int len)
+{
+	struct tcp_sock *tp;
+	struct sk_buff *nskb;
+
+	tp = tcp_sk(sk);
+	nskb = skb_copy(skb, GFP_ATOMIC);
+	if (nskb == NULL)
+		return NULL;
+
+	/* Update FEC option for the new packet */
+	if (!tcp_fec_update_decoded_option(nskb)) {
+		/* TODO Do we need this catch? 
Technically we don't reach this+		 * method if there is no FEC option in the header.+		 */+		return NULL;+	}++	/* check if we received some tail of the recovered sequence already+	 * by looking at the current SACK blocks (we don't want to recover+	 * more data than necessary to prevent DSACKS)+	 */+	if (tcp_is_sack(tp)) {+		int i;+		for (i = 0; i < tp->rx_opt.num_sacks; i++) {+			if (before(tp->selective_acks[i].start_seq,+				   seq + len) &&+				   !before(tp->selective_acks[i].end_seq,+				   seq + len)) {+				len = tp->selective_acks[i].start_seq - seq;+				break;+			}+		}+	}++	/* trim data section to fit recovered sequence if necessary */+	if (len < (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq))+		skb_trim(nskb, len + tcp_hdrlen(nskb));++	/* fix the sequence numbers */+	tcp_hdr(nskb)->seq = htonl(seq);+	tcp_hdr(nskb)->ack_seq = htonl(tp->snd_una);+	TCP_SKB_CB(nskb)->seq = seq;+	TCP_SKB_CB(nskb)->end_seq = seq + len;++	/* replace SKB payload with recovered data */+	memcpy(nskb->data + tcp_hdrlen(nskb), dec_data, len);++	/* packets used for recovery had their checksums checked already */+	nskb->ip_summed = CHECKSUM_UNNECESSARY;++	return nskb;+}++/* Gets the next byte block from an SKB queue (any SKB which is touched+ * in this procedure will be linearized to simplify payload access)+ * @skb - Points to SKB from which previous block was extracted (useful+ *	  for successive calls to this function, which avoids moving through+ *	  the whole queue again)+ * @queue - SKB queue to read from (SKB has to point to an element on this+ *	  queue)+ * @seq - Sequence number of first byte in the block+ * @block_len+ * @block+ *+ * Returns the bytes written to the block memory+ */+static unsigned int tcp_fec_get_next_block(struct sock *sk,+				struct sk_buff **skb,+				struct sk_buff_head *queue, u32 seq,+				unsigned int block_len, unsigned char *block)+{+	unsigned int cur_len, offset, num_bytes;+	int err;+	u32 end_seq;++	cur_len = 0;++	/* Get first SKB of the write 
queue and specify next sequence to+	 * encode+	 */+	if (*skb == NULL) {+		*skb = skb_peek(queue);+		if (*skb == NULL)+			return 0;+	}++	/* move to SKB which stores the next sequence to encode */+	while (*skb) {+		/* If we observe an RST/SYN, we stop here to avoid+		 * handling corner cases+		 */+		if (TCP_SKB_CB(*skb)->tcp_flags &+					(TCPHDR_RST |+					 TCPHDR_SYN))+			return 0;+		if (!before(seq, TCP_SKB_CB(*skb)->seq) &&+					before(seq, TCP_SKB_CB(*skb)->end_seq))+			break;+		if (*skb == skb_peek_tail(queue)) {+			*skb = NULL;+			break;+		}++		*skb = skb_queue_next(queue, *skb);+	}++	if (*skb == NULL)+		return 0;++	/* copy bytes from SKBs (connected sequences) */+	while (*skb && (cur_len < block_len)) {+		err = skb_linearize(*skb);+		if (err)+			return err;++		/* Deal with the end seq number being incremented by+		 * one if the FIN flag is set (we don't want to encode this)+		 */+		end_seq = TCP_SKB_CB(*skb)->end_seq;+		if (TCP_SKB_CB(*skb)->tcp_flags & TCPHDR_FIN)+			end_seq--;++		if ((seq >= TCP_SKB_CB(*skb)->seq) && (seq < end_seq)) {+			/* Copy data depending on:+			 * - remaining space in the block+			 * - remaining data in the SKB+			 */+			offset = seq - TCP_SKB_CB(*skb)->seq;+			num_bytes = min(block_len - cur_len,+					end_seq - seq);++			memcpy(block + cur_len, (*skb)->data + offset,+			       num_bytes);+			cur_len += num_bytes;+			seq += num_bytes;+		}++		if (*skb == skb_peek_tail(queue) || cur_len >= block_len)+			break;++		*skb = skb_queue_next(queue, *skb);+	}++	return cur_len;+}++/* Arms the timer for a delayed FEC transmission if there is+ * no earlier timeout defined (i.e. 
retransmission timeout)+ */+void tcp_fec_arm_timer(struct sock *sk)+{+	struct inet_connection_sock *icsk;+	struct tcp_sock *tp;+	u32 delta, timeout, rtt;++	icsk = inet_csk(sk);+	tp = tcp_sk(sk);++	/* Only arm a timer if connection is established */+	if (sk->sk_state != TCP_ESTABLISHED)+		return;++	/* Forward next sequence to be encoded if unencoded data was acked */+	if (after(tp->snd_una, tp->fec.next_seq))+		tp->fec.next_seq = tp->snd_una;++	/* Don't arm the timer if there is no unencoded data left */+	if (!before(tp->fec.next_seq, tp->snd_nxt))+		return;++	/* TODO handle other timers which might be armed;+	 * EARLY_RETRANS? LOSS_PROBE?+	 */++	/* Compute timeout (currently 0.25 * RTT) */+	rtt = tp->srtt_us >> 3;+	timeout = rtt >> 2;++	/* Compute delay between transmission of original packet and this call+	 * (difference is subtracted from timeout value)+	 */+	delta = 0;+	if (delta > timeout) {+		tcp_fec_invoke_nodelay(sk);+		return;+	} else if (delta > 0) {+		timeout -= delta;+	}++	/* Do not replace a timeout occurring earlier */+	if (jiffies + timeout >= icsk->icsk_timeout)+		return;++	inet_csk_reset_xmit_timer(sk, ICSK_TIME_FEC, timeout, TCP_RTO_MAX);+}++/* The FEC timer fired. Force an FEC transmission for the+ * last unencoded burst. Rearm the RTO timer (which was switched+ * out when setting the FEC timer). 
Set a new FEC timer if there+ * is pending unencoded data.+ */+void tcp_fec_timer(struct sock *sk)+{+	struct inet_connection_sock *icsk;+	struct tcp_sock *tp;++	icsk = inet_csk(sk);+	tp = tcp_sk(sk);++	tcp_fec_invoke_nodelay(sk);++	icsk->icsk_pending = 0;+	tcp_rearm_rto(sk);++	tcp_fec_arm_timer(sk);+}++/* If FEC packet transmissions are delayed set a timer+ * (if not already set), otherwise invoke the FEC mechanism+ * immediately+ */+int tcp_fec_invoke(struct sock *sk)+{+	struct inet_connection_sock *icsk;+	struct tcp_sock *tp;++	icsk = inet_csk(sk);+	tp = tcp_sk(sk);++#ifndef TCP_FEC_DELAYED_SEND+	return tcp_fec_invoke_nodelay(sk);+#else+	/* Set the timer for sending an FEC packet if no FEC+	 * timer is active yet+	 */+	if (!icsk->icsk_pending || icsk->icsk_pending != ICSK_TIME_FEC)+		tcp_fec_arm_timer(sk);+#endif++	return 0;+}++/* Invokes the FEC mechanism set for the connection;+ * Creates and sends out FEC packets+ */+int tcp_fec_invoke_nodelay(struct sock *sk)+{+	int err;+	struct sk_buff_head *list;+	struct sk_buff *skb;+	struct tcp_fec *fec;++	list = kmalloc(sizeof(struct sk_buff_head), GFP_ATOMIC);+	if (list == NULL)+		return -ENOMEM;++	skb_queue_head_init(list);+	err = tcp_fec_create(sk, list);+	if (err)+		goto clean;++	err = tcp_fec_xmit_all(sk, list);+	if (err)+		goto clean;++clean:+	/* Purge all SKBs (purge FEC structs first) */+	skb = (struct sk_buff *) list;+	while (!skb_queue_is_last(list, skb)) {+		skb = skb_queue_next(list, skb);+		fec = TCP_SKB_CB(skb)->fec;+		if (fec != NULL) {+			kfree(fec);+			TCP_SKB_CB(skb)->fec = NULL;+		}+	}++	skb_queue_purge(list);+	kfree(list);++	/* TODO error handling; -ENOMEM, etc. - disable FEC? 
*/++	return err;+}++/* Creates one or more FEC packets (can depend on the FEC type used)+ * and puts them in a queue+ * @list: queue head+ */+static int tcp_fec_create(struct sock *sk, struct sk_buff_head *list)+{+	struct tcp_sock *tp;+	unsigned int first_seq, block_len;+	int err;++	tp = tcp_sk(sk);++	/* Update the pointer to the first byte to be encoded next+	 * (this only matters when a packet was ACKed before it was+	 * encoded)+	 */+	if (after(tp->snd_una, tp->fec.next_seq))+		tp->fec.next_seq = tp->snd_una;++	first_seq = tp->fec.next_seq;+	block_len = tcp_current_mss(sk);++	switch (tp->fec.type) {+	case TCP_FEC_TYPE_NONE:+		return 0;+	case TCP_FEC_TYPE_XOR_ALL:+		return tcp_fec_create_xor(sk, list, first_seq,+					  block_len, 0,+					  FEC_RCV_QUEUE_LIMIT - block_len);+	case TCP_FEC_TYPE_XOR_SKIP_1:+		err = tcp_fec_create_xor(sk, list, first_seq, block_len, 1,+					  FEC_RCV_QUEUE_LIMIT - block_len);+		if (err)+			return err;++		return tcp_fec_create_xor(sk, list, first_seq + block_len,+					  block_len, 1,+					  FEC_RCV_QUEUE_LIMIT - block_len);+	}++	return 0;+}++/* Creates FEC packet(s) using XOR encoding+ * (allocates memory for the FEC structs)+ * @first_seq - Sequence number of first byte to be encoded+ * @block_len - Block length (typically MSS)+ * @block_skip - Number of unencoded blocks between two encoded blocks+ * @max_encoded_per_pkt - maximum number of blocks encoded per packet+ *	(0, if unlimited)+ */+static int tcp_fec_create_xor(struct sock *sk, struct sk_buff_head *list,+				unsigned int first_seq, unsigned int block_len,+				unsigned int block_skip,+				unsigned int max_encoded_per_pkt)+{+	struct tcp_sock *tp;+	struct sk_buff *skb, *fskb;+	struct tcp_fec *fec;+	unsigned int c_encoded;		/* Number of currently encoded blocks+					   not yet added to an FEC packet */+        unsigned int next_seq;          /* Next byte to encode */+        unsigned int i;+	unsigned char *data, *block;+	u16 data_len;++	tp = tcp_sk(sk);+	skb = NULL;+	c_encoded = 
0;+	next_seq = first_seq;++	/* memory allocation+	 * data - used temporarily to obtain byte blocks and store the payload+		  (is freed before returning; we need two blocks here to store+                   the previously XORed data that has not been added to an FEC+                   packet yet, and the new to-be XORed data extracted from one+                   or more existing buffers)++	 * fec	- used to store the FEC parameters+		  (is freed after the corresponding packet is forwarded to the+		  transmission routine)+	 */+	data = kmalloc(2 * block_len, GFP_ATOMIC);+	if (data == NULL)+		return -ENOMEM;++	fec = kmalloc(sizeof(struct tcp_fec), GFP_ATOMIC);+	if (fec == NULL) {+		kfree(data);+		return -ENOMEM;+	}++	memset(data, 0, 2 * block_len);+	memset(fec, 0, sizeof(struct tcp_fec));++	block = data + block_len;++	/* encode data blocks+	 * XXX atomicity check?+	 */+	fec->enc_seq = next_seq;+	while ((data_len = tcp_fec_get_next_block(sk, &skb,+				&sk->sk_write_queue, next_seq,+				min(block_len, tp->snd_nxt - next_seq),+				block))) {+		/* Check if we reached the encoding limit; then create packet+		 * with current payload and add it to the queue+		 */+		if (max_encoded_per_pkt > 0 &&+					c_encoded >= max_encoded_per_pkt) {+			fskb = tcp_fec_make_encoded_pkt(sk, fec, data,+						block_len);+			if (fskb == NULL) {+				kfree(data);+				kfree(fec);+				return -EINVAL;+			}++			skb_queue_tail(list, fskb);+			memset(data, 0, block_len);+			c_encoded = 0;++			/* memory allocation for the FEC struct of the next+			 * packet+			 */+			fec = kmalloc(sizeof(struct tcp_fec), GFP_ATOMIC);+			if (fec == NULL) {+				kfree(data);+				return -ENOMEM;+			}++			memset(fec, 0, sizeof(struct tcp_fec));+			fec->enc_seq = next_seq;+		}++		next_seq += data_len;+		fec->enc_len = next_seq - fec->enc_seq;++		/* encode block into existing payload (XOR) */+		for (i = 0; i < data_len; i++)+			data[i] ^= block[i];++		c_encoded++;++		/* skip over blocks which are not requested for encoding */+		
next_seq += block_len * block_skip;+	}++	/* create final packet if some data was selected for encoding */+	if (c_encoded > 0) {+		fskb = tcp_fec_make_encoded_pkt(sk, fec, data, block_len);+		if (fskb == NULL) {+			kfree(data);+			kfree(fec);+			return -EINVAL;+		}++		skb_queue_tail(list, fskb);+	} else {+		kfree(fec);+	}++	tp->fec.next_seq = next_seq;+	kfree(data);++	return 0;+}++/* Allocates an SKB for data we want to send and assigns+ * the necessary options and fields+ */+static struct sk_buff *tcp_fec_make_encoded_pkt(struct sock *sk,+				struct tcp_fec *fec,+				unsigned char *enc_data,+				unsigned int len)+{+	struct sk_buff *skb;+	unsigned char *data;++	/* See tcp_make_synack(); 15 probably for tail pointer etc.? */+	len = min(len, fec->enc_len);+	skb = alloc_skb(MAX_TCP_HEADER + 15 + len, GFP_ATOMIC);+	if (skb == NULL)+		return NULL;++	/* Reserve space for headers */+	skb_reserve(skb, MAX_TCP_HEADER);++	/* Specify sequence number and FEC struct address in control buffer */+	fec->flags |= TCP_FEC_ENCODED;+	TCP_SKB_CB(skb)->seq = fec->enc_seq;+	TCP_SKB_CB(skb)->fec = fec;++	/* Enable ACK flag (required for all data packets) */+	TCP_SKB_CB(skb)->tcp_flags = TCPHDR_ACK;++	/* Set GSO parameters */+	skb_shinfo(skb)->gso_segs = 1;+	skb_shinfo(skb)->gso_size = 0;+	skb_shinfo(skb)->gso_type = 0;++	/* Append payload to SKB */+	data = skb_put(skb, len);+	memcpy(data, enc_data, len);++	skb->ip_summed = CHECKSUM_PARTIAL;++	return skb;+}++/* Transmit all FEC packets in a list */+static int tcp_fec_xmit_all(struct sock *sk, struct sk_buff_head *list)+{+	struct sk_buff *skb;+	int err;++	if (list == NULL || skb_queue_empty(list))+		return 0;++	skb = (struct sk_buff *) list;+	while (!skb_queue_is_last(list, skb)) {+		skb = skb_queue_next(list, skb);+		err = tcp_fec_xmit(sk, skb);+		if (err)+			return err;+	}++	return 0;+}++/* Transmits an FEC packet */+static int tcp_fec_xmit(struct sock *sk, struct sk_buff *skb)+{+	/* TODO timers? 
no retransmissions, but want to deactivate FEC+	 * if we never get any FEC ACKs back+	 */+	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);+}diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.cindex f83ddf9..b640461 100644--- a/net/ipv4/tcp_input.c+++ b/net/ipv4/tcp_input.c@@ -70,6 +70,7 @@ #include <linux/kernel.h> #include <net/dst.h> #include <net/tcp.h>+#include <net/tcp_fec.h> #include <net/inet_common.h> #include <linux/ipsec.h> #include <asm/unaligned.h>@@ -106,6 +107,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/ #define FLAG_DATA_SACKED	0x20 /* New SACK.				*/ #define FLAG_ECE		0x40 /* ECE in this ACK				*/+#define FLAG_FEC_CWR_REQUESTED	0x80 /* cwnd reduction requested */ #define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/ #define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */@@ -115,8 +117,9 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;  #define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)-#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE)+#define FLAG_CA_ALERT	(FLAG_DATA_SACKED|FLAG_ECE|FLAG_FEC_CWR_REQUESTED) #define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)+#define FLAG_CONGESTION		(FLAG_ECE|FLAG_FEC_CWR_REQUESTED)  #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))@@ -2546,7 +2549,11 @@ void tcp_enter_cwr(struct sock *sk) 	struct tcp_sock *tp = tcp_sk(sk);  	tp->prior_ssthresh = 0;-	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {+	+	/*+	 *      !!! 
TCP FEC patch !!!+	 */+	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR && after(tp->snd_una, tp->high_seq)) { 		tp->undo_marker = 0; 		tcp_init_cwnd_reduction(sk); 		tcp_set_ca_state(sk, TCP_CA_CWR);@@ -2968,6 +2975,12 @@ void tcp_rearm_rto(struct sock *sk) 	if (tp->fastopen_rsk) 		return; +	/* Don't rearm the timer if an FEC timer is active.+	 * The FEC handler will rearm the timer once the event is handled.+	 */+	if (icsk->icsk_pending == ICSK_TIME_FEC)+		return;+ 	if (!tp->packets_out) { 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 	} else {@@ -3228,16 +3241,23 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) /* Decide wheather to run the increase function of congestion control. */ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) {+	const struct tcp_sock *tp = tcp_sk(sk); 	if (tcp_in_cwnd_reduction(sk)) 		return false; +	/*+	 *      !!! TCP FEC patch !!!+	 */+	if ((flag & FLAG_CONGESTION) && !(tp->snd_cwnd < tp->snd_ssthresh))+		return false;+ 	/* If reordering is high then always grow cwnd whenever data is 	 * delivered regardless of its ordering. Otherwise stay conservative 	 * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/ 	 * new SACK or ECE mark may first advance cwnd here and later reduce 	 * cwnd in tcp_fastretrans_alert() based on more states. 	 
*/-	if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)+	if (tp->reordering > sysctl_tcp_reordering) 		return flag & FLAG_FORWARD_PROGRESS;  	return flag & FLAG_DATA_ACKED;@@ -3425,6 +3445,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) 		icsk->icsk_retransmits = 0; 	} +	/* Check if FEC expects and executes a window reduction */+	if (tcp_fec_is_enabled(tp) && tcp_fec_check_ack(sk, ack))+		flag |= FLAG_FEC_CWR_REQUESTED;+ 	prior_fackets = tp->fackets_out;  	/* ts_recent update must be made after we are sure that the packet@@ -3656,6 +3680,20 @@ void tcp_parse_options(const struct sk_buff *skb, 				break;  			case TCPOPT_EXP:+				/*+				 *      !!! TCP FEC patch !!!+				 */+				if (sysctl_tcp_fec &&+				    get_unaligned_be16(ptr) ==+				    TCPOPT_FEC_MAGIC) {+					tcp_fec_decode_option(&(opt_rx->fec),+						ntohl(th->seq),+						ntohl(th->ack_seq), th->syn,+						ptr + 2,+						opsize - TCPOLEN_EXP_FEC_BASE);+					break;+				}+ 				/* Fast Open option shares code 254 using a 				 * 16 bits magic number. 				 */@@ -4173,6 +4211,12 @@ static void tcp_ofo_queue(struct sock *sk) 			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, 			   TCP_SKB_CB(skb)->end_seq); +		/*+		 *      !!! TCP FEC patch !!!+		 */+		if (tcp_fec_is_enabled(tp))+			tcp_fec_update_queue(sk, skb);+ 		tail = skb_peek_tail(&sk->sk_receive_queue); 		eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;@@ -4410,6 +4454,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) 			goto out_of_window;  		/* Ok. In sequence. In window. 
*/+		if (tcp_fec_is_enabled(tp))+			tcp_fec_update_queue(sk, skb);+ 		if (tp->ucopy.task == current && 		    tp->copied_seq == tp->rcv_nxt && tp->ucopy.len && 		    sock_owned_by_user(sk) && !tp->urg_data) {@@ -4715,6 +4762,12 @@ static int tcp_prune_queue(struct sock *sk) 			     tp->copied_seq, tp->rcv_nxt); 	sk_mem_reclaim(sk); +	/* Disable FEC if it was enabled to prevent keeping data+	 * in the receive queue longer than necessary+	 */+	if (tcp_fec_is_enabled(tp))+		tcp_fec_disable(sk);+ 	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) 		return 0; @@ -4998,6 +5051,21 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, 		/* Reset is accepted even if it did not pass PAWS. */ 	} +	/* Special processing if FEC is enabled */+	if (tcp_fec_is_enabled(tp)) {+		if (tcp_fec_is_encoded(tp)) {+			tcp_fec_process(sk, skb);+			goto discard;+		} else if (!tp->rx_opt.fec.saw_fec && th->ack &&+			   sk->sk_state == TCP_LAST_ACK) {+			/* TODO Sometimes the FEC option is not appended to the+			 * FIN-ACK packet; socket options cleared?+			 */+			tcp_ack(sk, skb, FLAG_SLOWPATH);+			goto discard;+		}+	}+ 	/* Step 1: check sequence number */ 	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { 		/* RFC793, page 37: "In all states except SYN-SENT, all reset@@ -5099,6 +5167,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, 	 */  	tp->rx_opt.saw_tstamp = 0;+	tp->rx_opt.fec.saw_fec = 0;  	/*	pred_flags is 0xS?10 << 16 + snd_wnd 	 *	if header_prediction is to be made@@ -5461,6 +5530,15 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 		if (tcp_is_sack(tp) && sysctl_tcp_fack) 			tcp_enable_fack(tp); +		/*+		 * FEC negotiation+		 * Disable FEC if both ends do not agree on the FEC type used+		 */+		if (tp->fec.type != tp->rx_opt.fec.type) {+			tp->fec.type = 0;+			tp->rx_opt.fec.type = 0;+		}+ 		tcp_mtup_init(sk); 		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 		tcp_initialize_rcv_mss(sk);@@ 
-5735,6 +5813,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,  		tcp_initialize_rcv_mss(sk); 		tcp_fast_path_on(tp);++		/* SYN requested FEC usage */+		if (tp->rx_opt.fec.type > 0)+			tp->fec.type = tp->rx_opt.fec.type;+ 		break;  	case TCP_FIN_WAIT1: {diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.cindex d886b60..5efbc2e 100644--- a/net/ipv4/tcp_ipv4.c+++ b/net/ipv4/tcp_ipv4.c@@ -73,6 +73,9 @@ #include <net/timewait_sock.h> #include <net/xfrm.h> #include <net/secure_seq.h>++#include <net/tcp_fec.h>+ #include <net/tcp_memcontrol.h> #include <net/busy_poll.h> @@ -212,6 +215,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)  	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; +	memset(&(tp->rx_opt.fec), 0, sizeof(struct tcp_fec));+ 	/* Socket identity is still unknown (sport may be zero). 	 * However we set state to SYN-SENT and not releasing socket 	 * lock select source port, enter ourselves into the hash tables and@@ -2270,7 +2275,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)  	if (icsk->icsk_pending == ICSK_TIME_RETRANS || 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||-	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {+	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE ||+	    icsk->icsk_pending == ICSK_TIME_FEC) { 		timer_active	= 1; 		timer_expires	= icsk->icsk_timeout; 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.cindex 927586e..f59faf9 100644--- a/net/ipv4/tcp_minisocks.c+++ b/net/ipv4/tcp_minisocks.c@@ -552,6 +552,15 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, 		newtp->fastopen_rsk = NULL; 		newtp->syn_data_acked = 0; +		newtp->high_seq = newtp->snd_nxt;++		/* TCP FEC option */+		newtp->rx_opt.fec.type = sysctl_tcp_fec ? 
req->fec_type : 0;+		newtp->fec.type = newtp->fec.flags = 0;+		newtp->fec.next_seq = newtp->snd_nxt;+		newtp->fec.bytes_rcv_queue = 0;+		skb_queue_head_init(&newtp->fec.rcv_queue);+ 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); 	} 	return newsk;diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.cindex ddd2a6f..7791899 100644--- a/net/ipv4/tcp_output.c+++ b/net/ipv4/tcp_output.c@@ -37,6 +37,7 @@ #define pr_fmt(fmt) "TCP: " fmt  #include <net/tcp.h>+#include <net/tcp_fec.h>  #include <linux/compiler.h> #include <linux/gfp.h>@@ -65,6 +66,12 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; /* By default, RFC2861 behavior.  */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +/*+ *      !!! TCP FEC patch !!!+ */+int sysctl_tcp_fec __read_mostly;++ unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); @@ -422,6 +429,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_MD5		(1 << 2) #define OPTION_WSCALE		(1 << 3) #define OPTION_FAST_OPEN_COOKIE	(1 << 8)+#define OPTION_FEC		(1 << 9)  struct tcp_out_options { 	u16 options;		/* bit field of OPTION_* */@@ -432,6 +440,7 @@ struct tcp_out_options { 	__u8 *hash_location;	/* temporary pointer, overloaded */ 	__u32 tsval, tsecr;	/* need to include OPTION_TS */ 	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */+	struct tcp_fec fec;	/* FEC parameters */ };  /* Write previously computed TCP options to the packet.@@ -540,6 +549,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, 		} 		ptr += (len + 3) >> 2; 	}++	if (unlikely(OPTION_FEC & options))+		tcp_fec_encode_option(tp, &(opts->fec), &ptr); }  /* Compute TCP options for SYN packets. 
This is not the final@@ -607,6 +619,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, 		} 	} +	/* Prepare for FEC negotation if requested */+	if (unlikely(tcp_fec_is_enabled(tp)) &&+	    remaining >= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED) {+		opts->options |= OPTION_FEC;+		opts->fec.type = tp->fec.type;+		remaining -= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED;+	}+ 	return MAX_TCP_OPTION_SPACE - remaining; } @@ -671,6 +691,16 @@ static unsigned int tcp_synack_options(struct sock *sk, 		} 	} +	/* Handle request for FEC support from other side+	 * (respond with same FEC option if FEC is locally supported)+	 */+	if (sysctl_tcp_fec && unlikely(req->fec_type) &&+	    remaining >= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED) {+		opts->options |= OPTION_FEC;+		opts->fec.type = req->fec_type;+		remaining -= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED;+	}+ 	return MAX_TCP_OPTION_SPACE - remaining; } @@ -681,6 +711,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb 					struct tcp_out_options *opts, 					struct tcp_md5sig_key **md5) {+	struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; 	struct tcp_sock *tp = tcp_sk(sk); 	unsigned int size = 0; 	unsigned int eff_sacks;@@ -715,6 +746,19 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb 			opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; 	} +	/* Prepare option if connection has FEC enabled */+	if (tcp_fec_is_enabled(tp)) {+		opts->options |= OPTION_FEC;+		if (tcb && tcb->fec)+			opts->fec = *(tcb->fec);++		/* regardless of packet type we need 4 more bytes+		 * including alignment+		 */+		size += 4;+		size += TCPOLEN_EXP_FEC_BASE;+	}+ 	return size; } @@ -895,7 +939,7 @@ void tcp_wfree(struct sk_buff *skb)  * We are working here with either a clone of the original  * SKB, or a fresh unique copy made by the retransmit engine.  
*/-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,+int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, 			    gfp_t gfp_mask) { 	const struct inet_connection_sock *icsk = inet_csk(sk);@@ -2055,6 +2099,9 @@ repair: 			break; 	} +	if (tcp_fec_is_enabled(tp))+		tcp_fec_invoke(sk);+ 	if (likely(sent_pkts)) { 		if (tcp_in_cwnd_reduction(sk)) 			tp->prr_out += sent_pkts;@@ -3153,6 +3200,12 @@ int tcp_connect(struct sock *sk) 	 */ 	tp->snd_nxt = tp->write_seq; 	tp->pushed_seq = tp->write_seq;++	/* Initialize FEC members */+	tp->fec.next_seq = tp->snd_nxt;+	tp->fec.bytes_rcv_queue = 0;+	skb_queue_head_init(&tp->fec.rcv_queue);+ 	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);  	/* Timer for repeating the SYN until an answer. */diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.cindex dceaacc..b78ea8f 100644--- a/net/ipv4/tcp_timer.c+++ b/net/ipv4/tcp_timer.c@@ -21,6 +21,7 @@ #include <linux/module.h> #include <linux/gfp.h> #include <net/tcp.h>+#include <net/tcp_fec.h>  int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;@@ -478,7 +479,15 @@ out_reset_timer: 	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) 		__sk_dst_reset(sk); -out:;+out:+	/* FEC will switch out the RTO timer if a delayed FEC transmission+	 * should happen earlier than this. 
RTO timer will be switched in+	 * once the FEC timer fired.+	 * FEC transmissions during a loss episode require that the sysctl+	 * value is >= 2.+	 */+	if (tcp_fec_is_enabled(tp) && sysctl_tcp_fec >= 2)+		tcp_fec_arm_timer(sk); }  void tcp_write_timer_handler(struct sock *sk)@@ -503,6 +512,9 @@ void tcp_write_timer_handler(struct sock *sk) 	case ICSK_TIME_LOSS_PROBE: 		tcp_send_loss_probe(sk); 		break;+	case ICSK_TIME_FEC:+		tcp_fec_timer(sk);+		break; 	case ICSK_TIME_RETRANS: 		icsk->icsk_pending = 0; 		tcp_retransmit_timer(sk);diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.cindex c5078c5..d5205c6 100644--- a/net/ipv6/tcp_ipv6.c+++ b/net/ipv6/tcp_ipv6.c@@ -288,6 +288,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,  	tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); +	memset(&(tp->rx_opt.fec), 0, sizeof(struct tcp_fec));+ 	inet->inet_dport = usin->sin6_port;  	tcp_set_state(sk, TCP_SYN_SENT);