Commit dfb4b9dc authored by David S. Miller's avatar David S. Miller

[TCP] Vegas: timestamp before clone

We have to store the congestion control timestamp on the SKB before we
clone it, not after.  Else we get no timestamping information at all.

tcp_transmit_skb() has been reworked so that we can do the timestamp
still in one spot, instead of at all the call sites.

Problem discovered, and initial fix, from Tom Young
<tyo@ee.unimelb.edu.au>.
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 0d7bef60
...@@ -262,122 +262,139 @@ static __inline__ u16 tcp_select_window(struct sock *sk) ...@@ -262,122 +262,139 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
* We are working here with either a clone of the original * We are working here with either a clone of the original
* SKB, or a fresh unique copy made by the retransmit engine. * SKB, or a fresh unique copy made by the retransmit engine.
*/ */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask)
{ {
if (skb != NULL) { const struct inet_connection_sock *icsk = inet_csk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet;
struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp;
struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb;
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); int tcp_header_size;
int tcp_header_size = tp->tcp_header_len; struct tcphdr *th;
struct tcphdr *th; int sysctl_flags;
int sysctl_flags; int err;
int err;
BUG_ON(!skb || !tcp_skb_pcount(skb));
/* If congestion control is doing timestamping, we must
* take such a timestamp before we potentially clone/copy.
*/
if (icsk->icsk_ca_ops->rtt_sample)
__net_timestamp(skb);
if (likely(clone_it)) {
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
else
skb = skb_clone(skb, gfp_mask);
if (unlikely(!skb))
return -ENOBUFS;
}
BUG_ON(!tcp_skb_pcount(skb)); inet = inet_sk(sk);
tp = tcp_sk(sk);
tcb = TCP_SKB_CB(skb);
tcp_header_size = tp->tcp_header_len;
#define SYSCTL_FLAG_TSTAMPS 0x1 #define SYSCTL_FLAG_TSTAMPS 0x1
#define SYSCTL_FLAG_WSCALE 0x2 #define SYSCTL_FLAG_WSCALE 0x2
#define SYSCTL_FLAG_SACK 0x4 #define SYSCTL_FLAG_SACK 0x4
/* If congestion control is doing timestamping */ sysctl_flags = 0;
if (icsk->icsk_ca_ops->rtt_sample) if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
__net_timestamp(skb); tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
if(sysctl_tcp_timestamps) {
sysctl_flags = 0; tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
if (tcb->flags & TCPCB_FLAG_SYN) { sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
if(sysctl_tcp_timestamps) {
tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
}
if(sysctl_tcp_window_scaling) {
tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
sysctl_flags |= SYSCTL_FLAG_WSCALE;
}
if(sysctl_tcp_sack) {
sysctl_flags |= SYSCTL_FLAG_SACK;
if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
}
} else if (tp->rx_opt.eff_sacks) {
/* A SACK is 2 pad bytes, a 2 byte header, plus
* 2 32-bit sequence numbers for each SACK block.
*/
tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
(tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
} }
if (sysctl_tcp_window_scaling) {
if (tcp_packets_in_flight(tp) == 0) tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
tcp_ca_event(sk, CA_EVENT_TX_START); sysctl_flags |= SYSCTL_FLAG_WSCALE;
th = (struct tcphdr *) skb_push(skb, tcp_header_size);
skb->h.th = th;
skb_set_owner_w(skb, sk);
/* Build TCP header and checksum it. */
th->source = inet->sport;
th->dest = inet->dport;
th->seq = htonl(tcb->seq);
th->ack_seq = htonl(tp->rcv_nxt);
*(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
if (tcb->flags & TCPCB_FLAG_SYN) {
/* RFC1323: The window in SYN & SYN/ACK segments
* is never scaled.
*/
th->window = htons(tp->rcv_wnd);
} else {
th->window = htons(tcp_select_window(sk));
} }
th->check = 0; if (sysctl_tcp_sack) {
th->urg_ptr = 0; sysctl_flags |= SYSCTL_FLAG_SACK;
if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
if (tp->urg_mode && tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
th->urg_ptr = htons(tp->snd_up-tcb->seq);
th->urg = 1;
} }
} else if (unlikely(tp->rx_opt.eff_sacks)) {
/* A SACK is 2 pad bytes, a 2 byte header, plus
* 2 32-bit sequence numbers for each SACK block.
*/
tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
(tp->rx_opt.eff_sacks *
TCPOLEN_SACK_PERBLOCK));
}
if (tcp_packets_in_flight(tp) == 0)
tcp_ca_event(sk, CA_EVENT_TX_START);
th = (struct tcphdr *) skb_push(skb, tcp_header_size);
skb->h.th = th;
skb_set_owner_w(skb, sk);
/* Build TCP header and checksum it. */
th->source = inet->sport;
th->dest = inet->dport;
th->seq = htonl(tcb->seq);
th->ack_seq = htonl(tp->rcv_nxt);
*(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
tcb->flags);
if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
/* RFC1323: The window in SYN & SYN/ACK segments
* is never scaled.
*/
th->window = htons(tp->rcv_wnd);
} else {
th->window = htons(tcp_select_window(sk));
}
th->check = 0;
th->urg_ptr = 0;
if (tcb->flags & TCPCB_FLAG_SYN) { if (unlikely(tp->urg_mode &&
tcp_syn_build_options((__u32 *)(th + 1), between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) {
tcp_advertise_mss(sk), th->urg_ptr = htons(tp->snd_up-tcb->seq);
(sysctl_flags & SYSCTL_FLAG_TSTAMPS), th->urg = 1;
(sysctl_flags & SYSCTL_FLAG_SACK), }
(sysctl_flags & SYSCTL_FLAG_WSCALE),
tp->rx_opt.rcv_wscale,
tcb->when,
tp->rx_opt.ts_recent);
} else {
tcp_build_and_update_options((__u32 *)(th + 1),
tp, tcb->when);
TCP_ECN_send(sk, tp, skb, tcp_header_size); if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
} tcp_syn_build_options((__u32 *)(th + 1),
tp->af_specific->send_check(sk, th, skb->len, skb); tcp_advertise_mss(sk),
(sysctl_flags & SYSCTL_FLAG_TSTAMPS),
(sysctl_flags & SYSCTL_FLAG_SACK),
(sysctl_flags & SYSCTL_FLAG_WSCALE),
tp->rx_opt.rcv_wscale,
tcb->when,
tp->rx_opt.ts_recent);
} else {
tcp_build_and_update_options((__u32 *)(th + 1),
tp, tcb->when);
TCP_ECN_send(sk, tp, skb, tcp_header_size);
}
if (tcb->flags & TCPCB_FLAG_ACK) tp->af_specific->send_check(sk, th, skb->len, skb);
tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
if (skb->len != tcp_header_size) if (likely(tcb->flags & TCPCB_FLAG_ACK))
tcp_event_data_sent(tp, skb, sk); tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
TCP_INC_STATS(TCP_MIB_OUTSEGS); if (skb->len != tcp_header_size)
tcp_event_data_sent(tp, skb, sk);
err = tp->af_specific->queue_xmit(skb, 0); TCP_INC_STATS(TCP_MIB_OUTSEGS);
if (err <= 0)
return err;
tcp_enter_cwr(sk); err = tp->af_specific->queue_xmit(skb, 0);
if (unlikely(err <= 0))
return err;
tcp_enter_cwr(sk);
/* NET_XMIT_CN is special. It does not guarantee,
* that this packet is lost. It tells that device
* is about to start to drop packets or already
* drops some packets of the same priority and
* invokes us to send less aggressively.
*/
return err == NET_XMIT_CN ? 0 : err;
/* NET_XMIT_CN is special. It does not guarantee,
* that this packet is lost. It tells that device
* is about to start to drop packets or already
* drops some packets of the same priority and
* invokes us to send less aggressively.
*/
return err == NET_XMIT_CN ? 0 : err;
}
return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS #undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE #undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK #undef SYSCTL_FLAG_SACK
...@@ -1036,7 +1053,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) ...@@ -1036,7 +1053,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
TCP_SKB_CB(skb)->when = tcp_time_stamp; TCP_SKB_CB(skb)->when = tcp_time_stamp;
if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
break; break;
/* Advance the send_head. This one is sent out. /* Advance the send_head. This one is sent out.
...@@ -1109,7 +1126,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) ...@@ -1109,7 +1126,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
/* Send it out now. */ /* Send it out now. */
TCP_SKB_CB(skb)->when = tcp_time_stamp; TCP_SKB_CB(skb)->when = tcp_time_stamp;
if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) { if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
update_send_head(sk, tp, skb); update_send_head(sk, tp, skb);
tcp_cwnd_validate(sk, tp); tcp_cwnd_validate(sk, tp);
return; return;
...@@ -1429,9 +1446,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) ...@@ -1429,9 +1446,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
*/ */
TCP_SKB_CB(skb)->when = tcp_time_stamp; TCP_SKB_CB(skb)->when = tcp_time_stamp;
err = tcp_transmit_skb(sk, (skb_cloned(skb) ? err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
pskb_copy(skb, GFP_ATOMIC):
skb_clone(skb, GFP_ATOMIC)));
if (err == 0) { if (err == 0) {
/* Update global TCP statistics. */ /* Update global TCP statistics. */
...@@ -1665,7 +1680,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) ...@@ -1665,7 +1680,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(skb)->when = tcp_time_stamp; TCP_SKB_CB(skb)->when = tcp_time_stamp;
if (tcp_transmit_skb(sk, skb)) if (tcp_transmit_skb(sk, skb, 0, priority))
NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
} }
...@@ -1700,7 +1715,7 @@ int tcp_send_synack(struct sock *sk) ...@@ -1700,7 +1715,7 @@ int tcp_send_synack(struct sock *sk)
TCP_ECN_send_synack(tcp_sk(sk), skb); TCP_ECN_send_synack(tcp_sk(sk), skb);
} }
TCP_SKB_CB(skb)->when = tcp_time_stamp; TCP_SKB_CB(skb)->when = tcp_time_stamp;
return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
} }
/* /*
...@@ -1861,7 +1876,7 @@ int tcp_connect(struct sock *sk) ...@@ -1861,7 +1876,7 @@ int tcp_connect(struct sock *sk)
__skb_queue_tail(&sk->sk_write_queue, buff); __skb_queue_tail(&sk->sk_write_queue, buff);
sk_charge_skb(sk, buff); sk_charge_skb(sk, buff);
tp->packets_out += tcp_skb_pcount(buff); tp->packets_out += tcp_skb_pcount(buff);
tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
/* Timer for repeating the SYN until an answer. */ /* Timer for repeating the SYN until an answer. */
...@@ -1957,7 +1972,7 @@ void tcp_send_ack(struct sock *sk) ...@@ -1957,7 +1972,7 @@ void tcp_send_ack(struct sock *sk)
/* Send it off, this clears delayed acks for us. */ /* Send it off, this clears delayed acks for us. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
TCP_SKB_CB(buff)->when = tcp_time_stamp; TCP_SKB_CB(buff)->when = tcp_time_stamp;
tcp_transmit_skb(sk, buff); tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
} }
} }
...@@ -1997,7 +2012,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) ...@@ -1997,7 +2012,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1; TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(skb)->when = tcp_time_stamp; TCP_SKB_CB(skb)->when = tcp_time_stamp;
return tcp_transmit_skb(sk, skb); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
} }
int tcp_write_wakeup(struct sock *sk) int tcp_write_wakeup(struct sock *sk)
...@@ -2030,7 +2045,7 @@ int tcp_write_wakeup(struct sock *sk) ...@@ -2030,7 +2045,7 @@ int tcp_write_wakeup(struct sock *sk)
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
TCP_SKB_CB(skb)->when = tcp_time_stamp; TCP_SKB_CB(skb)->when = tcp_time_stamp;
err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
if (!err) { if (!err) {
update_send_head(sk, tp, skb); update_send_head(sk, tp, skb);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment