Commit c1b4a7e6 authored by David S. Miller's avatar David S. Miller

[TCP]: Move to new TSO segmenting scheme.

Make TSO segment transmit size decisions at send time not earlier.

The basic scheme is that we try to build as large a TSO frame as
possible when pulling in the user data, but the size of the TSO frame
output to the card is determined at transmit time.

This is guided by tp->xmit_size_goal.  It is always set to a multiple
of MSS and tells sendmsg/sendpage how large an SKB to try and build.

Later, tcp_write_xmit() and tcp_push_one() chop up the packet if
necessary and conditions warrant.  These routines can also decide to
"defer" in order to wait for more ACKs to arrive and thus allow larger
TSO frames to be emitted.

A general observation is that TSO elongates the pipe, thus requiring a
larger congestion window and larger buffering especially at the sender
side.  Therefore, it is important that applications 1) get a large
enough socket send buffer (this is accomplished by our dynamic send
buffer expansion code) 2) do large enough writes.
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 0d9901df
......@@ -286,7 +286,7 @@ struct tcp_sock {
__u32 max_window; /* Maximal window ever seen from peer */
__u32 pmtu_cookie; /* Last pmtu seen by socket */
__u32 mss_cache; /* Cached effective mss, not including SACKS */
__u16 mss_cache_std; /* Like mss_cache, but without TSO */
__u16 xmit_size_goal; /* Goal for segmenting output packets */
__u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */
__u8 ca_state; /* State of fast-retransmit machine */
__u8 retransmits; /* Number of unrecovered RTO timeouts. */
......
......@@ -862,7 +862,7 @@ extern int tcp_write_wakeup(struct sock *);
extern void tcp_send_fin(struct sock *sk);
extern void tcp_send_active_reset(struct sock *sk, int priority);
extern int tcp_send_synack(struct sock *);
extern void tcp_push_one(struct sock *, unsigned mss_now);
extern void tcp_push_one(struct sock *, unsigned int mss_now);
extern void tcp_send_ack(struct sock *sk);
extern void tcp_send_delayed_ack(struct sock *sk);
......@@ -968,7 +968,7 @@ static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long
static inline void tcp_initialize_rcv_mss(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
unsigned int hint = min(tp->advmss, tp->mss_cache_std);
unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
hint = min(hint, tp->rcv_wnd/2);
hint = min(hint, TCP_MIN_RCVMSS);
......
......@@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
size_t psize, int flags)
{
struct tcp_sock *tp = tcp_sk(sk);
int mss_now;
int mss_now, size_goal;
int err;
ssize_t copied;
long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
......@@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
size_goal = tp->xmit_size_goal;
copied = 0;
err = -EPIPE;
......@@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
int offset = poffset % PAGE_SIZE;
int size = min_t(size_t, psize, PAGE_SIZE - offset);
if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
......@@ -652,7 +653,7 @@ new_segment:
goto wait_for_memory;
skb_entail(sk, tp, skb);
copy = mss_now;
copy = size_goal;
}
if (copy > size)
......@@ -693,7 +694,7 @@ new_segment:
if (!(psize -= copy))
goto out;
if (skb->len != mss_now || (flags & MSG_OOB))
if (skb->len < mss_now || (flags & MSG_OOB))
continue;
if (forced_push(tp)) {
......@@ -713,6 +714,7 @@ wait_for_memory:
goto do_error;
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
size_goal = tp->xmit_size_goal;
}
out:
......@@ -754,7 +756,7 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
static inline int select_size(struct sock *sk, struct tcp_sock *tp)
{
int tmp = tp->mss_cache_std;
int tmp = tp->mss_cache;
if (sk->sk_route_caps & NETIF_F_SG) {
if (sk->sk_route_caps & NETIF_F_TSO)
......@@ -778,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int iovlen, flags;
int mss_now;
int mss_now, size_goal;
int err, copied;
long timeo;
......@@ -797,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
size_goal = tp->xmit_size_goal;
/* Ok commence sending. */
iovlen = msg->msg_iovlen;
......@@ -819,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
skb = sk->sk_write_queue.prev;
if (!sk->sk_send_head ||
(copy = mss_now - skb->len) <= 0) {
(copy = size_goal - skb->len) <= 0) {
new_segment:
/* Allocate new segment. If the interface is SG,
......@@ -842,7 +845,7 @@ new_segment:
skb->ip_summed = CHECKSUM_HW;
skb_entail(sk, tp, skb);
copy = mss_now;
copy = size_goal;
}
/* Try to append data to the end of skb. */
......@@ -937,7 +940,7 @@ new_segment:
if ((seglen -= copy) == 0 && iovlen == 0)
goto out;
if (skb->len != mss_now || (flags & MSG_OOB))
if (skb->len < mss_now || (flags & MSG_OOB))
continue;
if (forced_push(tp)) {
......@@ -957,6 +960,7 @@ wait_for_memory:
goto do_error;
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
size_goal = tp->xmit_size_goal;
}
}
......@@ -2128,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_rto = jiffies_to_usecs(tp->rto);
info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
info->tcpi_snd_mss = tp->mss_cache_std;
info->tcpi_snd_mss = tp->mss_cache;
info->tcpi_rcv_mss = tp->ack.rcv_mss;
info->tcpi_unacked = tp->packets_out;
......@@ -2178,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
switch (optname) {
case TCP_MAXSEG:
val = tp->mss_cache_std;
val = tp->mss_cache;
if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
val = tp->rx_opt.user_mss;
break;
......
......@@ -740,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
if (!cwnd) {
if (tp->mss_cache_std > 1460)
if (tp->mss_cache > 1460)
cwnd = 2;
else
cwnd = (tp->mss_cache_std > 1095) ? 3 : 4;
cwnd = (tp->mss_cache > 1095) ? 3 : 4;
}
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
......@@ -914,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
if (sk->sk_route_caps & NETIF_F_TSO) {
sk->sk_route_caps &= ~NETIF_F_TSO;
sock_set_flag(sk, SOCK_NO_LARGESEND);
tp->mss_cache = tp->mss_cache_std;
tp->mss_cache = tp->mss_cache;
}
if (!tp->sacked_out)
......@@ -1077,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
(IsFack(tp) ||
!before(lost_retrans,
TCP_SKB_CB(skb)->ack_seq + tp->reordering *
tp->mss_cache_std))) {
tp->mss_cache))) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
......@@ -3334,7 +3334,7 @@ static void tcp_new_space(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_should_expand_sndbuf(sk, tp)) {
int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
demanded = max_t(unsigned int, tp->snd_cwnd,
tp->reordering + 1);
......
......@@ -2045,7 +2045,7 @@ static int tcp_v4_init_sock(struct sock *sk)
*/
tp->snd_ssthresh = 0x7fffffff; /* Infinity */
tp->snd_cwnd_clamp = ~0;
tp->mss_cache_std = tp->mss_cache = 536;
tp->mss_cache = 536;
tp->reordering = sysctl_tcp_reordering;
tp->ca_ops = &tcp_init_congestion_ops;
......
This diff is collapsed.
......@@ -2018,7 +2018,7 @@ static int tcp_v6_init_sock(struct sock *sk)
*/
tp->snd_ssthresh = 0x7fffffff;
tp->snd_cwnd_clamp = ~0;
tp->mss_cache_std = tp->mss_cache = 536;
tp->mss_cache = 536;
tp->reordering = sysctl_tcp_reordering;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment