tcp拥塞算法分析四（bbr）

本文分析linux-4.14.69代码的bbr 拥塞算法

bbr算是一个完全独立的拥塞算法，具有自己的拥塞状态机．tcp_cong_control函数已经被bbr_main函数接管了．()

/* The "ultimate" congestion control function that aims to replace the rigid
 * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction).
 * It's called toward the end of processing an ACK with precise rate
 * information. All transmission or retransmission are delayed afterwards.
 */
static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
			     int flag, const struct rate_sample *rs)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
    // bbr有实现cong_control
	if (icsk->icsk_ca_ops->cong_control) {
		icsk->icsk_ca_ops->cong_control(sk, rs);
		return;
	}

	if (tcp_in_cwnd_reduction(sk)) {
		/* Reduce cwnd if state mandates */
		tcp_cwnd_reduction(sk, acked_sacked, flag);
	} else if (tcp_may_raise_cwnd(sk, flag)) {
		/* Advance cwnd if state allows */
		tcp_cong_avoid(sk, ack, acked_sacked);
	}
	tcp_update_pacing_rate(sk);
}


static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
	.flags		= TCP_CONG_NON_RESTRICTED,
	.name		= "bbr",
	.owner		= THIS_MODULE,
	.init		= bbr_init,
	.cong_control	= bbr_main,
	.sndbuf_expand	= bbr_sndbuf_expand,
	.undo_cwnd	= bbr_undo_cwnd,
	.cwnd_event	= bbr_cwnd_event,
	.ssthresh	= bbr_ssthresh,
	.tso_segs_goal	= bbr_tso_segs_goal,
	.get_info	= bbr_get_info,
	.set_state	= bbr_set_state,
};

bbr_main函数核心是bbr_update_model


static void bbr_main(struct sock *sk, const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u32 bw;

	bbr_update_model(sk, rs);

	bw = bbr_bw(sk);
	bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
	bbr_set_tso_segs_goal(sk);
	bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
}

static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
{
	bbr_update_bw(sk, rs);
	bbr_update_cycle_phase(sk, rs);
	bbr_check_full_bw_reached(sk, rs);
	bbr_check_drain(sk, rs);
	bbr_update_min_rtt(sk, rs);
}

bbr_update_bw根据采样计算带宽（rs->delivered/rs->interval_us）．如果到了新的一轮（rs->prior_delivered>=bbr->next_rtt_delivered）,更新下一个周期初始delivered，轮数+1．然后是long-term检测（用于处理流量监管丢包，如果两次丢包严重就会进入long-term阶段），最后就是更新带宽采样（bbr->bw是minmax类型，一个三元数组，会不断更新用于计算最大带宽）．

rate_sample用于跟踪一个周期（interval）内的delivered，用于计算带宽．其中prior_delivered是在一个采样周期开始时tp->deliverd的大小， tcp_sock中的delivered的值则是一个累加值，通过这个值的大小来判断是否到了新的周期．

/* A rate sample measures the number of (original/retransmitted) data
 * packets delivered "delivered" over an interval of time "interval_us".
 * The tcp_rate.c code fills in the rate sample, and congestion
 * control modules that define a cong_control function to run at the end
 * of ACK processing can optionally chose to consult this sample when
 * setting cwnd and pacing rate.
 * A sample is invalid if "delivered" or "interval_us" is negative.
 */
struct rate_sample {
	u64  prior_mstamp; /* starting timestamp for interval */
	u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
	s32  delivered;		/* number of packets delivered over interval */
	long interval_us;	/* time for tp->delivered to incr "delivered" */
	long rtt_us;		/* RTT of last (S)ACKed packet (or -1) */
	int  losses;		/* number of packets marked lost upon ACK */
	u32  acked_sacked;	/* number of packets newly (S)ACKed upon ACK */
	u32  prior_in_flight;	/* in flight before this ACK */
	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
	bool is_retrans;	/* is sample from retransmission? */
};

/* Estimate the bandwidth based on how fast packets are delivered */
static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u64 bw;

	bbr->round_start = 0;
	if (rs->delivered < 0 || rs->interval_us <= 0)
		return; /* Not a valid observation */

	/* See if we've reached the next RTT */
    /* rs->prior_delivered为周期开始处的tp->delivered, bbr->next_rtt_delivered为下一个周期的tp->delivered？？
    如果rs->priorr_delivered>=bbr->next_rtt_deliverd，即到了下一个周期，
　　更新新的下一个周期的开始处的传输数据next_rtt_delivered和轮数rtt_cnt*/
	if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
		bbr->next_rtt_delivered = tp->delivered;
		bbr->rtt_cnt++;
		bbr->round_start = 1;
		bbr->packet_conservation = 0;
	}

	bbr_lt_bw_sampling(sk, rs);

	/* Divide delivered by the interval to find a (lower bound) bottleneck
	 * bandwidth sample. Delivered is in packets and interval_us in uS and
	 * ratio will be <<1 for most connections. So delivered is first scaled.
	 */
	bw = (u64)rs->delivered * BW_UNIT;
	do_div(bw, rs->interval_us);

	/* If this sample is application-limited, it is likely to have a very
	 * low delivered count that represents application behavior rather than
	 * the available network rate. Such a sample could drag down estimated
	 * bw, causing needless slow-down. Thus, to continue to send at the
	 * last measured network rate, we filter out app-limited samples unless
	 * they describe the path bw at least as well as our bw model.
	 *
	 * So the goal during app-limited phase is to proceed with the best
	 * network rate no matter how long. We automatically leave this
	 * phase when app writes faster than the network can deliver :)
	 */
	if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) {
		/* Incorporate new sample into our max bw filter. */
		minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw);
	}
}

long-term处理占了bbr代码很大一部分．用来处理traffic policers主动丢包（如路由器交换机会在某些场景下，随即丢包）．如注释中说到的，当看到两次连续采样间隔，吞吐量不变并有大量丢包，就认为有traffic policers主动丢包，进入long-term状态，避免进一步大量丢包


/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of
 * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
 * explicitly models their policed rate, to reduce unnecessary losses. We
 * estimate that we're policed if we see 2 consecutive sampling intervals with
 * consistent throughput and high packet loss. If we think we're being policed,
 * set lt_bw to the "long-term" average delivery rate from those 2 intervals.
 */
static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u32 lost, delivered;
	u64 bw;
	u32 t;
/* 
如果已经进入longterm状态
　　如果处于bbr带宽探测阶段，且进入新周期，longterm时间已经超过了48个周期，就重置lt采样，切换到bbr带宽探测阶段
退出longterm状态
*/
	if (bbr->lt_use_bw) {	/* already using long-term rate, lt_bw? */
		if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
		    ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
			bbr_reset_lt_bw_sampling(sk);    /* stop using lt_bw */
			bbr_reset_probe_bw_mode(sk);  /* restart gain cycling */
		}
		return;
	}
/*
     如果lt_is_sampling==fasle,即没有采样
     　　如果rs->losses==0,即没有丢包，则退出long-term检测
　　　　　开始一个新的long-term采样．更新采样标签
*/
	/* Wait for the first loss before sampling, to let the policer exhaust
	 * its tokens and estimate the steady-state rate allowed by the policer.
	 * Starting samples earlier includes bursts that over-estimate the bw.
	 */
	if (!bbr->lt_is_sampling) {
		if (!rs->losses)
			return;
		bbr_reset_lt_bw_sampling_interval(sk);
		bbr->lt_is_sampling = true;
	}
/* 如果是应用层数据限制，就重新采样 */
	/* To avoid underestimates, reset sampling if we run out of data. */
	if (rs->is_app_limited) {
		bbr_reset_lt_bw_sampling(sk);
		return;
	}
/* 采样周期数需要在(4,16)之间 */
	if (bbr->round_start)
		bbr->lt_rtt_cnt++;	/* count round trips in this interval */
	if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
		return;		/* sampling interval needs to be longer */
	if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
		bbr_reset_lt_bw_sampling(sk);  /* interval is too long */
		return;
	}
    /* 如果rs->losses==0,即没有丢包，则退出long-term检测 */
	/* End sampling interval when a packet is lost, so we estimate the
	 * policer tokens were exhausted. Stopping the sampling before the
	 * tokens are exhausted under-estimates the policed rate.
	 */
	if (!rs->losses)
		return;
 
	/* Calculate packets lost and delivered in sampling interval. */
	lost = tp->lost - bbr->lt_last_lost;
	delivered = tp->delivered - bbr->lt_last_delivered;
	/* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
	if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
		return;

	/* Find average delivery rate in this sampling interval. */
	t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp;
	if ((s32)t < 1)
		return;		/* interval is less than one ms, so wait */
	/* Check if can multiply without overflow */
	if (t >= ~0U / USEC_PER_MSEC) {
		bbr_reset_lt_bw_sampling(sk);  /* interval too long; reset */
		return;
	}
	t *= USEC_PER_MSEC;
	bw = (u64)delivered * BW_UNIT;
	do_div(bw, t);
	bbr_lt_bw_interval_done(sk, bw);
}

带宽探测阶段利用增益系数数组探测带宽．

/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
static void bbr_update_cycle_phase(struct sock *sk,
				   const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);

	if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs))
		bbr_advance_cycle_phase(sk);
}

/*
    利用增益系数数组pacing_gain[5/4, 3/4, 1, 1, 1, 1, 1, 1]探测带宽
    如果是稳定阶段,pacing_gain=1,时长超过min_rtt_us就进入下一轮
    如果是激进阶段,pacing_gain>1,必须时间够了，且有丢包或inflight>目标窗口才进入下一轮
    如果是排空阶段,pacing_gain<1,时间够了，或者inflight<=目标窗口就进入下一轮
*/
/* End cycle phase if it's time and/or we hit the phase's in-flight target. */
static bool bbr_is_next_cycle_phase(struct sock *sk,
                    const struct rate_sample *rs)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct bbr *bbr = inet_csk_ca(sk);
    /*  
        如果delivered_mstamp-cycle_mstamp>min_rtt_us，
        即这一轮带宽探测时长够了.
        标记is_full_length=true，否则is_full_length=false 
    */
    bool is_full_length =
        tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) >
        bbr->min_rtt_us;
    u32 inflight, bw; 

    /* The pacing_gain of 1.0 paces at the estimated bw to try to fully
     * use the pipe without increasing the queue.
     */
    if (bbr->pacing_gain == BBR_UNIT)
        return is_full_length;      /* just use wall clock time */

    inflight = rs->prior_in_flight;  /* what was in-flight before ACK? */
    bw = bbr_max_bw(sk);

    /*  
        如果处于探测更大带宽的周期(pacing_gain=5/4)
            如果没有丢包，
                尝试一直处在该周期直到窗口增加到目标窗口(pacing_gain*BDP)
    */
    /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
     * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
     * small (e.g. on a LAN). We do not persist if packets are lost, since
     * a path with small buffers may not hold that much.
     */
    if (bbr->pacing_gain > BBR_UNIT)
        return is_full_length &&
            (rs->losses ||  /* perhaps pacing_gain*BDP won't fit */
             inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain));

    /*  
        如果处于排空队列周期(pacing_gain=3/4),探测时长够了或者
        inflight小于目标窗口可以提前退出这个周期
    */  
    /* A pacing_gain < 1.0 tries to drain extra queue we added if bw
     * probing didn't find more bw. If inflight falls to match BDP then we
     * estimate queue is drained; persisting would underutilize the pipe.
     */
    return is_full_length ||
        inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT);
}

慢启动阶段利用带宽变化判断pipe是否满了，带宽是否到了最大值

/*
	检测慢启动阶段带宽是否已经到了最大值．
	如果连续三次检查到带宽增长速度小于bbr_full_bw_thresh(25%),
	就认为pipe满了,带宽到了最大值
*/
/* Estimate when the pipe is full, using the change in delivery rate: BBR
 * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
 * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
 * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
 * higher rwin, 3: we get higher delivery rate samples. Or transient
 * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
 * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
 */
static void bbr_check_full_bw_reached(struct sock *sk,
				      const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u32 bw_thresh;
	/*
		如果已经标记了带宽满了，或者round_start=0,即还未开始测，或者应用层限制住了，直接退出
	*/
	if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
		return;
	/*
		如果bbr现在的最大带宽比最近带宽多了设定的阈值bbr_full_bw_thresh，
		更新最近带宽bbr->full_bw，并把full_bw_cnt清0
		否则full_bw_cnt+1，full_bw_cnt超过3就表示带宽满了
	*/
	bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
	if (bbr_max_bw(sk) >= bw_thresh) {
		bbr->full_bw = bbr_max_bw(sk);
		bbr->full_bw_cnt = 0;
		return;
	}
	++bbr->full_bw_cnt;
	bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt;
}

如果STARTUP状态带宽增到最大，切换到DRAIN状态；如果DRAIN阶段队列清空，切换到STARTUP状态

/* If pipe is probably full, drain the queue and then enter steady-state. */
static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);
	
	/*
		如果慢启动阶段带宽满了(窗口不变，速度减小)
			更新状态机到排空状态
			更新pacing_gain
			保持cwnd_gain
	*/
	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
		bbr->mode = BBR_DRAIN;	/* drain queue we created */
		bbr->pacing_gain = bbr_drain_gain;	/* pace slow to drain */
		bbr->cwnd_gain = bbr_high_gain;	/* maintain cwnd */
	}	/* fall through to check if in-flight is already small: */
	/*
		如果排空阶段inflight<=target，即队列已经清空，切换到带宽探测状态机
	*/
	if (bbr->mode == BBR_DRAIN &&
	    tcp_packets_in_flight(tcp_sk(sk)) <=
	    bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
		bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
}

检测是否该进入rtt探测状态以及相应参数更新

/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
 * periodically drain the bottleneck queue, to converge to measure the true
 * min_rtt (unloaded propagation delay). This allows the flows to keep queues
 * small (reducing queuing delay and packet loss) and achieve fairness among
 * BBR flows.
 *
 * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
 * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
 * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
 * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
 * re-enter the previous mode. BBR uses 200ms to approximately bound the
 * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
 *
 * Note that flows need only pay 2% if they are busy sending over the last 10
 * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
 * natural silences or low-rate periods within 10 seconds where the rate is low
 * enough for long enough to drain its queue in the bottleneck. We pick up
 * these min RTT measurements opportunistically with our min_rtt filter. :-)
 */
static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	bool filter_expired;
	/*
		最小rtt有效时间为bbr_min_rtt_win_sec*HZ, 即10s, 如果有效时间过了或者新采的rtt更小，
		更新最小rtt大小和最小rtt发生的时间
	*/
	/* Track min RTT seen in the min_rtt_win_sec filter window: */
	filter_expired = after(tcp_jiffies32,
			       bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
	if (rs->rtt_us >= 0 &&
	    (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) {
		bbr->min_rtt_us = rs->rtt_us;
		bbr->min_rtt_stamp = tcp_jiffies32;
	}
	/*
如果开启了rtt探测功能，且最小rtt有效时间过了(也可以理解为rtt探测周期到了)，
且idle_restart==0（不是从空闲状态重启的），且当前不处在rtt探测状态BBR_PROBE_RTT：
	设置状态机为BBR_PROBE_RTT，减小发送速度和发送窗口，保留之前窗口用来恢复，
	rtt探测结束时间标记为无效值0（后面会设置具体有效值）		
	*/
	if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
	    !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
		bbr->mode = BBR_PROBE_RTT;  /* dip, drain queue */
		bbr->pacing_gain = BBR_UNIT;
		bbr->cwnd_gain = BBR_UNIT;
		bbr_save_cwnd(sk);  /* note cwnd so we can restore it */
		bbr->probe_rtt_done_stamp = 0;
	}
	/*
如果处于rtt探测状态，更新限制
	如果probe_rtt_done_stamp=0（结束标记无效），且网络中的包少于bbr_cwnd_min_target
		更新rtt探测结束时间,设置probe_rtt_round_done=0(标记rtt探测还没有开始做过),更新下一个rtt的delivered
	否则如果probe_rtt_done_stamp!=0（结束标记有效）
		如果round_start=1
			标记probe_rtt_round_done=1（rtt探测已经开始做了）
		如果rtt探测已经生做过了，而且探测时长到了
			更新最小rtt计算的时间（用于判断有没有过期，以重新进入rtt探测周期）
			标记恢复窗口
			重置模型
	*/
	if (bbr->mode == BBR_PROBE_RTT) {
		/* Ignore low rate samples during this mode. */
		tp->app_limited =
			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
		/* Maintain min packets in flight for max(200 ms, 1 round). */
		if (!bbr->probe_rtt_done_stamp &&
		    tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
			bbr->probe_rtt_done_stamp = tcp_jiffies32 +
				msecs_to_jiffies(bbr_probe_rtt_mode_ms);
			bbr->probe_rtt_round_done = 0;
			bbr->next_rtt_delivered = tp->delivered;
		} else if (bbr->probe_rtt_done_stamp) {
			if (bbr->round_start)
				bbr->probe_rtt_round_done = 1;
			if (bbr->probe_rtt_round_done &&
			    after(tcp_jiffies32, bbr->probe_rtt_done_stamp)) {
				bbr->min_rtt_stamp = tcp_jiffies32;
				bbr->restore_cwnd = 1;  /* snap to prior_cwnd */
				bbr_reset_mode(sk);
			}
		}
	}
	/* Restart after idle ends only once we process a new S/ACK for data */
	if (rs->delivered > 0)
		bbr->idle_restart = 0;
}

tcp拥塞算法分析四（bbr）

相关文章

使用XDP eXpress Data Path 防御DDoS攻击

***PHP中判断变量为空的几种方法

OS X 10.10安装 Ruby on Rails V4.2.1教程

Python网络质量测试工具增加乱序统计

php函数ob_start()、ob_end_clean()、ob_get_contents()

深夜聊聊Bufferbloat以及TCP BBR

棋盘覆盖（一） ACM

IOS5基础教程之一-----如何创建XCode项目