bbr算是一个完全独立的拥塞算法,具有自己的拥塞状态机.tcp_cong_control函数已经被bbr_main函数接管了.()
/* The "ultimate" congestion control function that aims to replace the rigid
* cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction).
* It's called toward the end of processing an ACK with precise rate
* information. All transmission or retransmission are delayed afterwards.
*/
static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
int flag, const struct rate_sample *rs)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
// bbr有实现cong_control
if (icsk->icsk_ca_ops->cong_control) {
icsk->icsk_ca_ops->cong_control(sk, rs);
return;
}
if (tcp_in_cwnd_reduction(sk)) {
/* Reduce cwnd if state mandates */
tcp_cwnd_reduction(sk, acked_sacked, flag);
} else if (tcp_may_raise_cwnd(sk, flag)) {
/* Advance cwnd if state allows */
tcp_cong_avoid(sk, ack, acked_sacked);
}
tcp_update_pacing_rate(sk);
}
static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
.flags = TCP_CONG_NON_RESTRICTED,
.name = "bbr",
.owner = THIS_MODULE,
.init = bbr_init,
.cong_control = bbr_main,
.sndbuf_expand = bbr_sndbuf_expand,
.undo_cwnd = bbr_undo_cwnd,
.cwnd_event = bbr_cwnd_event,
.ssthresh = bbr_ssthresh,
.tso_segs_goal = bbr_tso_segs_goal,
.get_info = bbr_get_info,
.set_state = bbr_set_state,
};
static void bbr_main(struct sock *sk, const struct rate_sample *rs)
{
struct bbr *bbr = inet_csk_ca(sk);
u32 bw;
bbr_update_model(sk, rs);
bw = bbr_bw(sk);
bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
bbr_set_tso_segs_goal(sk);
bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
}
static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
{
bbr_update_bw(sk, rs);
bbr_update_cycle_phase(sk, rs);
bbr_check_full_bw_reached(sk, rs);
bbr_check_drain(sk, rs);
bbr_update_min_rtt(sk, rs);
}
bbr_update_bw根据采样计算带宽(rs->delivered/rs->interval_us).如果到了新的一轮(rs->prior_delivered>=bbr->next_rtt_delivered),更新下一个周期初始delivered,轮数+1.然后是long-term检测(用于处理流量监管丢包,如果两次丢包严重就会进入long-term阶段),最后就是更新带宽采样(bbr->bw是minmax类型,一个三元数组,会不断更新用于计算最大带宽).
rate_sample用于跟踪一个周期(interval)内的delivered,用于计算带宽.其中prior_delivered是在一个采样周期开始时tp->deliverd的大小, tcp_sock中的delivered的值则是一个累加值,通过这个值的大小来判断是否到了新的周期.
/* A rate sample measures the number of (original/retransmitted) data
* packets delivered "delivered" over an interval of time "interval_us".
* The tcp_rate.c code fills in the rate sample, and congestion
* control modules that define a cong_control function to run at the end
* of ACK processing can optionally chose to consult this sample when
* setting cwnd and pacing rate.
* A sample is invalid if "delivered" or "interval_us" is negative.
*/
struct rate_sample {
u64 prior_mstamp; /* starting timestamp for interval */
u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
s32 delivered; /* number of packets delivered over interval */
long interval_us; /* time for tp->delivered to incr "delivered" */
long rtt_us; /* RTT of last (S)ACKed packet (or -1) */
int losses; /* number of packets marked lost upon ACK */
u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */
u32 prior_in_flight; /* in flight before this ACK */
bool is_app_limited; /* is sample from packet with bubble in pipe? */
bool is_retrans; /* is sample from retransmission? */
};
/* Estimate the bandwidth based on how fast packets are delivered */
static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
u64 bw;
bbr->round_start = 0;
if (rs->delivered < 0 || rs->interval_us <= 0)
return; /* Not a valid observation */
/* See if we've reached the next RTT */
/* rs->prior_delivered为周期开始处的tp->delivered, bbr->next_rtt_delivered为下一个周期的tp->delivered??
如果rs->priorr_delivered>=bbr->next_rtt_deliverd,即到了下一个周期,
更新新的下一个周期的开始处的传输数据next_rtt_delivered和轮数rtt_cnt*/
if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
bbr->next_rtt_delivered = tp->delivered;
bbr->rtt_cnt++;
bbr->round_start = 1;
bbr->packet_conservation = 0;
}
bbr_lt_bw_sampling(sk, rs);
/* Divide delivered by the interval to find a (lower bound) bottleneck
* bandwidth sample. Delivered is in packets and interval_us in uS and
* ratio will be <<1 for most connections. So delivered is first scaled.
*/
bw = (u64)rs->delivered * BW_UNIT;
do_div(bw, rs->interval_us);
/* If this sample is application-limited, it is likely to have a very
* low delivered count that represents application behavior rather than
* the available network rate. Such a sample could drag down estimated
* bw, causing needless slow-down. Thus, to continue to send at the
* last measured network rate, we filter out app-limited samples unless
* they describe the path bw at least as well as our bw model.
*
* So the goal during app-limited phase is to proceed with the best
* network rate no matter how long. We automatically leave this
* phase when app writes faster than the network can deliver :)
*/
if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) {
/* Incorporate new sample into our max bw filter. */
minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw);
}
}
long-term处理占了bbr代码很大一部分. 用来处理traffic policers主动丢包(如路由器交换机会在某些场景下,随即丢包).如注释中说到的,当看到两次连续采样间隔,吞吐量不变并有大量丢包,就认为有traffic policers主动丢包,进入long-term状态,避免进一步大量丢包
/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of
* Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
* explicitly models their policed rate, to reduce unnecessary losses. We
* estimate that we're policed if we see 2 consecutive sampling intervals with
* consistent throughput and high packet loss. If we think we're being policed,
* set lt_bw to the "long-term" average delivery rate from those 2 intervals.
*/
static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
u32 lost, delivered;
u64 bw;
u32 t;
/*
如果已经进入longterm状态
如果处于bbr带宽探测阶段,且进入新周期,longterm时间已经超过了48个周期,就重置lt采样,切换到bbr带宽探测阶段
退出longterm状态
*/
if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */
if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */
bbr_reset_probe_bw_mode(sk); /* restart gain cycling */
}
return;
}
/*
如果lt_is_sampling==fasle,即没有采样
如果rs->losses==0,即没有丢包,则退出long-term检测
开始一个新的long-term采样.更新采样标签
*/
/* Wait for the first loss before sampling, to let the policer exhaust
* its tokens and estimate the steady-state rate allowed by the policer.
* Starting samples earlier includes bursts that over-estimate the bw.
*/
if (!bbr->lt_is_sampling) {
if (!rs->losses)
return;
bbr_reset_lt_bw_sampling_interval(sk);
bbr->lt_is_sampling = true;
}
/* 如果是应用层数据限制,就重新采样 */
/* To avoid underestimates, reset sampling if we run out of data. */
if (rs->is_app_limited) {
bbr_reset_lt_bw_sampling(sk);
return;
}
/* 采样周期数需要在(4,16)之间 */
if (bbr->round_start)
bbr->lt_rtt_cnt++; /* count round trips in this interval */
if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
return; /* sampling interval needs to be longer */
if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
bbr_reset_lt_bw_sampling(sk); /* interval is too long */
return;
}
/* 如果rs->losses==0,即没有丢包,则退出long-term检测 */
/* End sampling interval when a packet is lost, so we estimate the
* policer tokens were exhausted. Stopping the sampling before the
* tokens are exhausted under-estimates the policed rate.
*/
if (!rs->losses)
return;
/* Calculate packets lost and delivered in sampling interval. */
lost = tp->lost - bbr->lt_last_lost;
delivered = tp->delivered - bbr->lt_last_delivered;
/* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
return;
/* Find average delivery rate in this sampling interval. */
t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp;
if ((s32)t < 1)
return; /* interval is less than one ms, so wait */
/* Check if can multiply without overflow */
if (t >= ~0U / USEC_PER_MSEC) {
bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */
return;
}
t *= USEC_PER_MSEC;
bw = (u64)delivered * BW_UNIT;
do_div(bw, t);
bbr_lt_bw_interval_done(sk, bw);
}
带宽探测阶段利用增益系数数组探测带宽.
/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
static void bbr_update_cycle_phase(struct sock *sk,
const struct rate_sample *rs)
{
struct bbr *bbr = inet_csk_ca(sk);
if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs))
bbr_advance_cycle_phase(sk);
}
/*
利用增益系数数组pacing_gain[5/4, 3/4, 1, 1, 1, 1, 1, 1]探测带宽
如果是稳定阶段,pacing_gain=1,时长超过min_rtt_us就进入下一轮
如果是激进阶段,pacing_gain>1,必须时间够了,且有丢包或inflight>目标窗口才进入下一轮
如果是排空阶段,pacing_gain<1,时间够了,或者inflight<=目标窗口就进入下一轮
*/
/* End cycle phase if it's time and/or we hit the phase's in-flight target. */
static bool bbr_is_next_cycle_phase(struct sock *sk,
const struct rate_sample *rs)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
/*
如果delivered_mstamp-cycle_mstamp>min_rtt_us,
即这一轮带宽探测时长够了.
标记is_full_length=true,否则is_full_length=false
*/
bool is_full_length =
tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) >
bbr->min_rtt_us;
u32 inflight, bw;
/* The pacing_gain of 1.0 paces at the estimated bw to try to fully
* use the pipe without increasing the queue.
*/
if (bbr->pacing_gain == BBR_UNIT)
return is_full_length; /* just use wall clock time */
inflight = rs->prior_in_flight; /* what was in-flight before ACK? */
bw = bbr_max_bw(sk);
/*
如果处于探测更大带宽的周期(pacing_gain=5/4)
如果没有丢包,
尝试一直处在该周期直到窗口增加到目标窗口(pacing_gain*BDP)
*/
/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
* least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
* small (e.g. on a LAN). We do not persist if packets are lost, since
* a path with small buffers may not hold that much.
*/
if (bbr->pacing_gain > BBR_UNIT)
return is_full_length &&
(rs->losses || /* perhaps pacing_gain*BDP won't fit */
inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain));
/*
如果处于排空队列周期(pacing_gain=3/4),探测时长够了或者
inflight小于目标窗口可以提前退出这个周期
*/
/* A pacing_gain < 1.0 tries to drain extra queue we added if bw
* probing didn't find more bw. If inflight falls to match BDP then we
* estimate queue is drained; persisting would underutilize the pipe.
*/
return is_full_length ||
inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT);
}
慢启动阶段利用带宽变化判断pipe是否满了,带宽是否到了最大值
/*
检测慢启动阶段带宽是否已经到了最大值.
如果连续三次检查到带宽增长速度小于bbr_full_bw_thresh(25%),
就认为pipe满了,带宽到了最大值
*/
/* Estimate when the pipe is full, using the change in delivery rate: BBR
* estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
* at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
* rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
* higher rwin, 3: we get higher delivery rate samples. Or transient
* cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
* design goal, but uses delay and inter-ACK spacing instead of bandwidth.
*/
static void bbr_check_full_bw_reached(struct sock *sk,
const struct rate_sample *rs)
{
struct bbr *bbr = inet_csk_ca(sk);
u32 bw_thresh;
/*
如果已经标记了带宽满了,或者round_start=0,即还未开始测,或者应用层限制住了,直接退出
*/
if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
return;
/*
如果bbr现在的最大带宽比最近带宽多了设定的阈值bbr_full_bw_thresh,
更新最近带宽bbr->full_bw,并把full_bw_cnt清0
否则full_bw_cnt+1,full_bw_cnt超过3就表示带宽满了
*/
bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
if (bbr_max_bw(sk) >= bw_thresh) {
bbr->full_bw = bbr_max_bw(sk);
bbr->full_bw_cnt = 0;
return;
}
++bbr->full_bw_cnt;
bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt;
}
如果STARTUP状态带宽增到最大,切换到DRAIN状态;如果DRAIN阶段 队列清空,切换到STARTUP状态
/* If pipe is probably full, drain the queue and then enter steady-state. */
static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
{
struct bbr *bbr = inet_csk_ca(sk);
/*
如果慢启动阶段带宽满了(窗口不变,速度减小)
更新状态机到排空状态
更新pacing_gain
保持cwnd_gain
*/
if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
bbr->mode = BBR_DRAIN; /* drain queue we created */
bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */
bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */
} /* fall through to check if in-flight is already small: */
/*
如果排空阶段inflight<=target,即队列已经清空,切换到带宽探测状态机
*/
if (bbr->mode == BBR_DRAIN &&
tcp_packets_in_flight(tcp_sk(sk)) <=
bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */
}
检测是否该进入rtt探测状态以及相应参数更新
/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
* periodically drain the bottleneck queue, to converge to measure the true
* min_rtt (unloaded propagation delay). This allows the flows to keep queues
* small (reducing queuing delay and packet loss) and achieve fairness among
* BBR flows.
*
* The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
* we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
* After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
* round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
* re-enter the previous mode. BBR uses 200ms to approximately bound the
* performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
*
* Note that flows need only pay 2% if they are busy sending over the last 10
* seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
* natural silences or low-rate periods within 10 seconds where the rate is low
* enough for long enough to drain its queue in the bottleneck. We pick up
* these min RTT measurements opportunistically with our min_rtt filter. :-)
*/
static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
bool filter_expired;
/*
最小rtt有效时间为bbr_min_rtt_win_sec*HZ, 即10s, 如果有效时间过了或者新采的rtt更小,
更新最小rtt大小和最小rtt发生的时间
*/
/* Track min RTT seen in the min_rtt_win_sec filter window: */
filter_expired = after(tcp_jiffies32,
bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
if (rs->rtt_us >= 0 &&
(rs->rtt_us <= bbr->min_rtt_us || filter_expired)) {
bbr->min_rtt_us = rs->rtt_us;
bbr->min_rtt_stamp = tcp_jiffies32;
}
/*
如果开启了rtt探测功能,且最小rtt有效时间过了(也可以理解为rtt探测周期到了),
且idle_restart==0(不是从空闲状态重启的),且当前不处在rtt探测状态BBR_PROBE_RTT:
设置状态机为BBR_PROBE_RTT,减小发送速度和发送窗口,保留之前窗口用来恢复,
rtt探测结束时间标记为无效值0(后面会设置具体有效值)
*/
if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
!bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */
bbr->pacing_gain = BBR_UNIT;
bbr->cwnd_gain = BBR_UNIT;
bbr_save_cwnd(sk); /* note cwnd so we can restore it */
bbr->probe_rtt_done_stamp = 0;
}
/*
如果处于rtt探测状态,更新限制
如果probe_rtt_done_stamp=0(结束标记无效),且网络中的包少于bbr_cwnd_min_target
更新rtt探测结束时间,设置probe_rtt_round_done=0(标记rtt探测还没有开始做过),更新下一个rtt的delivered
否则如果probe_rtt_done_stamp!=0(结束标记有效)
如果round_start=1
标记probe_rtt_round_done=1(rtt探测已经开始做了)
如果rtt探测已经生做过了,而且探测时长到了
更新最小rtt计算的时间(用于判断有没有过期,以重新进入rtt探测周期)
标记恢复窗口
重置模型
*/
if (bbr->mode == BBR_PROBE_RTT) {
/* Ignore low rate samples during this mode. */
tp->app_limited =
(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
/* Maintain min packets in flight for max(200 ms, 1 round). */
if (!bbr->probe_rtt_done_stamp &&
tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
bbr->probe_rtt_done_stamp = tcp_jiffies32 +
msecs_to_jiffies(bbr_probe_rtt_mode_ms);
bbr->probe_rtt_round_done = 0;
bbr->next_rtt_delivered = tp->delivered;
} else if (bbr->probe_rtt_done_stamp) {
if (bbr->round_start)
bbr->probe_rtt_round_done = 1;
if (bbr->probe_rtt_round_done &&
after(tcp_jiffies32, bbr->probe_rtt_done_stamp)) {
bbr->min_rtt_stamp = tcp_jiffies32;
bbr->restore_cwnd = 1; /* snap to prior_cwnd */
bbr_reset_mode(sk);
}
}
}
/* Restart after idle ends only once we process a new S/ACK for data */
if (rs->delivered > 0)
bbr->idle_restart = 0;
}