Skip to content

Commit 7954a7d

Browse files
spurious loss detection and recovery
1 parent a225ec8 commit 7954a7d

File tree

6 files changed

+92
-2
lines changed

6 files changed

+92
-2
lines changed

quinn-proto/src/congestion.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,16 @@ pub trait Controller: Send + Sync {
5656
now: Instant,
5757
sent: Instant,
5858
is_persistent_congestion: bool,
59+
is_ecn: bool,
5960
lost_bytes: u64,
6061
);
6162

63+
/// Packets were incorrectly deemed lost
64+
///
65+
/// This function is called when all packets that were deemed lost (for instance because
66+
/// of packet reordering) are acknowledged after the congestion event was raised.
67+
fn on_spurious_congestion_event(&mut self) {}
68+
6269
/// The known MTU for the current network path has been updated
6370
fn on_mtu_update(&mut self, new_mtu: u16);
6471

quinn-proto/src/congestion/bbr/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -465,6 +465,7 @@ impl Controller for Bbr {
465465
_now: Instant,
466466
_sent: Instant,
467467
_is_persistent_congestion: bool,
468+
_is_ecn: bool,
468469
lost_bytes: u64,
469470
) {
470471
self.loss_state.lost_bytes += lost_bytes;

quinn-proto/src/congestion/cubic.rs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,10 @@ impl State {
7575
#[derive(Debug, Clone)]
7676
pub struct Cubic {
7777
config: Arc<CubicConfig>,
78-
state: State,
7978
current_mtu: u64,
79+
state: State,
80+
/// Copy of the controller state to restore when a spurious congestion event is detected.
81+
pre_congestion_state: Option<State>,
8082
}
8183

8284
impl Cubic {
@@ -89,6 +91,7 @@ impl Cubic {
8991
..Default::default()
9092
},
9193
current_mtu: current_mtu as u64,
94+
pre_congestion_state: None,
9295
config,
9396
}
9497
}
@@ -176,6 +179,7 @@ impl Controller for Cubic {
176179
now: Instant,
177180
sent: Instant,
178181
is_persistent_congestion: bool,
182+
is_ecn: bool,
179183
_lost_bytes: u64,
180184
) {
181185
if self
@@ -187,6 +191,11 @@ impl Controller for Cubic {
187191
return;
188192
}
189193

194+
// Save state in case this event ends up being spurious
195+
if !is_ecn {
196+
self.pre_congestion_state = Some(self.state.clone());
197+
}
198+
190199
self.state.recovery_start_time = Some(now);
191200

192201
// Fast convergence
@@ -221,6 +230,14 @@ impl Controller for Cubic {
221230
}
222231
}
223232

233+
fn on_spurious_congestion_event(&mut self) {
234+
if let Some(prior_state) = self.pre_congestion_state.take() {
235+
if self.state.window < prior_state.window {
236+
self.state = prior_state;
237+
}
238+
}
239+
}
240+
224241
fn on_mtu_update(&mut self, new_mtu: u16) {
225242
self.current_mtu = new_mtu as u64;
226243
self.state.window = self.state.window.max(self.minimum_window());

quinn-proto/src/congestion/new_reno.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ impl Controller for NewReno {
8787
now: Instant,
8888
sent: Instant,
8989
is_persistent_congestion: bool,
90+
_is_ecn: bool,
9091
_lost_bytes: u64,
9192
) {
9293
if sent <= self.recovery_start_time {

quinn-proto/src/connection/mod.rs

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ use crate::{
2222
cid_queue::CidQueue,
2323
coding::BufMutExt,
2424
config::{ServerConfig, TransportConfig},
25+
connection::spaces::LostPacket,
2526
crypto::{self, KeyPair, Keys, PacketKey},
2627
frame::{self, Close, Datagram, FrameStruct, NewConnectionId, NewToken},
2728
packet::{
@@ -1461,6 +1462,10 @@ impl Connection {
14611462
}
14621463
};
14631464

1465+
if self.detect_spurious_loss(&ack, space) {
1466+
self.path.congestion.on_spurious_congestion_event();
1467+
}
1468+
14641469
// Avoid DoS from unreasonably huge ack ranges by filtering out just the new acks.
14651470
let mut newly_acked = ArrayRangeSet::new();
14661471
for range in ack.iter() {
@@ -1555,6 +1560,43 @@ impl Connection {
15551560
Ok(())
15561561
}
15571562

1563+
fn detect_spurious_loss(&mut self, ack: &frame::Ack, space: SpaceId) -> bool {
1564+
let lost_packets = &mut self.spaces[space].lost_packets;
1565+
1566+
if lost_packets.is_empty() {
1567+
return false;
1568+
}
1569+
1570+
for range in ack.iter() {
1571+
let spurious_losses: Vec<u64> = lost_packets
1572+
.range(range.clone())
1573+
.map(|(pn, _info)| pn)
1574+
.copied()
1575+
.collect();
1576+
1577+
for pn in spurious_losses {
1578+
lost_packets.remove(&pn);
1579+
}
1580+
}
1581+
1582+
// If this ACK frame acknowledged all deemed lost packets,
1583+
// then we have raised a spurious congestion event in the past.
1584+
// We cannot conclude when there are remaining packets,
1585+
// but future ACK frames might indicate a spurious loss detection.
1586+
lost_packets.is_empty()
1587+
}
1588+
1589+
/// Drain lost packets that we reasonably think will never arrive
1590+
///
1591+
/// The current criterion is copied from `msquic`:
1592+
/// discard packets that were sent earlier than 2 probe timeouts ago.
1593+
fn drain_lost_packets(&mut self, now: Instant, space: SpaceId) {
1594+
let two_pto = 2 * self.path.rtt.pto_base();
1595+
1596+
let lost_packets = &mut self.spaces[space].lost_packets;
1597+
lost_packets.retain(|_pn, info| now.saturating_duration_since(info.time_sent) <= two_pto);
1598+
}
1599+
15581600
/// Process a new ECN block from an in-order ACK
15591601
fn process_ecn(
15601602
&mut self,
@@ -1577,7 +1619,7 @@ impl Connection {
15771619
self.stats.path.congestion_events += 1;
15781620
self.path
15791621
.congestion
1580-
.on_congestion_event(now, largest_sent_time, false, 0);
1622+
.on_congestion_event(now, largest_sent_time, false, true, 0);
15811623
}
15821624
}
15831625
}
@@ -1735,6 +1777,8 @@ impl Connection {
17351777
prev_packet = Some(packet);
17361778
}
17371779

1780+
self.drain_lost_packets(now, pn_space);
1781+
17381782
// OnPacketsLost
17391783
if let Some(largest_lost) = lost_packets.last().cloned() {
17401784
let old_bytes_in_flight = self.path.in_flight.bytes;
@@ -1756,12 +1800,20 @@ impl Connection {
17561800
now,
17571801
self.orig_rem_cid,
17581802
);
1803+
17591804
self.remove_in_flight(&info);
17601805
for frame in info.stream_frames {
17611806
self.streams.retransmit(frame);
17621807
}
17631808
self.spaces[pn_space].pending |= info.retransmits;
17641809
self.path.mtud.on_non_probe_lost(packet, info.size);
1810+
1811+
self.spaces[pn_space].lost_packets.insert(
1812+
packet,
1813+
LostPacket {
1814+
time_sent: info.time_sent,
1815+
},
1816+
);
17651817
}
17661818

17671819
if self.path.mtud.black_hole_detected(now) {
@@ -1783,6 +1835,7 @@ impl Connection {
17831835
now,
17841836
largest_lost_sent,
17851837
in_persistent_congestion,
1838+
false,
17861839
size_of_lost_packets,
17871840
);
17881841
}

quinn-proto/src/connection/spaces.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ pub(super) struct PacketSpace {
3939
/// Transmitted but not acked
4040
// We use a BTreeMap here so we can efficiently query by range on ACK and for loss detection
4141
pub(super) sent_packets: BTreeMap<u64, SentPacket>,
42+
/// Packets that were deemed lost
43+
// Older packets are regularly removed in `Connection::drain_lost_packets`.
44+
pub(super) lost_packets: BTreeMap<u64, LostPacket>,
4245
/// Number of explicit congestion notification codepoints seen on incoming packets
4346
pub(super) ecn_counters: frame::EcnCounts,
4447
/// Recent ECN counters sent by the peer in ACK frames
@@ -84,6 +87,7 @@ impl PacketSpace {
8487
largest_ack_eliciting_sent: 0,
8588
unacked_non_ack_eliciting_tail: 0,
8689
sent_packets: BTreeMap::new(),
90+
lost_packets: BTreeMap::new(),
8791
ecn_counters: frame::EcnCounts::ZERO,
8892
ecn_feedback: frame::EcnCounts::ZERO,
8993

@@ -302,6 +306,13 @@ pub(super) struct SentPacket {
302306
pub(super) stream_frames: frame::StreamMetaVec,
303307
}
304308

309+
/// Represents one or more packets that are deemed lost.
310+
#[derive(Debug)]
311+
pub(super) struct LostPacket {
312+
/// The time the packet was sent.
313+
pub(super) time_sent: Instant,
314+
}
315+
305316
/// Retransmittable data queue
306317
#[allow(unreachable_pub)] // fuzzing only
307318
#[derive(Debug, Default, Clone)]

0 commit comments

Comments
 (0)