From d2914f5ef267b0f0adb7db21f51c4c4b23e3c6e9 Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Tue, 22 Oct 2024 09:40:17 -0500 Subject: [PATCH 1/7] health: fix spurious warning about DERP home region '0' Updates #13650 Change-Id: I6b0f165f66da3f881a4caa25d2d9936dc2a7f22c Signed-off-by: Brad Fitzpatrick (cherry picked from commit ae5bc88ebea2f96f67e54ba6886c63ee0af14b54) --- health/health.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/health/health.go b/health/health.go index 216535d17c484..3a23a126b4deb 100644 --- a/health/health.go +++ b/health/health.go @@ -1051,11 +1051,15 @@ func (t *Tracker) updateBuiltinWarnablesLocked() { ArgDuration: d.Round(time.Second).String(), }) } - } else { + } else if homeDERP != 0 { t.setUnhealthyLocked(noDERPConnectionWarnable, Args{ ArgDERPRegionID: fmt.Sprint(homeDERP), ArgDERPRegionName: t.derpRegionNameLocked(homeDERP), }) + } else { + // No DERP home yet determined yet. There's probably some + // other problem or things are just starting up. + t.setHealthyLocked(noDERPConnectionWarnable) } if !t.ipnWantRunning { From b73831b2b5385cd092aed4e397bc15a4e8972b72 Mon Sep 17 00:00:00 2001 From: Andrea Gottardo Date: Thu, 31 Oct 2024 12:00:34 -0700 Subject: [PATCH 2/7] net/sockstats: prevent crash in setNetMon (#13985) (cherry picked from commit 6985369479db2c9d5bacccbde6d66630a81eb1ab) --- net/sockstats/sockstats_tsgo.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/sockstats/sockstats_tsgo.go b/net/sockstats/sockstats_tsgo.go index af691302f8be8..fec9ec3b0dad2 100644 --- a/net/sockstats/sockstats_tsgo.go +++ b/net/sockstats/sockstats_tsgo.go @@ -279,7 +279,13 @@ func setNetMon(netMon *netmon.Monitor) { if ifName == "" { return } - ifIndex := state.Interface[ifName].Index + // DefaultRouteInterface and Interface are gathered at different points in time. + // Check for existence first, to avoid a nil pointer dereference. + iface, ok := state.Interface[ifName] + if !ok { + return + } + ifIndex := iface.Index sockStats.mu.Lock() defer sockStats.mu.Unlock() // Ignore changes to unknown interfaces -- it would require From 52807386902b2102db0865d72080bd3d4c20f4d6 Mon Sep 17 00:00:00 2001 From: James Tucker Date: Wed, 30 Oct 2024 11:31:36 -0700 Subject: [PATCH 3/7] net/netcheck: ensure prior preferred DERP is always in netchecks In an environment with unstable latency, such as upstream bufferbloat, there are cases where a full netcheck could drop the prior preferred DERP (likely home DERP) from future netcheck probe plans. This will then likely result in a home DERP having a missing sample on the next incremental netcheck, ultimately resulting in a home DERP move. This change does not fix our overall response to highly unstable latency, but it is an incremental improvement to prevent single spurious samples during a full netcheck from alone triggering a flapping condition, as now the prior changes to include historical latency will still provide the desired resistance, and the home DERP should not move unless latency is consistently worse over a 5 minute period. Note that there is a nomenclature and semantics issue remaining in the difference between a report preferred DERP and a home DERP. A report preferred DERP is aspirational, it is what will be picked as a home DERP if a home DERP connection needs to be established. A nodes home DERP may be different than a recent preferred DERP, in which case a lot of netcheck logic is fallible. In future enhancements much of the DERP move logic should move to consider the home DERP, rather than recent report preferred DERP. Updates #8603 Updates #13969 Signed-off-by: James Tucker --- net/netcheck/netcheck.go | 68 +++++++++++++++++++++++++++-------- net/netcheck/netcheck_test.go | 42 ++++++++++++++++++++-- 2 files changed, 93 insertions(+), 17 deletions(-) diff --git a/net/netcheck/netcheck.go b/net/netcheck/netcheck.go index bebf4c9b05461..2149ac91a8841 100644 --- a/net/netcheck/netcheck.go +++ b/net/netcheck/netcheck.go @@ -391,10 +391,11 @@ type probePlan map[string][]probe // sortRegions returns the regions of dm first sorted // from fastest to slowest (based on the 'last' report), // end in regions that have no data. -func sortRegions(dm *tailcfg.DERPMap, last *Report) (prev []*tailcfg.DERPRegion) { +func sortRegions(dm *tailcfg.DERPMap, last *Report, preferredDERP int) (prev []*tailcfg.DERPRegion) { prev = make([]*tailcfg.DERPRegion, 0, len(dm.Regions)) for _, reg := range dm.Regions { - if reg.Avoid { + // include an otherwise avoid region if it is the current preferred region + if reg.Avoid && reg.RegionID != preferredDERP { continue } prev = append(prev, reg) @@ -419,9 +420,19 @@ func sortRegions(dm *tailcfg.DERPMap, last *Report) (prev []*tailcfg.DERPRegion) // a full report, all regions are scanned.) const numIncrementalRegions = 3 -// makeProbePlan generates the probe plan for a DERPMap, given the most -// recent report and whether IPv6 is configured on an interface. -func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report) (plan probePlan) { +// makeProbePlan generates the probe plan for a DERPMap, given the most recent +// report and the current home DERP. preferredDERP is passed independently of +// last (report) because last is currently nil'd to indicate a desire for a full +// netcheck. +// +// TODO(raggi,jwhited): refactor the callers and this function to be more clear +// about full vs. incremental netchecks, and remove the need for the history +// hiding. This was avoided in an incremental change due to exactly this kind of +// distant coupling. +// TODO(raggi): change from "preferred DERP" from a historical report to "home +// DERP" as in what DERP is the current home connection, this would further +// reduce flap events. +func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report, preferredDERP int) (plan probePlan) { if last == nil || len(last.RegionLatency) == 0 { return makeProbePlanInitial(dm, ifState) } @@ -432,9 +443,34 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report) (pl had4 := len(last.RegionV4Latency) > 0 had6 := len(last.RegionV6Latency) > 0 hadBoth := have6if && had4 && had6 - for ri, reg := range sortRegions(dm, last) { - if ri == numIncrementalRegions { - break + // #13969 ensure that the home region is always probed. + // If a netcheck has unstable latency, such as a user with large amounts of + // bufferbloat or a highly congested connection, there are cases where a full + // netcheck may observe a one-off high latency to the current home DERP. Prior + // to the forced inclusion of the home DERP, this would result in an + // incremental netcheck following such an event to cause a home DERP move, with + // restoration back to the home DERP on the next full netcheck ~5 minutes later + // - which is highly disruptive when it causes shifts in geo routed subnet + // routers. By always including the home DERP in the incremental netcheck, we + // ensure that the home DERP is always probed, even if it observed a recenet + // poor latency sample. This inclusion enables the latency history checks in + // home DERP selection to still take effect. + // planContainsHome indicates whether the home DERP has been added to the probePlan, + // if there is no prior home, then there's no home to additionally include. + planContainsHome := preferredDERP == 0 + for ri, reg := range sortRegions(dm, last, preferredDERP) { + regIsHome := reg.RegionID == preferredDERP + if ri >= numIncrementalRegions { + // planned at least numIncrementalRegions regions and that includes the + // last home region (or there was none), plan complete. + if planContainsHome { + break + } + // planned at least numIncrementalRegions regions, but not the home region, + // check if this is the home region, if not, skip it. + if !regIsHome { + continue + } } var p4, p6 []probe do4 := have4if @@ -445,7 +481,7 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report) (pl tries := 1 isFastestTwo := ri < 2 - if isFastestTwo { + if isFastestTwo || regIsHome { tries = 2 } else if hadBoth { // For dual stack machines, make the 3rd & slower nodes alternate @@ -456,14 +492,15 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report) (pl do4, do6 = false, true } } - if !isFastestTwo && !had6 { + if !regIsHome && !isFastestTwo && !had6 { do6 = false } - if reg.RegionID == last.PreferredDERP { + if regIsHome { // But if we already had a DERP home, try extra hard to // make sure it's there so we don't flip flop around. tries = 4 + planContainsHome = true } for try := 0; try < tries; try++ { @@ -788,9 +825,10 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap, opts *GetRe c.curState = rs last := c.last - // Even if we're doing a non-incremental update, we may want to try our - // preferred DERP region for captive portal detection. Save that, if we - // have it. + // Extract preferredDERP from the last report, if available. This will be used + // in captive portal detection and DERP flapping suppression. Ideally this would + // be the current active home DERP rather than the last report preferred DERP, + // but only the latter is presently available. var preferredDERP int if last != nil { preferredDERP = last.PreferredDERP @@ -847,7 +885,7 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap, opts *GetRe var plan probePlan if opts == nil || !opts.OnlyTCP443 { - plan = makeProbePlan(dm, ifState, last) + plan = makeProbePlan(dm, ifState, last, preferredDERP) } // If we're doing a full probe, also check for a captive portal. We diff --git a/net/netcheck/netcheck_test.go b/net/netcheck/netcheck_test.go index 02076f8d468e1..b1256be10b059 100644 --- a/net/netcheck/netcheck_test.go +++ b/net/netcheck/netcheck_test.go @@ -576,6 +576,40 @@ func TestMakeProbePlan(t *testing.T) { "region-3-v4": []probe{p("3a", 4)}, }, }, + { + // #13969: ensure that the prior/current home region is always included in + // probe plans, so that we don't flap between regions due to a single major + // netcheck having excluded the home region due to a spuriously high sample. + name: "ensure_home_region_inclusion", + dm: basicMap, + have6if: true, + last: &Report{ + RegionLatency: map[int]time.Duration{ + 1: 50 * time.Millisecond, + 2: 20 * time.Millisecond, + 3: 30 * time.Millisecond, + 4: 40 * time.Millisecond, + }, + RegionV4Latency: map[int]time.Duration{ + 1: 50 * time.Millisecond, + 2: 20 * time.Millisecond, + }, + RegionV6Latency: map[int]time.Duration{ + 3: 30 * time.Millisecond, + 4: 40 * time.Millisecond, + }, + PreferredDERP: 1, + }, + want: probePlan{ + "region-1-v4": []probe{p("1a", 4), p("1a", 4, 60*ms), p("1a", 4, 220*ms), p("1a", 4, 330*ms)}, + "region-1-v6": []probe{p("1a", 6), p("1a", 6, 60*ms), p("1a", 6, 220*ms), p("1a", 6, 330*ms)}, + "region-2-v4": []probe{p("2a", 4), p("2b", 4, 24*ms)}, + "region-2-v6": []probe{p("2a", 6), p("2b", 6, 24*ms)}, + "region-3-v4": []probe{p("3a", 4), p("3b", 4, 36*ms)}, + "region-3-v6": []probe{p("3a", 6), p("3b", 6, 36*ms)}, + "region-4-v4": []probe{p("4a", 4)}, + }, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -583,7 +617,11 @@ func TestMakeProbePlan(t *testing.T) { HaveV6: tt.have6if, HaveV4: !tt.no4, } - got := makeProbePlan(tt.dm, ifState, tt.last) + preferredDERP := 0 + if tt.last != nil { + preferredDERP = tt.last.PreferredDERP + } + got := makeProbePlan(tt.dm, ifState, tt.last, preferredDERP) if !reflect.DeepEqual(got, tt.want) { t.Errorf("unexpected plan; got:\n%v\nwant:\n%v\n", got, tt.want) } @@ -756,7 +794,7 @@ func TestSortRegions(t *testing.T) { report.RegionLatency[3] = time.Second * time.Duration(6) report.RegionLatency[4] = time.Second * time.Duration(0) report.RegionLatency[5] = time.Second * time.Duration(2) - sortedMap := sortRegions(unsortedMap, report) + sortedMap := sortRegions(unsortedMap, report, 0) // Sorting by latency this should result in rid: 5, 2, 1, 3 // rid 4 with latency 0 should be at the end From 0472936f56f12c7d86a44c8ccaae84bbb7df590c Mon Sep 17 00:00:00 2001 From: Tim Walters Date: Wed, 23 Oct 2024 14:27:00 -0500 Subject: [PATCH 4/7] wgengine/magicsock: log home DERP changes with latency This adds additional logging on DERP home changes to allow better troubleshooting. Updates tailscale/corp#18095 Signed-off-by: Tim Walters (cherry picked from commit 856ea2376b59df8f84f96119559d4273588a04ac) --- wgengine/magicsock/derp.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/wgengine/magicsock/derp.go b/wgengine/magicsock/derp.go index 69c5cbc90a5f4..1a42c3b610f84 100644 --- a/wgengine/magicsock/derp.go +++ b/wgengine/magicsock/derp.go @@ -158,10 +158,10 @@ func (c *Conn) maybeSetNearestDERP(report *netcheck.Report) (preferredDERP int) } else { connectedToControl = c.health.GetInPollNetMap() } + c.mu.Lock() + myDerp := c.myDerp + c.mu.Unlock() if !connectedToControl { - c.mu.Lock() - myDerp := c.myDerp - c.mu.Unlock() if myDerp != 0 { metricDERPHomeNoChangeNoControl.Add(1) return myDerp @@ -178,6 +178,11 @@ func (c *Conn) maybeSetNearestDERP(report *netcheck.Report) (preferredDERP int) // one. preferredDERP = c.pickDERPFallback() } + if preferredDERP != myDerp { + c.logf( + "magicsock: home DERP changing from derp-%d [%dms] to derp-%d [%dms]", + c.myDerp, report.RegionLatency[myDerp].Milliseconds(), preferredDERP, report.RegionLatency[preferredDERP].Milliseconds()) + } if !c.setNearestDERP(preferredDERP) { preferredDERP = 0 } From 666c9618f7276688508798207f0d5e03d27448e5 Mon Sep 17 00:00:00 2001 From: Andrea Gottardo Date: Fri, 1 Nov 2024 12:25:37 -0700 Subject: [PATCH 5/7] VERSION.txt: this is v1.76.4 Signed-off-by: Andrea Gottardo --- VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION.txt b/VERSION.txt index d729e2953dbfe..0ffefc3f90751 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -1.76.3 +1.76.4 From dda46031678168a5019da7b3de39c1fa4a774a0b Mon Sep 17 00:00:00 2001 From: Andrea Gottardo Date: Fri, 1 Nov 2024 15:07:58 -0700 Subject: [PATCH 6/7] VERSION.txt: this is v1.76.5 Signed-off-by: Andrea Gottardo --- VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION.txt b/VERSION.txt index 0ffefc3f90751..3592eea3cf25c 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -1.76.4 +1.76.5 From 1edcf9d466ceafedd2816db1a24d5ba4b0b18a5b Mon Sep 17 00:00:00 2001 From: Jonathan Nobels Date: Mon, 4 Nov 2024 15:07:25 -0500 Subject: [PATCH 7/7] VERSION.txt: this is v1.76.6 Signed-off-by: Jonathan Nobels --- VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION.txt b/VERSION.txt index 3592eea3cf25c..294b29f8a603f 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -1.76.5 +1.76.6