From 32d7afd1472c3cce509a3455b40a575540eac780 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bram=20Bonn=C3=A9?= Date: Fri, 30 Apr 2021 11:50:19 +0200 Subject: [PATCH 001/116] ANDROID: selinux: modify RTM_GETNEIGH{TBL} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Map the permission gating RTM_GETNEIGH/RTM_GETNEIGHTBL messages to a new permission so that it can be distinguished from the other netlink route permissions in selinux policy. The new permission is triggered by a flag set in system images T and up. This change is intended to be backported to all kernels that a T system image can run on top of. Bug: 171572148 Test: atest NetworkInterfaceTest Test: atest CtsSelinuxTargetSdkCurrentTestCases Test: atest bionic-unit-tests-static Test: On Cuttlefish, run combinations of: - Policy bit set or omitted (see https://r.android.com/1701847) - This patch applied or omitted - App having nlmsg_readneigh permission or not Verify that only the combination of this patch + the policy bit being set + the app not having the nlmsg_readneigh permission prevents the app from sending RTM_GETNEIGH messages. Change-Id: I4bcfce4decb34ea9388eeedfc4be67403de8a980 Signed-off-by: Bram Bonné (cherry picked from commit fac07550bdac9adea0dbe3edbdbec7a9a690a178) --- security/selinux/include/classmap.h | 3 ++- security/selinux/include/security.h | 9 +++++++++ security/selinux/nlmsgtab.c | 24 ++++++++++++++++++++---- security/selinux/ss/policydb.c | 4 ++++ security/selinux/ss/policydb.h | 2 ++ security/selinux/ss/services.c | 1 + 6 files changed, 38 insertions(+), 5 deletions(-) diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 34631690b5f9..b52c7e1d7a84 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -115,7 +115,8 @@ struct security_class_mapping secclass_map[] = { { COMMON_IPC_PERMS, NULL } }, { "netlink_route_socket", { COMMON_SOCK_PERMS, - "nlmsg_read", "nlmsg_write", "nlmsg_readpriv", NULL } }, + "nlmsg_read", "nlmsg_write", "nlmsg_readpriv", "nlmsg_getneigh", + NULL } }, { "netlink_tcpdiag_socket", { COMMON_SOCK_PERMS, "nlmsg_read", "nlmsg_write", NULL } }, diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index f64e33b23cd9..d9a64b3839c6 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -105,6 +105,8 @@ struct selinux_state { bool initialized; bool policycap[__POLICYDB_CAPABILITY_MAX]; bool android_netlink_route; + bool android_netlink_getneigh; + struct selinux_avc *avc; struct selinux_ss *ss; }; @@ -184,6 +186,13 @@ static inline bool selinux_android_nlroute_getlink(void) return state->android_netlink_route; } +static inline bool selinux_android_nlroute_getneigh(void) +{ + struct selinux_state *state = &selinux_state; + + return state->android_netlink_getneigh; +} + int security_mls_enabled(struct selinux_state *state); int security_load_policy(struct selinux_state *state, void *data, size_t len); diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 6a93edf01cfb..90561f657346 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -196,12 +196,12 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) return err; } -static void nlmsg_set_getlink_perm(u32 perm) +static void nlmsg_set_perm_for_type(u32 perm, u16 type) { int i; for (i = 0; i < ARRAY_SIZE(nlmsg_route_perms); i++) { - if (nlmsg_route_perms[i].nlmsg_type == RTM_GETLINK) { + if (nlmsg_route_perms[i].nlmsg_type == type) { nlmsg_route_perms[i].perm = perm; break; } @@ -211,11 +211,27 @@ static void nlmsg_set_getlink_perm(u32 perm) /** * Use nlmsg_readpriv as the permission for RTM_GETLINK messages if the * netlink_route_getlink policy capability is set. Otherwise use nlmsg_read. + * Similarly, use nlmsg_getneigh for RTM_GETNEIGH and RTM_GETNEIGHTBL if the + * netlink_route_getneigh policy capability is set. Otherwise use nlmsg_read. */ void selinux_nlmsg_init(void) { if (selinux_android_nlroute_getlink()) - nlmsg_set_getlink_perm(NETLINK_ROUTE_SOCKET__NLMSG_READPRIV); + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_READPRIV, + RTM_GETLINK); else - nlmsg_set_getlink_perm(NETLINK_ROUTE_SOCKET__NLMSG_READ); + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_READ, + RTM_GETLINK); + + if (selinux_android_nlroute_getneigh()) { + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_GETNEIGH, + RTM_GETNEIGH); + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_GETNEIGH, + RTM_GETNEIGHTBL); + } else { + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_READ, + RTM_GETNEIGH); + nlmsg_set_perm_for_type(NETLINK_ROUTE_SOCKET__NLMSG_READ, + RTM_GETNEIGHTBL); + } } diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index 2472b2a66f70..0986c2363e96 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -2404,6 +2404,10 @@ int policydb_read(struct policydb *p, void *fp) p->android_netlink_route = 1; } + if ((le32_to_cpu(buf[1]) & POLICYDB_CONFIG_ANDROID_NETLINK_GETNEIGH)) { + p->android_netlink_getneigh = 1; + } + if (p->policyvers >= POLICYDB_VERSION_POLCAP) { rc = ebitmap_read(&p->policycaps, fp); if (rc) diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h index dbb0ed57ed8b..9423952faf7b 100644 --- a/security/selinux/ss/policydb.h +++ b/security/selinux/ss/policydb.h @@ -239,6 +239,7 @@ struct genfs { struct policydb { int mls_enabled; int android_netlink_route; + int android_netlink_getneigh; /* symbol tables */ struct symtab symtab[SYM_NUM]; @@ -326,6 +327,7 @@ extern int policydb_write(struct policydb *p, void *fp); #define POLICYDB_CONFIG_MLS 1 #define POLICYDB_CONFIG_ANDROID_NETLINK_ROUTE (1 << 31) +#define POLICYDB_CONFIG_ANDROID_NETLINK_GETNEIGH (1 << 30) /* the config flags related to unknown classes/perms are bits 2 and 3 */ #define REJECT_UNKNOWN 0x00000002 diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 289a9f5672a4..09488e27534b 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -2109,6 +2109,7 @@ static void security_load_policycaps(struct selinux_state *state) } state->android_netlink_route = p->android_netlink_route; + state->android_netlink_getneigh = p->android_netlink_getneigh; selinux_nlmsg_init(); } From 921be923551be5d7849bdc488617baacd39e2e4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Wed, 16 Jun 2021 17:09:51 -0700 Subject: [PATCH 002/116] FROMGIT: bpf: Do not change gso_size during bpf_skb_change_proto() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is technically a backwards incompatible change in behaviour, but I'm going to argue that it is very unlikely to break things, and likely to fix *far* more then it breaks. In no particular order, various reasons follow: (a) I've long had a bug assigned to myself to debug a super rare kernel crash on Android Pixel phones which can (per stacktrace) be traced back to BPF clat IPv6 to IPv4 protocol conversion causing some sort of ugly failure much later on during transmit deep in the GSO engine, AFAICT precisely because of this change to gso_size, though I've never been able to manually reproduce it. I believe it may be related to the particular network offload support of attached USB ethernet dongle being used for tethering off of an IPv6-only cellular connection. The reason might be we end up with more segments than max permitted, or with a GSO packet with only one segment... (either way we break some assumption and hit a BUG_ON) (b) There is no check that the gso_size is > 20 when reducing it by 20, so we might end up with a negative (or underflowing) gso_size or a gso_size of 0. This can't possibly be good. Indeed this is probably somehow exploitable (or at least can result in a kernel crash) by delivering crafted packets and perhaps triggering an infinite loop or a divide by zero... As a reminder: gso_size (MSS) is related to MTU, but not directly derived from it: gso_size/MSS may be significantly smaller then one would get by deriving from local MTU. And on some NICs (which do loose MTU checking on receive, it may even potentially be larger, for example my work pc with 1500 MTU can receive 1520 byte frames [and sometimes does due to bugs in a vendor plat46 implementation]). Indeed even just going from 21 to 1 is potentially problematic because it increases the number of segments by a factor of 21 (think DoS, or some other crash due to too many segments). (c) It's always safe to not increase the gso_size, because it doesn't result in the max packet size increasing. So the skb_increase_gso_size() call was always unnecessary for correctness (and outright undesirable, see later). As such the only part which is potentially dangerous (ie. could cause backwards compatibility issues) is the removal of the skb_decrease_gso_size() call. (d) If the packets are ultimately destined to the local device, then there is absolutely no benefit to playing around with gso_size. It only matters if the packets will egress the device. ie. we're either forwarding, or transmitting from the device. (e) This logic only triggers for packets which are GSO. It does not trigger for skbs which are not GSO. It will not convert a non-GSO MTU sized packet into a GSO packet (and you don't even know what the MTU is, so you can't even fix it). As such your transmit path must *already* be able to handle an MTU 20 bytes larger then your receive path (for IPv4 to IPv6 translation) - and indeed 28 bytes larger due to IPv4 fragments. Thus removing the skb_decrease_gso_size() call doesn't actually increase the size of the packets your transmit side must be able to handle. ie. to handle non-GSO max-MTU packets, the IPv4/IPv6 device/ route MTUs must already be set correctly. Since for example with an IPv4 egress MTU of 1500, IPv4 to IPv6 translation will already build 1520 byte IPv6 frames, so you need a 1520 byte device MTU. This means if your IPv6 device's egress MTU is 1280, your IPv4 route must be 1260 (and actually 1252, because of the need to handle fragments). This is to handle normal non-GSO packets. Thus the reduction is simply not needed for GSO packets, because when they're correctly built, they will already be the right size. (f) TSO/GSO should be able to exactly undo GRO: the number of packets (TCP segments) should not be modified, so that TCP's MSS counting works correctly (this matters for congestion control). If protocol conversion changes the gso_size, then the number of TCP segments may increase or decrease. Packet loss after protocol conversion can result in partial loss of MSS segments that the sender sent. How's the sending TCP stack going to react to receiving ACKs/SACKs in the middle of the segments it sent? (g) skb_{decrease,increase}_gso_size() are already no-ops for GSO_BY_FRAGS case (besides triggering WARN_ON_ONCE). This means you already cannot guarantee that gso_size (and thus resulting packet MTU) is changed. ie. you must assume it won't be changed. (h) changing gso_size is outright buggy for UDP GSO packets, where framing matters (I believe that's also the case for SCTP, but it's already excluded by [g]). So the only remaining case is TCP, which also doesn't want it (see [f]). (i) see also the reasoning on the previous attempt at fixing this (commit fa7b83bf3b156c767f3e4a25bbf3817b08f3ff8e) which shows that the current behaviour causes TCP packet loss: In the forwarding path GRO -> BPF 6 to 4 -> GSO for TCP traffic, the coalesced packet payload can be > MSS, but < MSS + 20. bpf_skb_proto_6_to_4() will upgrade the MSS and it can be > the payload length. After then tcp_gso_segment checks for the payload length if it is <= MSS. The condition is causing the packet to be dropped. tcp_gso_segment(): [...] mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) goto out; [...] Thus changing the gso_size is simply a very bad idea. Increasing is unnecessary and buggy, and decreasing can go negative. Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper") Signed-off-by: Maciej Żenczykowski Signed-off-by: Daniel Borkmann Cc: Dongseok Yi Cc: Willem de Bruijn Link: https://lore.kernel.org/bpf/CANP3RGfjLikQ6dg=YpBU0OeHvyv7JOki7CyOUS9modaXAi-9vQ@mail.gmail.com Link: https://lore.kernel.org/bpf/20210617000953.2787453-2-zenczykowski@gmail.com (cherry picked from commit 364745fbe981a4370f50274475da4675661104df https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=364745fbe981a4370f50274475da4675661104df ) Test: builds, TreeHugger Bug: 188690383 Signed-off-by: Maciej Żenczykowski Change-Id: I0ef3174cbd3caaa42d5779334a9c0bfdc9ab81f5 --- net/core/filter.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 98b8057e2382..df2e40c10ec8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2097,8 +2097,6 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; } - /* Due to IPv6 header, MSS needs to be downgraded. */ - skb_shinfo(skb)->gso_size -= len_diff; /* Header must be checked, and gso_segs recomputed. */ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; skb_shinfo(skb)->gso_segs = 0; @@ -2133,8 +2131,6 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; } - /* Due to IPv4 header, MSS can be upgraded. */ - skb_shinfo(skb)->gso_size += len_diff; /* Header must be checked, and gso_segs recomputed. */ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; skb_shinfo(skb)->gso_segs = 0; From c6998ccfefa652bac3f9b236821e392af43efa1e Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Fri, 23 Apr 2021 00:02:13 -0400 Subject: [PATCH 003/116] net: ieee802154: fix null deref in parse dev addr [ Upstream commit 9fdd04918a452980631ecc499317881c1d120b70 ] Fix a logic error that could result in a null deref if the user sets the mode incorrectly for the given addr type. Signed-off-by: Dan Robertson Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20210423040214.15438-2-dan@dlrobertson.com Signed-off-by: Stefan Schmidt Signed-off-by: Sasha Levin --- net/ieee802154/nl802154.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index b1c55db73764..6d4c71a52b6b 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -1315,19 +1315,20 @@ ieee802154_llsec_parse_dev_addr(struct nlattr *nla, nl802154_dev_addr_policy, NULL)) return -EINVAL; - if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] || - !attrs[NL802154_DEV_ADDR_ATTR_MODE] || - !(attrs[NL802154_DEV_ADDR_ATTR_SHORT] || - attrs[NL802154_DEV_ADDR_ATTR_EXTENDED])) + if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] || !attrs[NL802154_DEV_ADDR_ATTR_MODE]) return -EINVAL; addr->pan_id = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_PAN_ID]); addr->mode = nla_get_u32(attrs[NL802154_DEV_ADDR_ATTR_MODE]); switch (addr->mode) { case NL802154_DEV_ADDR_SHORT: + if (!attrs[NL802154_DEV_ADDR_ATTR_SHORT]) + return -EINVAL; addr->short_addr = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_SHORT]); break; case NL802154_DEV_ADDR_EXTENDED: + if (!attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]) + return -EINVAL; addr->extended_addr = nla_get_le64(attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]); break; default: From ca59ef64aebe65c62517aecb19d956addd507f43 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 15 Apr 2021 11:52:31 -0700 Subject: [PATCH 004/116] HID: hid-sensor-hub: Return error for hid_set_field() failure [ Upstream commit edb032033da0dc850f6e7740fa1023c73195bc89 ] In the function sensor_hub_set_feature(), return error when hid_set_field() fails. Signed-off-by: Srinivas Pandruvada Acked-by: Jonathan Cameron Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin --- drivers/hid/hid-sensor-hub.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/hid/hid-sensor-hub.c b/drivers/hid/hid-sensor-hub.c index aa078c1dad14..6c7e12d8e7d9 100644 --- a/drivers/hid/hid-sensor-hub.c +++ b/drivers/hid/hid-sensor-hub.c @@ -223,16 +223,21 @@ int sensor_hub_set_feature(struct hid_sensor_hub_device *hsdev, u32 report_id, buffer_size = buffer_size / sizeof(__s32); if (buffer_size) { for (i = 0; i < buffer_size; ++i) { - hid_set_field(report->field[field_index], i, - (__force __s32)cpu_to_le32(*buf32)); + ret = hid_set_field(report->field[field_index], i, + (__force __s32)cpu_to_le32(*buf32)); + if (ret) + goto done_proc; + ++buf32; } } if (remaining_bytes) { value = 0; memcpy(&value, (u8 *)buf32, remaining_bytes); - hid_set_field(report->field[field_index], i, - (__force __s32)cpu_to_le32(value)); + ret = hid_set_field(report->field[field_index], i, + (__force __s32)cpu_to_le32(value)); + if (ret) + goto done_proc; } hid_hw_request(hsdev->hdev, report, HID_REQ_SET_REPORT); hid_hw_wait(hsdev->hdev); From 43ab41d973e3e3be389a2819477d8054b9e61903 Mon Sep 17 00:00:00 2001 From: Mark Bolhuis Date: Mon, 3 May 2021 17:39:38 +0100 Subject: [PATCH 005/116] HID: Add BUS_VIRTUAL to hid_connect logging [ Upstream commit 48e33befe61a7d407753c53d1a06fc8d6b5dab80 ] Add BUS_VIRTUAL to hid_connect logging since it's a valid hid bus type and it should not print Signed-off-by: Mark Bolhuis Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin --- drivers/hid/hid-core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 71ee1267d2ef..381ab96c1e38 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1824,6 +1824,9 @@ int hid_connect(struct hid_device *hdev, unsigned int connect_mask) case BUS_I2C: bus = "I2C"; break; + case BUS_VIRTUAL: + bus = "VIRTUAL"; + break; default: bus = ""; } From 8c064eece9a51856f3f275104520c7e3017fc5c0 Mon Sep 17 00:00:00 2001 From: Anirudh Rayabharam Date: Sun, 25 Apr 2021 23:03:53 +0530 Subject: [PATCH 006/116] HID: usbhid: fix info leak in hid_submit_ctrl [ Upstream commit 6be388f4a35d2ce5ef7dbf635a8964a5da7f799f ] In hid_submit_ctrl(), the way of calculating the report length doesn't take into account that report->size can be zero. When running the syzkaller reproducer, a report of size 0 causes hid_submit_ctrl) to calculate transfer_buffer_length as 16384. When this urb is passed to the usb core layer, KMSAN reports an info leak of 16384 bytes. To fix this, first modify hid_report_len() to account for the zero report size case by using DIV_ROUND_UP for the division. Then, call it from hid_submit_ctrl(). Reported-by: syzbot+7c2bb71996f95a82524c@syzkaller.appspotmail.com Signed-off-by: Anirudh Rayabharam Acked-by: Benjamin Tissoires Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin --- drivers/hid/usbhid/hid-core.c | 2 +- include/linux/hid.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 98916fb4191a..46b8f4c353de 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -373,7 +373,7 @@ static int hid_submit_ctrl(struct hid_device *hid) raw_report = usbhid->ctrl[usbhid->ctrltail].raw_report; dir = usbhid->ctrl[usbhid->ctrltail].dir; - len = ((report->size - 1) >> 3) + 1 + (report->id > 0); + len = hid_report_len(report); if (dir == USB_DIR_OUT) { usbhid->urbctrl->pipe = usb_sndctrlpipe(hid_to_usb_dev(hid), 0); usbhid->urbctrl->transfer_buffer_length = len; diff --git a/include/linux/hid.h b/include/linux/hid.h index d07fe33a9045..5a2c55ed33fa 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1114,8 +1114,7 @@ static inline void hid_hw_wait(struct hid_device *hdev) */ static inline u32 hid_report_len(struct hid_report *report) { - /* equivalent to DIV_ROUND_UP(report->size, 8) + !!(report->id > 0) */ - return ((report->size - 1) >> 3) + 1 + (report->id > 0); + return DIV_ROUND_UP(report->size, 8) + (report->id > 0); } int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, u32 size, From 9b550cc32a85404f67f51ce3c47d5d8f80a5a9ca Mon Sep 17 00:00:00 2001 From: Yongqiang Liu Date: Thu, 1 Apr 2021 13:15:33 +0000 Subject: [PATCH 007/116] ARM: OMAP2+: Fix build warning when mmc_omap is not built [ Upstream commit 040ab72ee10ea88e1883ad143b3e2b77596abc31 ] GCC reports the following warning with W=1: arch/arm/mach-omap2/board-n8x0.c:325:19: warning: variable 'index' set but not used [-Wunused-but-set-variable] 325 | int bit, *openp, index; | ^~~~~ Fix this by moving CONFIG_MMC_OMAP to cover the rest codes in the n8x0_mmc_callback(). Signed-off-by: Yongqiang Liu Signed-off-by: Tony Lindgren Signed-off-by: Sasha Levin --- arch/arm/mach-omap2/board-n8x0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-omap2/board-n8x0.c b/arch/arm/mach-omap2/board-n8x0.c index 20f25539d572..47abea1475d4 100644 --- a/arch/arm/mach-omap2/board-n8x0.c +++ b/arch/arm/mach-omap2/board-n8x0.c @@ -325,6 +325,7 @@ static int n8x0_mmc_get_cover_state(struct device *dev, int slot) static void n8x0_mmc_callback(void *data, u8 card_mask) { +#ifdef CONFIG_MMC_OMAP int bit, *openp, index; if (board_is_n800()) { @@ -342,7 +343,6 @@ static void n8x0_mmc_callback(void *data, u8 card_mask) else *openp = 0; -#ifdef CONFIG_MMC_OMAP omap_mmc_notify_cover_event(mmc_device, index, *openp); #else pr_warn("MMC: notify cover event not available\n"); From b5496f3da6b5b6c91e6990ebbb243415e95f7f83 Mon Sep 17 00:00:00 2001 From: Bixuan Cui Date: Sat, 8 May 2021 11:14:48 +0800 Subject: [PATCH 008/116] HID: gt683r: add missing MODULE_DEVICE_TABLE [ Upstream commit a4b494099ad657f1cb85436d333cf38870ee95bc ] This patch adds missing MODULE_DEVICE_TABLE definition which generates correct modalias for automatic loading of this driver when it is built as an external module. Reported-by: Hulk Robot Signed-off-by: Bixuan Cui Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin --- drivers/hid/hid-gt683r.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/hid-gt683r.c b/drivers/hid/hid-gt683r.c index a298fbd8db6b..8ca4c1baeda8 100644 --- a/drivers/hid/hid-gt683r.c +++ b/drivers/hid/hid-gt683r.c @@ -64,6 +64,7 @@ static const struct hid_device_id gt683r_led_id[] = { { HID_USB_DEVICE(USB_VENDOR_ID_MSI, USB_DEVICE_ID_MSI_GT683R_LED_PANEL) }, { } }; +MODULE_DEVICE_TABLE(hid, gt683r_led_id); static void gt683r_brightness_set(struct led_classdev *led_cdev, enum led_brightness brightness) From 0364742decb0f02bc183404868b82896f7992595 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 18 May 2021 16:46:25 +0800 Subject: [PATCH 009/116] gfs2: Fix use-after-free in gfs2_glock_shrink_scan [ Upstream commit 1ab19c5de4c537ec0d9b21020395a5b5a6c059b2 ] The GLF_LRU flag is checked under lru_lock in gfs2_glock_remove_from_lru() to remove the glock from the lru list in __gfs2_glock_put(). On the shrink scan path, the same flag is cleared under lru_lock but because of cond_resched_lock(&lru_lock) in gfs2_dispose_glock_lru(), progress on the put side can be made without deleting the glock from the lru list. Keep GLF_LRU across the race window opened by cond_resched_lock(&lru_lock) to ensure correct behavior on both sides - clear GLF_LRU after list_del under lru_lock. Reported-by: syzbot Signed-off-by: Hillf Danton Signed-off-by: Andreas Gruenbacher Signed-off-by: Sasha Levin --- fs/gfs2/glock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 0a0dd3178483..be969f24ccf0 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1456,6 +1456,7 @@ __acquires(&lru_lock) while(!list_empty(list)) { gl = list_entry(list->next, struct gfs2_glock, gl_lru); list_del_init(&gl->gl_lru); + clear_bit(GLF_LRU, &gl->gl_flags); if (!spin_trylock(&gl->gl_lockref.lock)) { add_back_to_lru: list_add(&gl->gl_lru, &lru_list); @@ -1501,7 +1502,6 @@ static long gfs2_scan_glock_lru(int nr) if (!test_bit(GLF_LOCK, &gl->gl_flags)) { list_move(&gl->gl_lru, &dispose); atomic_dec(&lru_count); - clear_bit(GLF_LRU, &gl->gl_flags); freed++; continue; } From 376a4d9cbdb6f666083a6c37eaf63b50ad18704c Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Mon, 31 May 2021 14:13:26 +0200 Subject: [PATCH 010/116] scsi: target: core: Fix warning on realtime kernels [ Upstream commit 515da6f4295c2c42b8c54572cce3d2dd1167c41e ] On realtime kernels, spin_lock_irq*(spinlock_t) do not disable the interrupts, a call to irqs_disabled() will return false thus firing a warning in __transport_wait_for_tasks(). Remove the warning and also replace assert_spin_locked() with lockdep_assert_held() Link: https://lore.kernel.org/r/20210531121326.3649-1-mlombard@redhat.com Reviewed-by: Bart Van Assche Signed-off-by: Maurizio Lombardi Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/target/target_core_transport.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 0d0be7d8b9d6..852680e85921 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -2966,9 +2966,7 @@ __transport_wait_for_tasks(struct se_cmd *cmd, bool fabric_stop, __releases(&cmd->t_state_lock) __acquires(&cmd->t_state_lock) { - - assert_spin_locked(&cmd->t_state_lock); - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_held(&cmd->t_state_lock); if (fabric_stop) cmd->transport_state |= CMD_T_FABRIC_STOP; From 46ce76b0bc3949b39e45eb19733712b946e5e3f3 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 1 Jun 2021 19:04:51 +0800 Subject: [PATCH 011/116] ethernet: myri10ge: Fix missing error code in myri10ge_probe() [ Upstream commit f336d0b93ae978f12c5e27199f828da89b91e56a ] The error code is missing in this code scenario, add the error code '-EINVAL' to the return value 'status'. Eliminate the follow smatch warning: drivers/net/ethernet/myricom/myri10ge/myri10ge.c:3818 myri10ge_probe() warn: missing error code 'status'. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/myricom/myri10ge/myri10ge.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index a0a555052d8c..1ac2bc75edb1 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -3853,6 +3853,7 @@ static int myri10ge_probe(struct pci_dev *pdev, const struct pci_device_id *ent) dev_err(&pdev->dev, "invalid sram_size %dB or board span %ldB\n", mgp->sram_size, mgp->board_span); + status = -EINVAL; goto abort_with_ioremap; } memcpy_fromio(mgp->eeprom_strings, From 41e4946a5b4405b264243d2119a0aee313d0c1af Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 26 May 2021 17:23:15 +0200 Subject: [PATCH 012/116] nvme-loop: reset queue count to 1 in nvme_loop_destroy_io_queues() [ Upstream commit a6c144f3d2e230f2b3ac5ed8c51e0f0391556197 ] The queue count is increased in nvme_loop_init_io_queues(), so we need to reset it to 1 at the end of nvme_loop_destroy_io_queues(). Otherwise the function is not re-entrant safe, and crash will happen during concurrent reset and remove calls. Signed-off-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/target/loop.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 3388d2788fe0..5f33c3a9469b 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -322,6 +322,7 @@ static void nvme_loop_destroy_io_queues(struct nvme_loop_ctrl *ctrl) clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags); nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); } + ctrl->ctrl.queue_count = 1; } static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl) From 089970144e0390bd3a246b5305966045bca14891 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 26 May 2021 17:23:16 +0200 Subject: [PATCH 013/116] nvme-loop: clear NVME_LOOP_Q_LIVE when nvme_loop_configure_admin_queue() fails [ Upstream commit 1c5f8e882a05de5c011e8c3fbeceb0d1c590eb53 ] When the call to nvme_enable_ctrl() in nvme_loop_configure_admin_queue() fails the NVME_LOOP_Q_LIVE flag is not cleared. Signed-off-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/target/loop.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 5f33c3a9469b..963d8de932d1 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -430,6 +430,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) return 0; out_cleanup_queue: + clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); blk_cleanup_queue(ctrl->ctrl.admin_q); out_free_tagset: blk_mq_free_tag_set(&ctrl->admin_tag_set); From 4b2e0ac757bc0deb1f1115ea84ebb6928bbd9c55 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 26 May 2021 17:23:17 +0200 Subject: [PATCH 014/116] nvme-loop: check for NVME_LOOP_Q_LIVE in nvme_loop_destroy_admin_queue() [ Upstream commit 4237de2f73a669e4f89ac0aa2b44fb1a1d9ec583 ] We need to check the NVME_LOOP_Q_LIVE flag in nvme_loop_destroy_admin_queue() to protect against duplicate invocations eg during concurrent reset and remove calls. Signed-off-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/target/loop.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 963d8de932d1..7a0a10777cd1 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -287,7 +287,8 @@ static const struct blk_mq_ops nvme_loop_admin_mq_ops = { static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl) { - clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); + if (!test_and_clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags)) + return; nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); blk_cleanup_queue(ctrl->ctrl.admin_q); blk_mq_free_tag_set(&ctrl->admin_tag_set); From c97aafab4bfab12493b3eed6a3e0c74519bdd21b Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Tue, 1 Jun 2021 18:38:41 -0700 Subject: [PATCH 015/116] net: ipconfig: Don't override command-line hostnames or domains [ Upstream commit b508d5fb69c2211a1b860fc058aafbefc3b3c3cd ] If the user specifies a hostname or domain name as part of the ip= command-line option, preserve it and don't overwrite it with one supplied by DHCP/BOOTP. For instance, ip=::::myhostname::dhcp will use "myhostname" rather than ignoring and overwriting it. Fix the comment on ic_bootp_string that suggests it only copies a string "if not already set"; it doesn't have any such logic. Signed-off-by: Josh Triplett Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv4/ipconfig.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index f0782c91514c..41e384834d50 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -881,7 +881,7 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d /* - * Copy BOOTP-supplied string if not already set. + * Copy BOOTP-supplied string */ static int __init ic_bootp_string(char *dest, char *src, int len, int max) { @@ -930,12 +930,15 @@ static void __init ic_do_bootp_ext(u8 *ext) } break; case 12: /* Host name */ - ic_bootp_string(utsname()->nodename, ext+1, *ext, - __NEW_UTS_LEN); - ic_host_name_set = 1; + if (!ic_host_name_set) { + ic_bootp_string(utsname()->nodename, ext+1, *ext, + __NEW_UTS_LEN); + ic_host_name_set = 1; + } break; case 15: /* Domain name (DNS) */ - ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain)); + if (!ic_domain[0]) + ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain)); break; case 17: /* Root path */ if (!root_server_path[0]) From 908e53138e71d1719416565d555684d387baa096 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 2 Jun 2021 18:15:04 +0800 Subject: [PATCH 016/116] rtnetlink: Fix missing error code in rtnl_bridge_notify() [ Upstream commit a8db57c1d285c758adc7fb43d6e2bad2554106e1 ] The error code is missing in this code scenario, add the error code '-EINVAL' to the return value 'err'. Eliminate the follow smatch warning: net/core/rtnetlink.c:4834 rtnl_bridge_notify() warn: missing error code 'err'. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/core/rtnetlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0168c700a201..fa3ed51f846b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3648,8 +3648,10 @@ static int rtnl_bridge_notify(struct net_device *dev) if (err < 0) goto errout; - if (!skb->len) + if (!skb->len) { + err = -EINVAL; goto errout; + } rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); return 0; From d0a1561cbdc3a66dd7460bae19a13efd1291cea0 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Wed, 2 Jun 2021 22:06:30 +0800 Subject: [PATCH 017/116] net/x25: Return the correct errno code [ Upstream commit d7736958668c4facc15f421e622ffd718f5be80a ] When kalloc or kmemdup failed, should return ENOMEM rather than ENOBUF. Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/x25/af_x25.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 987e5f8cafbe..fd0a6c6c77b6 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -550,7 +550,7 @@ static int x25_create(struct net *net, struct socket *sock, int protocol, if (protocol) goto out; - rc = -ENOBUFS; + rc = -ENOMEM; if ((sk = x25_alloc_socket(net, kern)) == NULL) goto out; From 666273bc36b98a28e577527018f2dd38cc700fef Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Wed, 2 Jun 2021 22:06:40 +0800 Subject: [PATCH 018/116] net: Return the correct errno code [ Upstream commit 49251cd00228a3c983651f6bb2f33f6a0b8f152e ] When kalloc or kmemdup failed, should return ENOMEM rather than ENOBUF. Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/compat.c b/net/compat.c index 45349658ed01..2ec822f4e409 100644 --- a/net/compat.c +++ b/net/compat.c @@ -158,7 +158,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, if (kcmlen > stackbuf_size) kcmsg_base = kcmsg = sock_kmalloc(sk, kcmlen, GFP_KERNEL); if (kcmsg == NULL) - return -ENOBUFS; + return -ENOMEM; /* Now copy them over neatly. */ memset(kcmsg, 0, kcmlen); From 337b6a80b996a5acd49b9f77d75d0e41469b42af Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Wed, 2 Jun 2021 22:06:58 +0800 Subject: [PATCH 019/116] fib: Return the correct errno code [ Upstream commit 59607863c54e9eb3f69afc5257dfe71c38bb751e ] When kalloc or kmemdup failed, should return ENOMEM rather than ENOBUF. Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/core/fib_rules.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 9bb321df0869..76c3f602ee15 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -928,7 +928,7 @@ static void notify_rule_change(int event, struct fib_rule *rule, { struct net *net; struct sk_buff *skb; - int err = -ENOBUFS; + int err = -ENOMEM; net = ops->fro_net; skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL); From e5747870e3b349f0d4654871bd2620f65d6d5b47 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 21 May 2021 19:13:10 -0700 Subject: [PATCH 020/116] dmaengine: ALTERA_MSGDMA depends on HAS_IOMEM [ Upstream commit 253697b93c2a1c237d34d3ae326e394aeb0ca7b3 ] When CONFIG_HAS_IOMEM is not set/enabled, certain iomap() family functions [including ioremap(), devm_ioremap(), etc.] are not available. Drivers that use these functions should depend on HAS_IOMEM so that they do not cause build errors. Repairs this build error: s390-linux-ld: drivers/dma/altera-msgdma.o: in function `request_and_map': altera-msgdma.c:(.text+0x14b0): undefined reference to `devm_ioremap' Fixes: a85c6f1b2921 ("dmaengine: Add driver for Altera / Intel mSGDMA IP core") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Stefan Roese Cc: Vinod Koul Cc: dmaengine@vger.kernel.org Reviewed-by: Stefan Roese Phone: (+49)-8142-66989-51 Fax: (+49)-8142-66989-80 Email: sr@denx.de Link: https://lore.kernel.org/r/20210522021313.16405-2-rdunlap@infradead.org Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin --- drivers/dma/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 79b809dbfda0..ff69feefc1c6 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -58,6 +58,7 @@ config DMA_OF #devices config ALTERA_MSGDMA tristate "Altera / Intel mSGDMA Engine" + depends on HAS_IOMEM select DMA_ENGINE help Enable support for Altera / Intel mSGDMA controller. From bba419257210b08a68d7389ef01fbf3f4f91e496 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 21 May 2021 19:13:11 -0700 Subject: [PATCH 021/116] dmaengine: QCOM_HIDMA_MGMT depends on HAS_IOMEM [ Upstream commit 0cfbb589d67f16fa55b26ae02b69c31b52e344b1 ] When CONFIG_HAS_IOMEM is not set/enabled, certain iomap() family functions [including ioremap(), devm_ioremap(), etc.] are not available. Drivers that use these functions should depend on HAS_IOMEM so that they do not cause build errors. Rectifies these build errors: s390-linux-ld: drivers/dma/qcom/hidma_mgmt.o: in function `hidma_mgmt_probe': hidma_mgmt.c:(.text+0x780): undefined reference to `devm_ioremap_resource' s390-linux-ld: drivers/dma/qcom/hidma_mgmt.o: in function `hidma_mgmt_init': hidma_mgmt.c:(.init.text+0x126): undefined reference to `of_address_to_resource' s390-linux-ld: hidma_mgmt.c:(.init.text+0x16e): undefined reference to `of_address_to_resource' Fixes: 67a2003e0607 ("dmaengine: add Qualcomm Technologies HIDMA channel driver") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Sinan Kaya Cc: Vinod Koul Cc: dmaengine@vger.kernel.org Link: https://lore.kernel.org/r/20210522021313.16405-3-rdunlap@infradead.org Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin --- drivers/dma/qcom/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/qcom/Kconfig b/drivers/dma/qcom/Kconfig index a7761c4025f4..a97c7123d913 100644 --- a/drivers/dma/qcom/Kconfig +++ b/drivers/dma/qcom/Kconfig @@ -9,6 +9,7 @@ config QCOM_BAM_DMA config QCOM_HIDMA_MGMT tristate "Qualcomm Technologies HIDMA Management support" + depends on HAS_IOMEM select DMA_ENGINE help Enable support for the Qualcomm Technologies HIDMA Management. From 2abf4d090d7132243a3c3205ab79175c32673874 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 18 May 2021 22:11:08 +0800 Subject: [PATCH 022/116] dmaengine: stedma40: add missing iounmap() on error in d40_probe() [ Upstream commit fffdaba402cea79b8d219355487d342ec23f91c6 ] Add the missing iounmap() before return from d40_probe() in the error handling case. Fixes: 8d318a50b3d7 ("DMAENGINE: Support for ST-Ericssons DMA40 block v3") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20210518141108.1324127-1-yangyingliang@huawei.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin --- drivers/dma/ste_dma40.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma/ste_dma40.c b/drivers/dma/ste_dma40.c index 90feb6a05e59..ee15d4fefbad 100644 --- a/drivers/dma/ste_dma40.c +++ b/drivers/dma/ste_dma40.c @@ -3656,6 +3656,9 @@ static int __init d40_probe(struct platform_device *pdev) kfree(base->lcla_pool.base_unaligned); + if (base->lcpa_base) + iounmap(base->lcpa_base); + if (base->phy_lcpa) release_mem_region(base->phy_lcpa, base->lcpa_size); From d05267fd27a5c4f54e06daefa3035995d765ca0c Mon Sep 17 00:00:00 2001 From: yangerkun Date: Tue, 15 Jun 2021 18:23:32 -0700 Subject: [PATCH 023/116] mm/memory-failure: make sure wait for page writeback in memory_failure [ Upstream commit e8675d291ac007e1c636870db880f837a9ea112a ] Our syzkaller trigger the "BUG_ON(!list_empty(&inode->i_wb_list))" in clear_inode: kernel BUG at fs/inode.c:519! Internal error: Oops - BUG: 0 [#1] SMP Modules linked in: Process syz-executor.0 (pid: 249, stack limit = 0x00000000a12409d7) CPU: 1 PID: 249 Comm: syz-executor.0 Not tainted 4.19.95 Hardware name: linux,dummy-virt (DT) pstate: 80000005 (Nzcv daif -PAN -UAO) pc : clear_inode+0x280/0x2a8 lr : clear_inode+0x280/0x2a8 Call trace: clear_inode+0x280/0x2a8 ext4_clear_inode+0x38/0xe8 ext4_free_inode+0x130/0xc68 ext4_evict_inode+0xb20/0xcb8 evict+0x1a8/0x3c0 iput+0x344/0x460 do_unlinkat+0x260/0x410 __arm64_sys_unlinkat+0x6c/0xc0 el0_svc_common+0xdc/0x3b0 el0_svc_handler+0xf8/0x160 el0_svc+0x10/0x218 Kernel panic - not syncing: Fatal exception A crash dump of this problem show that someone called __munlock_pagevec to clear page LRU without lock_page: do_mmap -> mmap_region -> do_munmap -> munlock_vma_pages_range -> __munlock_pagevec. As a result memory_failure will call identify_page_state without wait_on_page_writeback. And after truncate_error_page clear the mapping of this page. end_page_writeback won't call sb_clear_inode_writeback to clear inode->i_wb_list. That will trigger BUG_ON in clear_inode! Fix it by checking PageWriteback too to help determine should we skip wait_on_page_writeback. Link: https://lkml.kernel.org/r/20210604084705.3729204-1-yangerkun@huawei.com Fixes: 0bc1f8b0682c ("hwpoison: fix the handling path of the victimized page frame that belong to non-LRU") Signed-off-by: yangerkun Acked-by: Naoya Horiguchi Cc: Jan Kara Cc: Theodore Ts'o Cc: Oscar Salvador Cc: Yu Kuai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- mm/memory-failure.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 001b6bfccbfb..e7827b9e6397 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1267,7 +1267,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } - if (!PageTransTail(p) && !PageLRU(p)) + /* + * __munlock_pagevec may clear a writeback page's LRU flag without + * page_lock. We need wait writeback completion for this page or it + * may trigger vfs BUG while evict inode. + */ + if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p)) goto identify_page_state; /* From 77a99aad5bc3ea105806ebae6be3cbadc2fc615e Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Tue, 18 May 2021 21:00:27 +0200 Subject: [PATCH 024/116] batman-adv: Avoid WARN_ON timing related checks [ Upstream commit 9f460ae31c4435fd022c443a6029352217a16ac1 ] The soft/batadv interface for a queued OGM can be changed during the time the OGM was queued for transmission and when the OGM is actually transmitted by the worker. But WARN_ON must be used to denote kernel bugs and not to print simple warnings. A warning can simply be printed using pr_warn. Reported-by: Tetsuo Handa Reported-by: syzbot+c0b807de416427ff3dd1@syzkaller.appspotmail.com Fixes: ef0a937f7a14 ("batman-adv: consider outgoing interface in OGM sending") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich Signed-off-by: Sasha Levin --- net/batman-adv/bat_iv_ogm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 7a723e124dbb..3ec16c48e768 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -585,8 +585,10 @@ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) if (WARN_ON(!forw_packet->if_outgoing)) return; - if (WARN_ON(forw_packet->if_outgoing->soft_iface != soft_iface)) + if (forw_packet->if_outgoing->soft_iface != soft_iface) { + pr_warn("%s: soft interface switch for queued OGM\n", __func__); return; + } if (forw_packet->if_incoming->if_status != BATADV_IF_ACTIVE) return; From 6dcea66d3bb519b426282588f38e884e07893c1f Mon Sep 17 00:00:00 2001 From: Nanyong Sun Date: Tue, 8 Jun 2021 09:51:58 +0800 Subject: [PATCH 025/116] net: ipv4: fix memory leak in netlbl_cipsov4_add_std [ Upstream commit d612c3f3fae221e7ea736d196581c2217304bbbc ] Reported by syzkaller: BUG: memory leak unreferenced object 0xffff888105df7000 (size 64): comm "syz-executor842", pid 360, jiffies 4294824824 (age 22.546s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000e67ed558>] kmalloc include/linux/slab.h:590 [inline] [<00000000e67ed558>] kzalloc include/linux/slab.h:720 [inline] [<00000000e67ed558>] netlbl_cipsov4_add_std net/netlabel/netlabel_cipso_v4.c:145 [inline] [<00000000e67ed558>] netlbl_cipsov4_add+0x390/0x2340 net/netlabel/netlabel_cipso_v4.c:416 [<0000000006040154>] genl_family_rcv_msg_doit.isra.0+0x20e/0x320 net/netlink/genetlink.c:739 [<00000000204d7a1c>] genl_family_rcv_msg net/netlink/genetlink.c:783 [inline] [<00000000204d7a1c>] genl_rcv_msg+0x2bf/0x4f0 net/netlink/genetlink.c:800 [<00000000c0d6a995>] netlink_rcv_skb+0x134/0x3d0 net/netlink/af_netlink.c:2504 [<00000000d78b9d2c>] genl_rcv+0x24/0x40 net/netlink/genetlink.c:811 [<000000009733081b>] netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] [<000000009733081b>] netlink_unicast+0x4a0/0x6a0 net/netlink/af_netlink.c:1340 [<00000000d5fd43b8>] netlink_sendmsg+0x789/0xc70 net/netlink/af_netlink.c:1929 [<000000000a2d1e40>] sock_sendmsg_nosec net/socket.c:654 [inline] [<000000000a2d1e40>] sock_sendmsg+0x139/0x170 net/socket.c:674 [<00000000321d1969>] ____sys_sendmsg+0x658/0x7d0 net/socket.c:2350 [<00000000964e16bc>] ___sys_sendmsg+0xf8/0x170 net/socket.c:2404 [<000000001615e288>] __sys_sendmsg+0xd3/0x190 net/socket.c:2433 [<000000004ee8b6a5>] do_syscall_64+0x37/0x90 arch/x86/entry/common.c:47 [<00000000171c7cee>] entry_SYSCALL_64_after_hwframe+0x44/0xae The memory of doi_def->map.std pointing is allocated in netlbl_cipsov4_add_std, but no place has freed it. It should be freed in cipso_v4_doi_free which frees the cipso DOI resource. Fixes: 96cb8e3313c7a ("[NetLabel]: CIPSOv4 and Unlabeled packet integration") Reported-by: Hulk Robot Signed-off-by: Nanyong Sun Acked-by: Paul Moore Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv4/cipso_ipv4.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 6a1b52b34e20..e8b8dd1cb157 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -486,6 +486,7 @@ void cipso_v4_doi_free(struct cipso_v4_doi *doi_def) kfree(doi_def->map.std->lvl.local); kfree(doi_def->map.std->cat.cipso); kfree(doi_def->map.std->cat.local); + kfree(doi_def->map.std); break; } kfree(doi_def); From 1f79bc8ae81c05eb112a53f981cb2c244ee50d02 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Tue, 8 Jun 2021 11:06:41 +0300 Subject: [PATCH 026/116] net: rds: fix memory leak in rds_recvmsg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 49bfcbfd989a8f1f23e705759a6bb099de2cff9f ] Syzbot reported memory leak in rds. The problem was in unputted refcount in case of error. int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int msg_flags) { ... if (!rds_next_incoming(rs, &inc)) { ... } After this "if" inc refcount incremented and if (rds_cmsg_recv(inc, msg, rs)) { ret = -EFAULT; goto out; } ... out: return ret; } in case of rds_cmsg_recv() fail the refcount won't be decremented. And it's easy to see from ftrace log, that rds_inc_addref() don't have rds_inc_put() pair in rds_recvmsg() after rds_cmsg_recv() 1) | rds_recvmsg() { 1) 3.721 us | rds_inc_addref(); 1) 3.853 us | rds_message_inc_copy_to_user(); 1) + 10.395 us | rds_cmsg_recv(); 1) + 34.260 us | } Fixes: bdbe6fbc6a2f ("RDS: recv.c") Reported-and-tested-by: syzbot+5134cdf021c4ed5aaa5f@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Reviewed-by: Håkon Bugge Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/rds/recv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/recv.c b/net/rds/recv.c index ef022d24f87a..a1b2bdab6655 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -663,7 +663,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (rds_cmsg_recv(inc, msg, rs)) { ret = -EFAULT; - goto out; + break; } rds_stats_inc(s_recv_delivered); From a0882f68f54f7a8b6308261acee9bd4faab5a69e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Jun 2021 11:49:01 +0200 Subject: [PATCH 027/116] udp: fix race between close() and udp_abort() [ Upstream commit a8b897c7bcd47f4147d066e22cc01d1026d7640e ] Kaustubh reported and diagnosed a panic in udp_lib_lookup(). The root cause is udp_abort() racing with close(). Both racing functions acquire the socket lock, but udp{v6}_destroy_sock() release it before performing destructive actions. We can't easily extend the socket lock scope to avoid the race, instead use the SOCK_DEAD flag to prevent udp_abort from doing any action when the critical race happens. Diagnosed-and-tested-by: Kaustubh Pandey Fixes: 5d77dca82839 ("net: diag: support SOCK_DESTROY for UDP sockets") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv4/udp.c | 10 ++++++++++ net/ipv6/udp.c | 3 +++ 2 files changed, 13 insertions(+) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8f298f27f6ec..cf5c4d2f68c1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2337,6 +2337,9 @@ void udp_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); bool slow = lock_sock_fast(sk); + + /* protects from races with udp_abort() */ + sock_set_flag(sk, SOCK_DEAD); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); if (static_key_false(&udp_encap_needed) && up->encap_type) { @@ -2570,10 +2573,17 @@ int udp_abort(struct sock *sk, int err) { lock_sock(sk); + /* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing + * with close() + */ + if (sock_flag(sk, SOCK_DEAD)) + goto out; + sk->sk_err = err; sk->sk_error_report(sk); __udp_disconnect(sk, 0); +out: release_sock(sk); return 0; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 38ad3fac8c37..d9d25b9c07ae 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1434,6 +1434,9 @@ void udpv6_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); lock_sock(sk); + + /* protects from races with udp_abort() */ + sock_set_flag(sk, SOCK_DEAD); udp_v6_flush_pending_frames(sk); release_sock(sk); From 88cfd542f4390f992f47ab876accb2ab789e4056 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 9 Jun 2021 14:17:53 +0300 Subject: [PATCH 028/116] rtnetlink: Fix regression in bridge VLAN configuration [ Upstream commit d2e381c4963663bca6f30c3b996fa4dbafe8fcb5 ] Cited commit started returning errors when notification info is not filled by the bridge driver, resulting in the following regression: # ip link add name br1 type bridge vlan_filtering 1 # bridge vlan add dev br1 vid 555 self pvid untagged RTNETLINK answers: Invalid argument As long as the bridge driver does not fill notification info for the bridge device itself, an empty notification should not be considered as an error. This is explained in commit 59ccaaaa49b5 ("bridge: dont send notification when skb->len == 0 in rtnl_bridge_notify"). Fix by removing the error and add a comment to avoid future bugs. Fixes: a8db57c1d285 ("rtnetlink: Fix missing error code in rtnl_bridge_notify()") Signed-off-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/core/rtnetlink.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index fa3ed51f846b..3bcaecc7ba69 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3648,10 +3648,12 @@ static int rtnl_bridge_notify(struct net_device *dev) if (err < 0) goto errout; - if (!skb->len) { - err = -EINVAL; + /* Notification info is only filled for bridge ports, not the bridge + * device itself. Therefore, a zero notification length is valid and + * should not result in an error. + */ + if (!skb->len) goto errout; - } rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); return 0; From 674b5f0c6a4fc5d3abce877048290cea6091fcb1 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 10 Jun 2021 19:40:29 +0300 Subject: [PATCH 029/116] netfilter: synproxy: Fix out of bounds when parsing TCP options [ Upstream commit 5fc177ab759418c9537433e63301096e733fb915 ] The TCP option parser in synproxy (synproxy_parse_options) could read one byte out of bounds. When the length is 1, the execution flow gets into the loop, reads one byte of the opcode, and if the opcode is neither TCPOPT_EOL nor TCPOPT_NOP, it reads one more byte, which exceeds the length of 1. This fix is inspired by commit 9609dad263f8 ("ipv4: tcp_input: fix stack out of bounds when parsing TCP options."). v2 changes: Added an early return when length < 0 to avoid calling skb_header_pointer with negative length. Cc: Young Xiao <92siuyang@gmail.com> Fixes: 48b1de4c110a ("netfilter: add SYNPROXY core/target") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Florian Westphal Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/netfilter/nf_synproxy_core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 49bd8bb16b18..9ff26eb0309a 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -34,6 +34,9 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, int length = (th->doff * 4) - sizeof(*th); u8 buf[40], *ptr; + if (unlikely(length < 0)) + return false; + ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf); if (ptr == NULL) return false; @@ -50,6 +53,8 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, length--; continue; default: + if (length < 2) + return true; opsize = *ptr++; if (opsize < 2) return true; From 7fdb2534c6c8e7bba7e31062a8fcea12e5cef4ba Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 11 Jun 2021 08:13:39 +0200 Subject: [PATCH 030/116] alx: Fix an error handling path in 'alx_probe()' [ Upstream commit 33e381448cf7a05d76ac0b47d4a6531ecd0e5c53 ] If an error occurs after a 'pci_enable_pcie_error_reporting()' call, it must be undone by a corresponding 'pci_disable_pcie_error_reporting()' call, as already done in the remove function. Fixes: ab69bde6b2e9 ("alx: add a simple AR816x/AR817x device driver") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/atheros/alx/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c index ce2e64410823..bb0e217460c9 100644 --- a/drivers/net/ethernet/atheros/alx/main.c +++ b/drivers/net/ethernet/atheros/alx/main.c @@ -1857,6 +1857,7 @@ static int alx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) free_netdev(netdev); out_pci_release: pci_release_mem_regions(pdev); + pci_disable_pcie_error_reporting(pdev); out_pci_disable: pci_disable_device(pdev); return err; From de513ff9bd4b3d9d4e89c2e43fca682a2c1e8286 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 11 Jun 2021 15:16:11 +0800 Subject: [PATCH 031/116] net: stmmac: dwmac1000: Fix extended MAC address registers definition [ Upstream commit 1adb20f0d496b2c61e9aa1f4761b8d71f93d258e ] The register starts from 0x800 is the 16th MAC address register rather than the first one. Fixes: cffb13f4d6fb ("stmmac: extend mac addr reg and fix perfect filering") Signed-off-by: Jisheng Zhang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/stmicro/stmmac/dwmac1000.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h index c02d36629c52..6f7ed3aaff1b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h @@ -87,10 +87,10 @@ enum power_event { #define LPI_CTRL_STATUS_TLPIEN 0x00000001 /* Transmit LPI Entry */ /* GMAC HW ADDR regs */ -#define GMAC_ADDR_HIGH(reg) (((reg > 15) ? 0x00000800 : 0x00000040) + \ - (reg * 8)) -#define GMAC_ADDR_LOW(reg) (((reg > 15) ? 0x00000804 : 0x00000044) + \ - (reg * 8)) +#define GMAC_ADDR_HIGH(reg) ((reg > 15) ? 0x00000800 + (reg - 16) * 8 : \ + 0x00000040 + (reg * 8)) +#define GMAC_ADDR_LOW(reg) ((reg > 15) ? 0x00000804 + (reg - 16) * 8 : \ + 0x00000044 + (reg * 8)) #define GMAC_MAX_PERFECT_ADDRESSES 1 #define GMAC_PCS_BASE 0x000000c0 /* PCS register base */ From 8ace33c0f12d2506fd141ab72b60c1a876372184 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 12 Jun 2021 14:37:46 +0200 Subject: [PATCH 032/116] qlcnic: Fix an error handling path in 'qlcnic_probe()' [ Upstream commit cb3376604a676e0302258b01893911bdd7aa5278 ] If an error occurs after a 'pci_enable_pcie_error_reporting()' call, it must be undone by a corresponding 'pci_disable_pcie_error_reporting()' call, as already done in the remove function. Fixes: 451724c821c1 ("qlcnic: aer support") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index 6684a4cb8b88..45361310eea0 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -2711,6 +2711,7 @@ qlcnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) kfree(ahw); err_out_free_res: + pci_disable_pcie_error_reporting(pdev); pci_release_regions(pdev); err_out_disable_pdev: From 3215ac6dafab9d1270761fb400453e4a33342c9c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 12 Jun 2021 14:53:12 +0200 Subject: [PATCH 033/116] netxen_nic: Fix an error handling path in 'netxen_nic_probe()' [ Upstream commit 49a10c7b176295f8fafb338911cf028e97f65f4d ] If an error occurs after a 'pci_enable_pcie_error_reporting()' call, it must be undone by a corresponding 'pci_disable_pcie_error_reporting()' call, as already done in the remove function. Fixes: e87ad5539343 ("netxen: support pci error handlers") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c index 8f8a1894378e..5b91c8f823ff 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c @@ -1624,6 +1624,8 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) free_netdev(netdev); err_out_free_res: + if (NX_IS_REVISION_P3(pdev->revision)) + pci_disable_pcie_error_reporting(pdev); pci_release_regions(pdev); err_out_disable_pdev: From bc0ed22faee92abb80b3cd838d65577e1464da0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 15 Jun 2021 01:05:49 -0700 Subject: [PATCH 034/116] net: cdc_ncm: switch to eth%d interface naming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit c1a3d4067309451e68c33dbd356032549cc0bd8e ] This is meant to make the host side cdc_ncm interface consistently named just like the older CDC protocols: cdc_ether & cdc_ecm (and even rndis_host), which all use 'FLAG_ETHER | FLAG_POINTTOPOINT'. include/linux/usb/usbnet.h: #define FLAG_ETHER 0x0020 /* maybe use "eth%d" names */ #define FLAG_WLAN 0x0080 /* use "wlan%d" names */ #define FLAG_WWAN 0x0400 /* use "wwan%d" names */ #define FLAG_POINTTOPOINT 0x1000 /* possibly use "usb%d" names */ drivers/net/usb/usbnet.c @ line 1711: strcpy (net->name, "usb%d"); ... // heuristic: "usb%d" for links we know are two-host, // else "eth%d" when there's reasonable doubt. userspace // can rename the link if it knows better. if ((dev->driver_info->flags & FLAG_ETHER) != 0 && ((dev->driver_info->flags & FLAG_POINTTOPOINT) == 0 || (net->dev_addr [0] & 0x02) == 0)) strcpy (net->name, "eth%d"); /* WLAN devices should always be named "wlan%d" */ if ((dev->driver_info->flags & FLAG_WLAN) != 0) strcpy(net->name, "wlan%d"); /* WWAN devices should always be named "wwan%d" */ if ((dev->driver_info->flags & FLAG_WWAN) != 0) strcpy(net->name, "wwan%d"); So by using ETHER | POINTTOPOINT the interface naming is either usb%d or eth%d based on the global uniqueness of the mac address of the device. Without this 2.5gbps ethernet dongles which all seem to use the cdc_ncm driver end up being called usb%d instead of eth%d even though they're definitely not two-host. (All 1gbps & 5gbps ethernet usb dongles I've tested don't hit this problem due to use of different drivers, primarily r8152 and aqc111) Fixes tag is based purely on git blame, and is really just here to make sure this hits LTS branches newer than v4.5. Cc: Lorenzo Colitti Fixes: 4d06dd537f95 ("cdc_ncm: do not call usbnet_link_change from cdc_ncm_bind") Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/usb/cdc_ncm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 468db50eb5e7..5b5156508f7c 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -1667,7 +1667,7 @@ static void cdc_ncm_status(struct usbnet *dev, struct urb *urb) static const struct driver_info cdc_ncm_info = { .description = "CDC NCM", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET - | FLAG_LINK_INTR, + | FLAG_LINK_INTR | FLAG_ETHER, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, From c4e3be2e7742863e454ce31faf8fd0109c00050b Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Wed, 16 Jun 2021 10:48:33 +0800 Subject: [PATCH 035/116] net: usb: fix possible use-after-free in smsc75xx_bind [ Upstream commit 56b786d86694e079d8aad9b314e015cd4ac02a3d ] The commit 46a8b29c6306 ("net: usb: fix memory leak in smsc75xx_bind") fails to clean up the work scheduled in smsc75xx_reset-> smsc75xx_set_multicast, which leads to use-after-free if the work is scheduled to start after the deallocation. In addition, this patch also removes a dangling pointer - dev->data[0]. This patch calls cancel_work_sync to cancel the scheduled work and set the dangling pointer to NULL. Fixes: 46a8b29c6306 ("net: usb: fix memory leak in smsc75xx_bind") Signed-off-by: Dongliang Mu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/usb/smsc75xx.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c index 62f2862c9775..8b9fd4e071f3 100644 --- a/drivers/net/usb/smsc75xx.c +++ b/drivers/net/usb/smsc75xx.c @@ -1495,7 +1495,7 @@ static int smsc75xx_bind(struct usbnet *dev, struct usb_interface *intf) ret = smsc75xx_wait_ready(dev, 0); if (ret < 0) { netdev_warn(dev->net, "device not ready in smsc75xx_bind\n"); - goto err; + goto free_pdata; } smsc75xx_init_mac_address(dev); @@ -1504,7 +1504,7 @@ static int smsc75xx_bind(struct usbnet *dev, struct usb_interface *intf) ret = smsc75xx_reset(dev); if (ret < 0) { netdev_warn(dev->net, "smsc75xx_reset error %d\n", ret); - goto err; + goto cancel_work; } dev->net->netdev_ops = &smsc75xx_netdev_ops; @@ -1515,8 +1515,11 @@ static int smsc75xx_bind(struct usbnet *dev, struct usb_interface *intf) dev->net->max_mtu = MAX_SINGLE_PACKET_SIZE; return 0; -err: +cancel_work: + cancel_work_sync(&pdata->set_multicast); +free_pdata: kfree(pdata); + dev->data[0] = 0; return ret; } @@ -1527,7 +1530,6 @@ static void smsc75xx_unbind(struct usbnet *dev, struct usb_interface *intf) cancel_work_sync(&pdata->set_multicast); netif_dbg(dev, ifdown, dev->net, "free pdata\n"); kfree(pdata); - pdata = NULL; dev->data[0] = 0; } } From 6cff57eea3347f79f1867cc53e1093b6614138d8 Mon Sep 17 00:00:00 2001 From: Chengyang Fan Date: Wed, 16 Jun 2021 17:59:25 +0800 Subject: [PATCH 036/116] net: ipv4: fix memory leak in ip_mc_add1_src [ Upstream commit d8e2973029b8b2ce477b564824431f3385c77083 ] BUG: memory leak unreferenced object 0xffff888101bc4c00 (size 32): comm "syz-executor527", pid 360, jiffies 4294807421 (age 19.329s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 01 00 00 00 00 00 00 00 ac 14 14 bb 00 00 02 00 ................ backtrace: [<00000000f17c5244>] kmalloc include/linux/slab.h:558 [inline] [<00000000f17c5244>] kzalloc include/linux/slab.h:688 [inline] [<00000000f17c5244>] ip_mc_add1_src net/ipv4/igmp.c:1971 [inline] [<00000000f17c5244>] ip_mc_add_src+0x95f/0xdb0 net/ipv4/igmp.c:2095 [<000000001cb99709>] ip_mc_source+0x84c/0xea0 net/ipv4/igmp.c:2416 [<0000000052cf19ed>] do_ip_setsockopt net/ipv4/ip_sockglue.c:1294 [inline] [<0000000052cf19ed>] ip_setsockopt+0x114b/0x30c0 net/ipv4/ip_sockglue.c:1423 [<00000000477edfbc>] raw_setsockopt+0x13d/0x170 net/ipv4/raw.c:857 [<00000000e75ca9bb>] __sys_setsockopt+0x158/0x270 net/socket.c:2117 [<00000000bdb993a8>] __do_sys_setsockopt net/socket.c:2128 [inline] [<00000000bdb993a8>] __se_sys_setsockopt net/socket.c:2125 [inline] [<00000000bdb993a8>] __x64_sys_setsockopt+0xba/0x150 net/socket.c:2125 [<000000006a1ffdbd>] do_syscall_64+0x40/0x80 arch/x86/entry/common.c:47 [<00000000b11467c4>] entry_SYSCALL_64_after_hwframe+0x44/0xae In commit 24803f38a5c0 ("igmp: do not remove igmp souce list info when set link down"), the ip_mc_clear_src() in ip_mc_destroy_dev() was removed, because it was also called in igmpv3_clear_delrec(). Rough callgraph: inetdev_destroy -> ip_mc_destroy_dev -> igmpv3_clear_delrec -> ip_mc_clear_src -> RCU_INIT_POINTER(dev->ip_ptr, NULL) However, ip_mc_clear_src() called in igmpv3_clear_delrec() doesn't release in_dev->mc_list->sources. And RCU_INIT_POINTER() assigns the NULL to dev->ip_ptr. As a result, in_dev cannot be obtained through inetdev_by_index() and then in_dev->mc_list->sources cannot be released by ip_mc_del1_src() in the sock_close. Rough call sequence goes like: sock_close -> __sock_release -> inet_release -> ip_mc_drop_socket -> inetdev_by_index -> ip_mc_leave_src -> ip_mc_del_src -> ip_mc_del1_src So we still need to call ip_mc_clear_src() in ip_mc_destroy_dev() to free in_dev->mc_list->sources. Fixes: 24803f38a5c0 ("igmp: do not remove igmp souce list info ...") Reported-by: Hulk Robot Signed-off-by: Chengyang Fan Acked-by: Hangbin Liu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv4/igmp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index b6f0ee01f2e0..a6b048ff30e6 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1790,6 +1790,7 @@ void ip_mc_destroy_dev(struct in_device *in_dev) while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) { in_dev->mc_list = i->next_rcu; in_dev->mc_count--; + ip_mc_clear_src(i); ip_ma_put(i); } } From 2726f3cf4dd0363253fe873f9f8b7ae3cb09945f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Jun 2021 07:47:15 -0700 Subject: [PATCH 037/116] net/af_unix: fix a data-race in unix_dgram_sendmsg / unix_release_sock [ Upstream commit a494bd642d9120648b06bb7d28ce6d05f55a7819 ] While unix_may_send(sk, osk) is called while osk is locked, it appears unix_release_sock() can overwrite unix_peer() after this lock has been released, making KCSAN unhappy. Changing unix_release_sock() to access/change unix_peer() before lock is released should fix this issue. BUG: KCSAN: data-race in unix_dgram_sendmsg / unix_release_sock write to 0xffff88810465a338 of 8 bytes by task 20852 on cpu 1: unix_release_sock+0x4ed/0x6e0 net/unix/af_unix.c:558 unix_release+0x2f/0x50 net/unix/af_unix.c:859 __sock_release net/socket.c:599 [inline] sock_close+0x6c/0x150 net/socket.c:1258 __fput+0x25b/0x4e0 fs/file_table.c:280 ____fput+0x11/0x20 fs/file_table.c:313 task_work_run+0xae/0x130 kernel/task_work.c:164 tracehook_notify_resume include/linux/tracehook.h:189 [inline] exit_to_user_mode_loop kernel/entry/common.c:175 [inline] exit_to_user_mode_prepare+0x156/0x190 kernel/entry/common.c:209 __syscall_exit_to_user_mode_work kernel/entry/common.c:291 [inline] syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:302 do_syscall_64+0x56/0x90 arch/x86/entry/common.c:57 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff88810465a338 of 8 bytes by task 20888 on cpu 0: unix_may_send net/unix/af_unix.c:189 [inline] unix_dgram_sendmsg+0x923/0x1610 net/unix/af_unix.c:1712 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg net/socket.c:674 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2350 ___sys_sendmsg net/socket.c:2404 [inline] __sys_sendmmsg+0x315/0x4b0 net/socket.c:2490 __do_sys_sendmmsg net/socket.c:2519 [inline] __se_sys_sendmmsg net/socket.c:2516 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2516 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0xffff888167905400 -> 0x0000000000000000 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 20888 Comm: syz-executor.0 Not tainted 5.13.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/unix/af_unix.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 44ff3f5c22df..8e7054fc27f8 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -535,12 +535,14 @@ static void unix_release_sock(struct sock *sk, int embrion) u->path.mnt = NULL; state = sk->sk_state; sk->sk_state = TCP_CLOSE; + + skpair = unix_peer(sk); + unix_peer(sk) = NULL; + unix_state_unlock(sk); wake_up_interruptible_all(&u->peer_wait); - skpair = unix_peer(sk); - if (skpair != NULL) { if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { unix_state_lock(skpair); @@ -555,7 +557,6 @@ static void unix_release_sock(struct sock *sk, int embrion) unix_dgram_peer_wake_disconnect(sk, skpair); sock_put(skpair); /* It may now die */ - unix_peer(sk) = NULL; } /* Try to flush out this socket. Throw out buffers at least */ From 92c6f44142fb93731ccd4d9bdfea3e7453a285d1 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 16 Jun 2021 20:43:37 +0200 Subject: [PATCH 038/116] be2net: Fix an error handling path in 'be_probe()' [ Upstream commit c19c8c0e666f9259e2fc4d2fa4b9ff8e3b40ee5d ] If an error occurs after a 'pci_enable_pcie_error_reporting()' call, it must be undone by a corresponding 'pci_disable_pcie_error_reporting()' call, as already done in the remove function. Fixes: d6b6d9877878 ("be2net: use PCIe AER capability") Signed-off-by: Christophe JAILLET Acked-by: Somnath Kotur Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/emulex/benet/be_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index cabeb1790db7..43ae124cabff 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -5926,6 +5926,7 @@ static int be_probe(struct pci_dev *pdev, const struct pci_device_id *pdev_id) unmap_bars: be_unmap_pci_bars(adapter); free_netdev: + pci_disable_pcie_error_reporting(pdev); free_netdev(netdev); rel_reg: pci_release_regions(pdev); From 765a8a04f828db7222b36a42b1031f576bfe95c3 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Wed, 16 Jun 2021 22:09:06 +0300 Subject: [PATCH 039/116] net: hamradio: fix memory leak in mkiss_close [ Upstream commit 7edcc682301492380fbdd604b4516af5ae667a13 ] My local syzbot instance hit memory leak in mkiss_open()[1]. The problem was in missing free_netdev() in mkiss_close(). In mkiss_open() netdevice is allocated and then registered, but in mkiss_close() netdevice was only unregistered, but not freed. Fail log: BUG: memory leak unreferenced object 0xffff8880281ba000 (size 4096): comm "syz-executor.1", pid 11443, jiffies 4295046091 (age 17.660s) hex dump (first 32 bytes): 61 78 30 00 00 00 00 00 00 00 00 00 00 00 00 00 ax0............. 00 27 fa 2a 80 88 ff ff 00 00 00 00 00 00 00 00 .'.*............ backtrace: [] kvmalloc_node+0x61/0xf0 [] alloc_netdev_mqs+0x98/0xe80 [] mkiss_open+0xb2/0x6f0 [1] [] tty_ldisc_open+0x9b/0x110 [] tty_set_ldisc+0x2e8/0x670 [] tty_ioctl+0xda3/0x1440 [] __x64_sys_ioctl+0x193/0x200 [] do_syscall_64+0x3a/0xb0 [] entry_SYSCALL_64_after_hwframe+0x44/0xae BUG: memory leak unreferenced object 0xffff8880141a9a00 (size 96): comm "syz-executor.1", pid 11443, jiffies 4295046091 (age 17.660s) hex dump (first 32 bytes): e8 a2 1b 28 80 88 ff ff e8 a2 1b 28 80 88 ff ff ...(.......(.... 98 92 9c aa b0 40 02 00 00 00 00 00 00 00 00 00 .....@.......... backtrace: [] __hw_addr_create_ex+0x5b/0x310 [] __hw_addr_add_ex+0x1f8/0x2b0 [] dev_addr_init+0x10b/0x1f0 [] alloc_netdev_mqs+0x13b/0xe80 [] mkiss_open+0xb2/0x6f0 [1] [] tty_ldisc_open+0x9b/0x110 [] tty_set_ldisc+0x2e8/0x670 [] tty_ioctl+0xda3/0x1440 [] __x64_sys_ioctl+0x193/0x200 [] do_syscall_64+0x3a/0xb0 [] entry_SYSCALL_64_after_hwframe+0x44/0xae BUG: memory leak unreferenced object 0xffff8880219bfc00 (size 512): comm "syz-executor.1", pid 11443, jiffies 4295046091 (age 17.660s) hex dump (first 32 bytes): 00 a0 1b 28 80 88 ff ff 80 8f b1 8d ff ff ff ff ...(............ 80 8f b1 8d ff ff ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [] kvmalloc_node+0x61/0xf0 [] alloc_netdev_mqs+0x777/0xe80 [] mkiss_open+0xb2/0x6f0 [1] [] tty_ldisc_open+0x9b/0x110 [] tty_set_ldisc+0x2e8/0x670 [] tty_ioctl+0xda3/0x1440 [] __x64_sys_ioctl+0x193/0x200 [] do_syscall_64+0x3a/0xb0 [] entry_SYSCALL_64_after_hwframe+0x44/0xae BUG: memory leak unreferenced object 0xffff888029b2b200 (size 256): comm "syz-executor.1", pid 11443, jiffies 4295046091 (age 17.660s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kvmalloc_node+0x61/0xf0 [] alloc_netdev_mqs+0x912/0xe80 [] mkiss_open+0xb2/0x6f0 [1] [] tty_ldisc_open+0x9b/0x110 [] tty_set_ldisc+0x2e8/0x670 [] tty_ioctl+0xda3/0x1440 [] __x64_sys_ioctl+0x193/0x200 [] do_syscall_64+0x3a/0xb0 [] entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: 815f62bf7427 ("[PATCH] SMP rewrite of mkiss") Signed-off-by: Pavel Skripkin Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/hamradio/mkiss.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index 9fd7dab42a53..2074fc55a88a 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -810,6 +810,7 @@ static void mkiss_close(struct tty_struct *tty) ax->tty = NULL; unregister_netdev(ax->dev); + free_netdev(ax->dev); } /* Perform I/O control on an active ax25 channel. */ From 1bcacd6088d61c0ac6a990d87975600a81f3247e Mon Sep 17 00:00:00 2001 From: Linyu Yuan Date: Thu, 17 Jun 2021 07:32:32 +0800 Subject: [PATCH 040/116] net: cdc_eem: fix tx fixup skb leak [ Upstream commit c3b26fdf1b32f91c7a3bc743384b4a298ab53ad7 ] when usbnet transmit a skb, eem fixup it in eem_tx_fixup(), if skb_copy_expand() failed, it return NULL, usbnet_start_xmit() will have no chance to free original skb. fix it by free orginal skb in eem_tx_fixup() first, then check skb clone status, if failed, return NULL to usbnet. Fixes: 9f722c0978b0 ("usbnet: CDC EEM support (v5)") Signed-off-by: Linyu Yuan Reviewed-by: Greg Kroah-Hartman Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/usb/cdc_eem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/cdc_eem.c b/drivers/net/usb/cdc_eem.c index f7180f8db39e..9c15e1a1261b 100644 --- a/drivers/net/usb/cdc_eem.c +++ b/drivers/net/usb/cdc_eem.c @@ -138,10 +138,10 @@ static struct sk_buff *eem_tx_fixup(struct usbnet *dev, struct sk_buff *skb, } skb2 = skb_copy_expand(skb, EEM_HEAD, ETH_FCS_LEN + padlen, flags); + dev_kfree_skb_any(skb); if (!skb2) return NULL; - dev_kfree_skb_any(skb); skb = skb2; done: From 6ae141218d681ffccc477959c3bcc8a5dbe9969a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 18 Jun 2021 13:04:35 +0200 Subject: [PATCH 041/116] icmp: don't send out ICMP messages with a source address of 0.0.0.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 321827477360934dc040e9d3c626bf1de6c3ab3c ] When constructing ICMP response messages, the kernel will try to pick a suitable source address for the outgoing packet. However, if no IPv4 addresses are configured on the system at all, this will fail and we end up producing an ICMP message with a source address of 0.0.0.0. This can happen on a box routing IPv4 traffic via v6 nexthops, for instance. Since 0.0.0.0 is not generally routable on the internet, there's a good chance that such ICMP messages will never make it back to the sender of the original packet that the ICMP message was sent in response to. This, in turn, can create connectivity and PMTUd problems for senders. Fortunately, RFC7600 reserves a dummy address to be used as a source for ICMP messages (192.0.0.8/32), so let's teach the kernel to substitute that address as a last resort if the regular source address selection procedure fails. Below is a quick example reproducing this issue with network namespaces: ip netns add ns0 ip l add type veth peer netns ns0 ip l set dev veth0 up ip a add 10.0.0.1/24 dev veth0 ip a add fc00:dead:cafe:42::1/64 dev veth0 ip r add 10.1.0.0/24 via inet6 fc00:dead:cafe:42::2 ip -n ns0 l set dev veth0 up ip -n ns0 a add fc00:dead:cafe:42::2/64 dev veth0 ip -n ns0 r add 10.0.0.0/24 via inet6 fc00:dead:cafe:42::1 ip netns exec ns0 sysctl -w net.ipv4.icmp_ratelimit=0 ip netns exec ns0 sysctl -w net.ipv4.ip_forward=1 tcpdump -tpni veth0 -c 2 icmp & ping -w 1 10.1.0.1 > /dev/null tcpdump: verbose output suppressed, use -v[v]... for full protocol decode listening on veth0, link-type EN10MB (Ethernet), snapshot length 262144 bytes IP 10.0.0.1 > 10.1.0.1: ICMP echo request, id 29, seq 1, length 64 IP 0.0.0.0 > 10.0.0.1: ICMP net 10.1.0.1 unreachable, length 92 2 packets captured 2 packets received by filter 0 packets dropped by kernel With this patch the above capture changes to: IP 10.0.0.1 > 10.1.0.1: ICMP echo request, id 31127, seq 1, length 64 IP 192.0.0.8 > 10.0.0.1: ICMP net 10.1.0.1 unreachable, length 92 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Juliusz Chroboczek Reviewed-by: David Ahern Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/uapi/linux/in.h | 3 +++ net/ipv4/icmp.c | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 48e8a225b985..2a66ab49f14d 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -280,6 +280,9 @@ struct sockaddr_in { /* Address indicating an error return. */ #define INADDR_NONE ((unsigned long int) 0xffffffff) +/* Dummy address for src of ICMP replies if no real address is set (RFC7600). */ +#define INADDR_DUMMY ((unsigned long int) 0xc0000008) + /* Network number for local host loopback. */ #define IN_LOOPBACKNET 127 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 96ee1fbd999e..ba07f128d7ad 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -743,6 +743,13 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, icmp_param.data_len = room; icmp_param.head_len = sizeof(struct icmphdr); + /* if we don't have a source address at this point, fall back to the + * dummy address instead of sending out a packet with a source address + * of 0.0.0.0 + */ + if (!fl4.saddr) + fl4.saddr = htonl(INADDR_DUMMY); + icmp_push_reply(&icmp_param, &fl4, &ipc, &rt); ende: ip_rt_put(rt); From b1ad283755095a4b9d1431aeb357d7df1a33d3bb Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Fri, 18 Jun 2021 16:49:02 +0300 Subject: [PATCH 042/116] net: ethernet: fix potential use-after-free in ec_bhf_remove [ Upstream commit 9cca0c2d70149160407bda9a9446ce0c29b6e6c6 ] static void ec_bhf_remove(struct pci_dev *dev) { ... struct ec_bhf_priv *priv = netdev_priv(net_dev); unregister_netdev(net_dev); free_netdev(net_dev); pci_iounmap(dev, priv->dma_io); pci_iounmap(dev, priv->io); ... } priv is netdev private data, but it is used after free_netdev(). It can cause use-after-free when accessing priv pointer. So, fix it by moving free_netdev() after pci_iounmap() calls. Fixes: 6af55ff52b02 ("Driver for Beckhoff CX5020 EtherCAT master module.") Signed-off-by: Pavel Skripkin Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/ec_bhf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ec_bhf.c b/drivers/net/ethernet/ec_bhf.c index 1b79a6defd56..5561dd295324 100644 --- a/drivers/net/ethernet/ec_bhf.c +++ b/drivers/net/ethernet/ec_bhf.c @@ -585,10 +585,12 @@ static void ec_bhf_remove(struct pci_dev *dev) struct ec_bhf_priv *priv = netdev_priv(net_dev); unregister_netdev(net_dev); - free_netdev(net_dev); pci_iounmap(dev, priv->dma_io); pci_iounmap(dev, priv->io); + + free_netdev(net_dev); + pci_release_regions(dev); pci_clear_master(dev); pci_disable_device(dev); From 596ca7e388c30052a179d0e641fcfb24ec744988 Mon Sep 17 00:00:00 2001 From: Chen Li Date: Fri, 4 Jun 2021 16:43:02 +0800 Subject: [PATCH 043/116] radeon: use memcpy_to/fromio for UVD fw upload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit ab8363d3875a83f4901eb1cc00ce8afd24de6c85 ] I met a gpu addr bug recently and the kernel log tells me the pc is memcpy/memset and link register is radeon_uvd_resume. As we know, in some architectures, optimized memcpy/memset may not work well on device memory. Trival memcpy_toio/memset_io can fix this problem. BTW, amdgpu has already done it in: commit ba0b2275a678 ("drm/amdgpu: use memcpy_to/fromio for UVD fw upload"), that's why it has no this issue on the same gpu and platform. Signed-off-by: Chen Li Reviewed-by: Christian König Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/radeon/radeon_uvd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c b/drivers/gpu/drm/radeon/radeon_uvd.c index 95f4db70dd22..fde9c69ecc86 100644 --- a/drivers/gpu/drm/radeon/radeon_uvd.c +++ b/drivers/gpu/drm/radeon/radeon_uvd.c @@ -286,7 +286,7 @@ int radeon_uvd_resume(struct radeon_device *rdev) if (rdev->uvd.vcpu_bo == NULL) return -EINVAL; - memcpy(rdev->uvd.cpu_addr, rdev->uvd_fw->data, rdev->uvd_fw->size); + memcpy_toio((void __iomem *)rdev->uvd.cpu_addr, rdev->uvd_fw->data, rdev->uvd_fw->size); size = radeon_bo_size(rdev->uvd.vcpu_bo); size -= rdev->uvd_fw->size; @@ -294,7 +294,7 @@ int radeon_uvd_resume(struct radeon_device *rdev) ptr = rdev->uvd.cpu_addr; ptr += rdev->uvd_fw->size; - memset(ptr, 0, size); + memset_io((void __iomem *)ptr, 0, size); return 0; } From 23f97d0f3176636dbb2ffd10a1a03efd70be5290 Mon Sep 17 00:00:00 2001 From: Riwen Lu Date: Fri, 4 Jun 2021 11:09:59 +0800 Subject: [PATCH 044/116] hwmon: (scpi-hwmon) shows the negative temperature properly [ Upstream commit 78d13552346289bad4a9bf8eabb5eec5e5a321a5 ] The scpi hwmon shows the sub-zero temperature in an unsigned integer, which would confuse the users when the machine works in low temperature environment. This shows the sub-zero temperature in an signed value and users can get it properly from sensors. Signed-off-by: Riwen Lu Tested-by: Xin Chen Link: https://lore.kernel.org/r/20210604030959.736379-1-luriwen@kylinos.cn Signed-off-by: Guenter Roeck Signed-off-by: Sasha Levin --- drivers/hwmon/scpi-hwmon.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/hwmon/scpi-hwmon.c b/drivers/hwmon/scpi-hwmon.c index 7e49da50bc69..562f3e287297 100644 --- a/drivers/hwmon/scpi-hwmon.c +++ b/drivers/hwmon/scpi-hwmon.c @@ -107,6 +107,15 @@ scpi_show_sensor(struct device *dev, struct device_attribute *attr, char *buf) scpi_scale_reading(&value, sensor); + /* + * Temperature sensor values are treated as signed values based on + * observation even though that is not explicitly specified, and + * because an unsigned u64 temperature does not really make practical + * sense especially when the temperature is below zero degrees Celsius. + */ + if (sensor->info.class == TEMPERATURE) + return sprintf(buf, "%lld\n", (s64)value); + return sprintf(buf, "%llu\n", value); } From 4fa028860bb1656f370851c2c26de15fc67da300 Mon Sep 17 00:00:00 2001 From: Norbert Slusarek Date: Sat, 12 Jun 2021 22:18:54 +0200 Subject: [PATCH 045/116] can: bcm: fix infoleak in struct bcm_msg_head commit 5e87ddbe3942e27e939bdc02deb8579b0cbd8ecc upstream. On 64-bit systems, struct bcm_msg_head has an added padding of 4 bytes between struct members count and ival1. Even though all struct members are initialized, the 4-byte hole will contain data from the kernel stack. This patch zeroes out struct bcm_msg_head before usage, preventing infoleaks to userspace. Fixes: ffd980f976e7 ("[CAN]: Add broadcast manager (bcm) protocol") Link: https://lore.kernel.org/r/trinity-7c1b2e82-e34f-4885-8060-2cd7a13769ce-1623532166177@3c-app-gmx-bs52 Cc: linux-stable Signed-off-by: Norbert Slusarek Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- net/can/bcm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/can/bcm.c b/net/can/bcm.c index 12d851c4604d..f691afcc5b8b 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -406,6 +406,7 @@ static void bcm_tx_timeout_tsklet(unsigned long data) if (!op->count && (op->flags & TX_COUNTEVT)) { /* create notification to user */ + memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = TX_EXPIRED; msg_head.flags = op->flags; msg_head.count = op->count; @@ -453,6 +454,7 @@ static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data) /* this element is not throttled anymore */ data->flags &= (BCM_CAN_FLAGS_MASK|RX_RECV); + memset(&head, 0, sizeof(head)); head.opcode = RX_CHANGED; head.flags = op->flags; head.count = op->count; @@ -567,6 +569,7 @@ static void bcm_rx_timeout_tsklet(unsigned long data) struct bcm_msg_head msg_head; /* create notification to user */ + memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = RX_TIMEOUT; msg_head.flags = op->flags; msg_head.count = op->count; From 89df95ce32be204eef2e7d4b2f6fb552fb191a68 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Thu, 10 Jun 2021 00:58:33 +0300 Subject: [PATCH 046/116] can: mcba_usb: fix memory leak in mcba_usb commit 91c02557174be7f72e46ed7311e3bea1939840b0 upstream. Syzbot reported memory leak in SocketCAN driver for Microchip CAN BUS Analyzer Tool. The problem was in unfreed usb_coherent. In mcba_usb_start() 20 coherent buffers are allocated and there is nothing, that frees them: 1) In callback function the urb is resubmitted and that's all 2) In disconnect function urbs are simply killed, but URB_FREE_BUFFER is not set (see mcba_usb_start) and this flag cannot be used with coherent buffers. Fail log: | [ 1354.053291][ T8413] mcba_usb 1-1:0.0 can0: device disconnected | [ 1367.059384][ T8420] kmemleak: 20 new suspected memory leaks (see /sys/kernel/debug/kmem) So, all allocated buffers should be freed with usb_free_coherent() explicitly NOTE: The same pattern for allocating and freeing coherent buffers is used in drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c Fixes: 51f3baad7de9 ("can: mcba_usb: Add support for Microchip CAN BUS Analyzer") Link: https://lore.kernel.org/r/20210609215833.30393-1-paskripkin@gmail.com Cc: linux-stable Reported-and-tested-by: syzbot+57281c762a3922e14dfe@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/usb/mcba_usb.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c index a09e3f6c2c50..6b0c6009dde0 100644 --- a/drivers/net/can/usb/mcba_usb.c +++ b/drivers/net/can/usb/mcba_usb.c @@ -93,6 +93,8 @@ struct mcba_priv { bool can_ka_first_pass; bool can_speed_check; atomic_t free_ctx_cnt; + void *rxbuf[MCBA_MAX_RX_URBS]; + dma_addr_t rxbuf_dma[MCBA_MAX_RX_URBS]; }; /* CAN frame */ @@ -644,6 +646,7 @@ static int mcba_usb_start(struct mcba_priv *priv) for (i = 0; i < MCBA_MAX_RX_URBS; i++) { struct urb *urb = NULL; u8 *buf; + dma_addr_t buf_dma; /* create a URB, and a buffer for it */ urb = usb_alloc_urb(0, GFP_KERNEL); @@ -653,7 +656,7 @@ static int mcba_usb_start(struct mcba_priv *priv) } buf = usb_alloc_coherent(priv->udev, MCBA_USB_RX_BUFF_SIZE, - GFP_KERNEL, &urb->transfer_dma); + GFP_KERNEL, &buf_dma); if (!buf) { netdev_err(netdev, "No memory left for USB buffer\n"); usb_free_urb(urb); @@ -672,11 +675,14 @@ static int mcba_usb_start(struct mcba_priv *priv) if (err) { usb_unanchor_urb(urb); usb_free_coherent(priv->udev, MCBA_USB_RX_BUFF_SIZE, - buf, urb->transfer_dma); + buf, buf_dma); usb_free_urb(urb); break; } + priv->rxbuf[i] = buf; + priv->rxbuf_dma[i] = buf_dma; + /* Drop reference, USB core will take care of freeing it */ usb_free_urb(urb); } @@ -719,7 +725,14 @@ static int mcba_usb_open(struct net_device *netdev) static void mcba_urb_unlink(struct mcba_priv *priv) { + int i; + usb_kill_anchored_urbs(&priv->rx_submitted); + + for (i = 0; i < MCBA_MAX_RX_URBS; ++i) + usb_free_coherent(priv->udev, MCBA_USB_RX_BUFF_SIZE, + priv->rxbuf[i], priv->rxbuf_dma[i]); + usb_kill_anchored_urbs(&priv->tx_submitted); } From fb28ec5c76c1252c8654f2e8209e9d71ab3a3931 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 14 Jun 2021 17:55:23 +0200 Subject: [PATCH 047/116] usb: core: hub: Disable autosuspend for Cypress CY7C65632 commit a7d8d1c7a7f73e780aa9ae74926ae5985b2f895f upstream. The Cypress CY7C65632 appears to have an issue with auto suspend and detecting devices, not too dissimilar to the SMSC 5534B hub. It is easiest to reproduce by connecting multiple mass storage devices to the hub at the same time. On a Lenovo Yoga, around 1 in 3 attempts result in the devices not being detected. It is however possible to make them appear using lsusb -v. Disabling autosuspend for this hub resolves the issue. Fixes: 1208f9e1d758 ("USB: hub: Fix the broken detection of USB3 device in SMSC hub") Cc: stable@vger.kernel.org Signed-off-by: Andrew Lunn Link: https://lore.kernel.org/r/20210614155524.2228800-1-andrew@lunn.ch Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 40743e9e9d93..276a63ae510e 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -38,6 +38,8 @@ #define USB_VENDOR_GENESYS_LOGIC 0x05e3 #define USB_VENDOR_SMSC 0x0424 #define USB_PRODUCT_USB5534B 0x5534 +#define USB_VENDOR_CYPRESS 0x04b4 +#define USB_PRODUCT_CY7C65632 0x6570 #define HUB_QUIRK_CHECK_PORT_AUTOSUSPEND 0x01 #define HUB_QUIRK_DISABLE_AUTOSUSPEND 0x02 @@ -5325,6 +5327,11 @@ static const struct usb_device_id hub_id_table[] = { .idProduct = USB_PRODUCT_USB5534B, .bInterfaceClass = USB_CLASS_HUB, .driver_info = HUB_QUIRK_DISABLE_AUTOSUSPEND}, + { .match_flags = USB_DEVICE_ID_MATCH_VENDOR + | USB_DEVICE_ID_MATCH_PRODUCT, + .idVendor = USB_VENDOR_CYPRESS, + .idProduct = USB_PRODUCT_CY7C65632, + .driver_info = HUB_QUIRK_DISABLE_AUTOSUSPEND}, { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_INT_CLASS, .idVendor = USB_VENDOR_GENESYS_LOGIC, From de4c5eef11e0d2b70b7e8c02216aecf802857182 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 17 Jun 2021 13:47:25 -0400 Subject: [PATCH 048/116] tracing: Do not stop recording cmdlines when tracing is off commit 85550c83da421fb12dc1816c45012e1e638d2b38 upstream. The saved_cmdlines is used to map pids to the task name, such that the output of the tracing does not just show pids, but also gives a human readable name for the task. If the name is not mapped, the output looks like this: <...>-1316 [005] ...2 132.044039: ... Instead of this: gnome-shell-1316 [005] ...2 132.044039: ... The names are updated when tracing is running, but are skipped if tracing is stopped. Unfortunately, this stops the recording of the names if the top level tracer is stopped, and not if there's other tracers active. The recording of a name only happens when a new event is written into a ring buffer, so there is no need to test if tracing is on or not. If tracing is off, then no event is written and no need to test if tracing is off or not. Remove the check, as it hides the names of tasks for events in the instance buffers. Cc: stable@vger.kernel.org Fixes: 7ffbd48d5cab2 ("tracing: Cache comms only after an event occurred") Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42531ee852ff..badc7c13da46 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2024,8 +2024,6 @@ static bool tracing_record_taskinfo_skip(int flags) { if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) return true; - if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on()) - return true; if (!__this_cpu_read(trace_taskinfo_save)) return true; return false; From 5ad487e6fa6a6452d289d584f4f2275a45a04196 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 17 Jun 2021 14:32:34 -0400 Subject: [PATCH 049/116] tracing: Do not stop recording comms if the trace file is being read commit 4fdd595e4f9a1ff6d93ec702eaecae451cfc6591 upstream. A while ago, when the "trace" file was opened, tracing was stopped, and code was added to stop recording the comms to saved_cmdlines, for mapping of the pids to the task name. Code has been added that only records the comm if a trace event occurred, and there's no reason to not trace it if the trace file is opened. Cc: stable@vger.kernel.org Fixes: 7ffbd48d5cab2 ("tracing: Cache comms only after an event occurred") Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index badc7c13da46..0cc67bc0666e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1737,9 +1737,6 @@ struct saved_cmdlines_buffer { }; static struct saved_cmdlines_buffer *savedcmd; -/* temporary disable recording */ -static atomic_t trace_record_taskinfo_disabled __read_mostly; - static inline char *get_saved_cmdlines(int idx) { return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; @@ -3257,9 +3254,6 @@ static void *s_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-EBUSY); #endif - if (!iter->snapshot) - atomic_inc(&trace_record_taskinfo_disabled); - if (*pos != iter->pos) { iter->ent = NULL; iter->cpu = 0; @@ -3302,9 +3296,6 @@ static void s_stop(struct seq_file *m, void *p) return; #endif - if (!iter->snapshot) - atomic_dec(&trace_record_taskinfo_disabled); - trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } From 43e1f748476db29003fb155c2f9696f5b25a7da1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 17 Jun 2021 17:12:35 -0400 Subject: [PATCH 050/116] tracing: Do no increment trace_clock_global() by one commit 89529d8b8f8daf92d9979382b8d2eb39966846ea upstream. The trace_clock_global() tries to make sure the events between CPUs is somewhat in order. A global value is used and updated by the latest read of a clock. If one CPU is ahead by a little, and is read by another CPU, a lock is taken, and if the timestamp of the other CPU is behind, it will simply use the other CPUs timestamp. The lock is also only taken with a "trylock" due to tracing, and strange recursions can happen. The lock is not taken at all in NMI context. In the case where the lock is not able to be taken, the non synced timestamp is returned. But it will not be less than the saved global timestamp. The problem arises because when the time goes "backwards" the time returned is the saved timestamp plus 1. If the lock is not taken, and the plus one to the timestamp is returned, there's a small race that can cause the time to go backwards! CPU0 CPU1 ---- ---- trace_clock_global() { ts = clock() [ 1000 ] trylock(clock_lock) [ success ] global_ts = ts; [ 1000 ] trace_clock_global() { ts = clock() [ 999 ] if (ts < global_ts) ts = global_ts + 1 [ 1001 ] trylock(clock_lock) [ fail ] return ts [ 1001] } unlock(clock_lock); return ts; [ 1000 ] } trace_clock_global() { ts = clock() [ 1000 ] if (ts < global_ts) [ false 1000 == 1000 ] trylock(clock_lock) [ success ] global_ts = ts; [ 1000 ] unlock(clock_lock) return ts; [ 1000 ] } The above case shows to reads of trace_clock_global() on the same CPU, but the second read returns one less than the first read. That is, time when backwards, and this is not what is allowed by trace_clock_global(). This was triggered by heavy tracing and the ring buffer checker that tests for the clock going backwards: Ring buffer clock went backwards: 20613921464 -> 20613921463 ------------[ cut here ]------------ WARNING: CPU: 2 PID: 0 at kernel/trace/ring_buffer.c:3412 check_buffer+0x1b9/0x1c0 Modules linked in: [..] [CPU: 2]TIME DOES NOT MATCH expected:20620711698 actual:20620711697 delta:6790234 before:20613921463 after:20613921463 [20613915818] PAGE TIME STAMP [20613915818] delta:0 [20613915819] delta:1 [20613916035] delta:216 [20613916465] delta:430 [20613916575] delta:110 [20613916749] delta:174 [20613917248] delta:499 [20613917333] delta:85 [20613917775] delta:442 [20613917921] delta:146 [20613918321] delta:400 [20613918568] delta:247 [20613918768] delta:200 [20613919306] delta:538 [20613919353] delta:47 [20613919980] delta:627 [20613920296] delta:316 [20613920571] delta:275 [20613920862] delta:291 [20613921152] delta:290 [20613921464] delta:312 [20613921464] delta:0 TIME EXTEND [20613921464] delta:0 This happened more than once, and always for an off by one result. It also started happening after commit aafe104aa9096 was added. Cc: stable@vger.kernel.org Fixes: aafe104aa9096 ("tracing: Restructure trace_clock_global() to never block") Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_clock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index c82875834c42..b3b02d2c2926 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -114,9 +114,9 @@ u64 notrace trace_clock_global(void) prev_time = READ_ONCE(trace_clock_struct.prev_time); now = sched_clock_cpu(this_cpu); - /* Make sure that now is always greater than prev_time */ + /* Make sure that now is always greater than or equal to prev_time */ if ((s64)(now - prev_time) < 0) - now = prev_time + 1; + now = prev_time; /* * If in an NMI context then dont risk lockups and simply return @@ -130,7 +130,7 @@ u64 notrace trace_clock_global(void) /* Reread prev_time in case it was already updated */ prev_time = READ_ONCE(trace_clock_struct.prev_time); if ((s64)(now - prev_time) < 0) - now = prev_time + 1; + now = prev_time; trace_clock_struct.prev_time = now; From 159111174439c1ca280372e1aa10429b9b22043f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antti=20J=C3=A4rvinen?= Date: Mon, 15 Mar 2021 10:26:06 +0000 Subject: [PATCH 051/116] PCI: Mark TI C667X to avoid bus reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit b5cf198e74a91073d12839a3e2db99994a39995d upstream. Some TI KeyStone C667X devices do not support bus/hot reset. The PCIESS automatically disables LTSSM when Secondary Bus Reset is received and device stops working. Prevent bus reset for these devices. With this change, the device can be assigned to VMs with VFIO, but it will leak state between VMs. Reference: https://e2e.ti.com/support/processors/f/791/t/954382 Link: https://lore.kernel.org/r/20210315102606.17153-1-antti.jarvinen@gmail.com Signed-off-by: Antti Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Kishon Vijay Abraham I Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 510cb05aa96f..f0cd8f7ce881 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3402,6 +3402,16 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x003c, quirk_no_bus_reset); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0033, quirk_no_bus_reset); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0034, quirk_no_bus_reset); +/* + * Some TI KeyStone C667X devices do not support bus/hot reset. The PCIESS + * automatically disables LTSSM when Secondary Bus Reset is received and + * the device stops working. Prevent bus reset for these devices. With + * this change, the device can be assigned to VMs with VFIO, but it will + * leak state between VMs. Reference + * https://e2e.ti.com/support/processors/f/791/t/954382 + */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_TI, 0xb005, quirk_no_bus_reset); + static void quirk_no_pm_reset(struct pci_dev *dev) { /* From 0deb965d5bed0c367f175b988487c8b524bb061e Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Tue, 8 Jun 2021 11:18:56 +0530 Subject: [PATCH 052/116] PCI: Mark some NVIDIA GPUs to avoid bus reset commit 4c207e7121fa92b66bf1896bf8ccb9edfb0f9731 upstream. Some NVIDIA GPU devices do not work with SBR. Triggering SBR leaves the device inoperable for the current system boot. It requires a system hard-reboot to get the GPU device back to normal operating condition post-SBR. For the affected devices, enable NO_BUS_RESET quirk to avoid the issue. This issue will be fixed in the next generation of hardware. Link: https://lore.kernel.org/r/20210608054857.18963-8-ameynarkhede03@gmail.com Signed-off-by: Shanker Donthineni Signed-off-by: Bjorn Helgaas Reviewed-by: Sinan Kaya Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index f0cd8f7ce881..de7d65971d7f 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3389,6 +3389,18 @@ static void quirk_no_bus_reset(struct pci_dev *dev) dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; } +/* + * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be + * prevented for those affected devices. + */ +static void quirk_nvidia_no_bus_reset(struct pci_dev *dev) +{ + if ((dev->device & 0xffc0) == 0x2340) + quirk_no_bus_reset(dev); +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, + quirk_nvidia_no_bus_reset); + /* * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset. * The device will throw a Link Down error on AER-capable systems and From 3aceaeefedfb88df35fcd67ff861f0973b86639e Mon Sep 17 00:00:00 2001 From: Sriharsha Basavapatna Date: Fri, 21 May 2021 21:13:17 -0400 Subject: [PATCH 053/116] PCI: Add ACS quirk for Broadcom BCM57414 NIC commit db2f77e2bd99dbd2fb23ddde58f0fae392fe3338 upstream. The Broadcom BCM57414 NIC may be a multi-function device. While it does not advertise an ACS capability, peer-to-peer transactions are not possible between the individual functions, so it is safe to treat them as fully isolated. Add an ACS quirk for this device so the functions can be in independent IOMMU groups and attached individually to userspace applications using VFIO. [bhelgaas: commit log] Link: https://lore.kernel.org/r/1621645997-16251-1-git-send-email-michael.chan@broadcom.com Signed-off-by: Sriharsha Basavapatna Signed-off-by: Michael Chan Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index de7d65971d7f..3602e967e96a 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4684,6 +4684,8 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_AMPERE, 0xE00A, pci_quirk_xgene_acs }, { PCI_VENDOR_ID_AMPERE, 0xE00B, pci_quirk_xgene_acs }, { PCI_VENDOR_ID_AMPERE, 0xE00C, pci_quirk_xgene_acs }, + /* Broadcom multi-function device */ + { PCI_VENDOR_ID_BROADCOM, 0x16D7, pci_quirk_mf_endpoint_acs }, { PCI_VENDOR_ID_BROADCOM, 0xD714, pci_quirk_brcm_acs }, { 0 } }; From 5e6cbd2a05f525a5803bfbb163288b564163dc37 Mon Sep 17 00:00:00 2001 From: Chiqijun Date: Mon, 24 May 2021 17:44:07 -0500 Subject: [PATCH 054/116] PCI: Work around Huawei Intelligent NIC VF FLR erratum commit ce00322c2365e1f7b0312f2f493539c833465d97 upstream. pcie_flr() starts a Function Level Reset (FLR), waits 100ms (the maximum time allowed for FLR completion by PCIe r5.0, sec 6.6.2), and waits for the FLR to complete. It assumes the FLR is complete when a config read returns valid data. When we do an FLR on several Huawei Intelligent NIC VFs at the same time, firmware on the NIC processes them serially. The VF may respond to config reads before the firmware has completed its reset processing. If we bind a driver to the VF (e.g., by assigning the VF to a virtual machine) in the interval between the successful config read and completion of the firmware reset processing, the NIC VF driver may fail to load. Prevent this driver failure by waiting for the NIC firmware to complete its reset processing. Not all NIC firmware supports this feature. [bhelgaas: commit log] Link: https://support.huawei.com/enterprise/en/doc/EDOC1100063073/87950645/vm-oss-occasionally-fail-to-load-the-in200-driver-when-the-vf-performs-flr Link: https://lore.kernel.org/r/20210414132301.1793-1-chiqijun@huawei.com Signed-off-by: Chiqijun Signed-off-by: Bjorn Helgaas Reviewed-by: Alex Williamson Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 65 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 3602e967e96a..db1ec8209b56 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3875,6 +3875,69 @@ static int reset_chelsio_generic_dev(struct pci_dev *dev, int probe) #define PCI_DEVICE_ID_INTEL_IVB_M_VGA 0x0156 #define PCI_DEVICE_ID_INTEL_IVB_M2_VGA 0x0166 +#define PCI_DEVICE_ID_HINIC_VF 0x375E +#define HINIC_VF_FLR_TYPE 0x1000 +#define HINIC_VF_FLR_CAP_BIT (1UL << 30) +#define HINIC_VF_OP 0xE80 +#define HINIC_VF_FLR_PROC_BIT (1UL << 18) +#define HINIC_OPERATION_TIMEOUT 15000 /* 15 seconds */ + +/* Device-specific reset method for Huawei Intelligent NIC virtual functions */ +static int reset_hinic_vf_dev(struct pci_dev *pdev, int probe) +{ + unsigned long timeout; + void __iomem *bar; + u32 val; + + if (probe) + return 0; + + bar = pci_iomap(pdev, 0, 0); + if (!bar) + return -ENOTTY; + + /* Get and check firmware capabilities */ + val = ioread32be(bar + HINIC_VF_FLR_TYPE); + if (!(val & HINIC_VF_FLR_CAP_BIT)) { + pci_iounmap(pdev, bar); + return -ENOTTY; + } + + /* Set HINIC_VF_FLR_PROC_BIT for the start of FLR */ + val = ioread32be(bar + HINIC_VF_OP); + val = val | HINIC_VF_FLR_PROC_BIT; + iowrite32be(val, bar + HINIC_VF_OP); + + pcie_flr(pdev); + + /* + * The device must recapture its Bus and Device Numbers after FLR + * in order generate Completions. Issue a config write to let the + * device capture this information. + */ + pci_write_config_word(pdev, PCI_VENDOR_ID, 0); + + /* Firmware clears HINIC_VF_FLR_PROC_BIT when reset is complete */ + timeout = jiffies + msecs_to_jiffies(HINIC_OPERATION_TIMEOUT); + do { + val = ioread32be(bar + HINIC_VF_OP); + if (!(val & HINIC_VF_FLR_PROC_BIT)) + goto reset_complete; + msleep(20); + } while (time_before(jiffies, timeout)); + + val = ioread32be(bar + HINIC_VF_OP); + if (!(val & HINIC_VF_FLR_PROC_BIT)) + goto reset_complete; + + pci_warn(pdev, "Reset dev timeout, FLR ack reg: %#010x\n", val); + +reset_complete: + pci_iounmap(pdev, bar); + + return 0; +} + static const struct pci_dev_reset_methods pci_dev_reset_methods[] = { { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82599_SFP_VF, reset_intel_82599_sfp_virtfn }, @@ -3884,6 +3947,8 @@ static const struct pci_dev_reset_methods pci_dev_reset_methods[] = { reset_ivb_igd }, { PCI_VENDOR_ID_CHELSIO, PCI_ANY_ID, reset_chelsio_generic_dev }, + { PCI_VENDOR_ID_HUAWEI, PCI_DEVICE_ID_HINIC_VF, + reset_hinic_vf_dev }, { 0 } }; From 1c3e3f6d88437844e2edd22e4d1ae074bac7e352 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 8 Jun 2021 19:39:25 -0700 Subject: [PATCH 055/116] ARCv2: save ABI registers across signal handling commit 96f1b00138cb8f04c742c82d0a7c460b2202e887 upstream. ARCv2 has some configuration dependent registers (r30, r58, r59) which could be targetted by the compiler. To keep the ABI stable, these were unconditionally part of the glibc ABI (sysdeps/unix/sysv/linux/arc/sys/ucontext.h:mcontext_t) however we missed populating them (by saving/restoring them across signal handling). This patch fixes the issue by - adding arcv2 ABI regs to kernel struct sigcontext - populating them during signal handling Change to struct sigcontext might seem like a glibc ABI change (although it primarily uses ucontext_t:mcontext_t) but the fact is - it has only been extended (existing fields are not touched) - the old sigcontext was ABI incomplete to begin with anyways Fixes: https://github.com/foss-for-synopsys-dwc-arc-processors/linux/issues/53 Cc: Tested-by: kernel test robot Reported-by: Vladimir Isaev Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman --- arch/arc/include/uapi/asm/sigcontext.h | 1 + arch/arc/kernel/signal.c | 43 ++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/arch/arc/include/uapi/asm/sigcontext.h b/arch/arc/include/uapi/asm/sigcontext.h index 95f8a4380e11..7a5449dfcb29 100644 --- a/arch/arc/include/uapi/asm/sigcontext.h +++ b/arch/arc/include/uapi/asm/sigcontext.h @@ -18,6 +18,7 @@ */ struct sigcontext { struct user_regs_struct regs; + struct user_regs_arcv2 v2abi; }; #endif /* _ASM_ARC_SIGCONTEXT_H */ diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c index da243420bcb5..68901f6f18ba 100644 --- a/arch/arc/kernel/signal.c +++ b/arch/arc/kernel/signal.c @@ -64,6 +64,41 @@ struct rt_sigframe { unsigned int sigret_magic; }; +static int save_arcv2_regs(struct sigcontext *mctx, struct pt_regs *regs) +{ + int err = 0; +#ifndef CONFIG_ISA_ARCOMPACT + struct user_regs_arcv2 v2abi; + + v2abi.r30 = regs->r30; +#ifdef CONFIG_ARC_HAS_ACCL_REGS + v2abi.r58 = regs->r58; + v2abi.r59 = regs->r59; +#else + v2abi.r58 = v2abi.r59 = 0; +#endif + err = __copy_to_user(&mctx->v2abi, &v2abi, sizeof(v2abi)); +#endif + return err; +} + +static int restore_arcv2_regs(struct sigcontext *mctx, struct pt_regs *regs) +{ + int err = 0; +#ifndef CONFIG_ISA_ARCOMPACT + struct user_regs_arcv2 v2abi; + + err = __copy_from_user(&v2abi, &mctx->v2abi, sizeof(v2abi)); + + regs->r30 = v2abi.r30; +#ifdef CONFIG_ARC_HAS_ACCL_REGS + regs->r58 = v2abi.r58; + regs->r59 = v2abi.r59; +#endif +#endif + return err; +} + static int stash_usr_regs(struct rt_sigframe __user *sf, struct pt_regs *regs, sigset_t *set) @@ -97,6 +132,10 @@ stash_usr_regs(struct rt_sigframe __user *sf, struct pt_regs *regs, err = __copy_to_user(&(sf->uc.uc_mcontext.regs.scratch), &uregs.scratch, sizeof(sf->uc.uc_mcontext.regs.scratch)); + + if (is_isa_arcv2()) + err |= save_arcv2_regs(&(sf->uc.uc_mcontext), regs); + err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(sigset_t)); return err ? -EFAULT : 0; @@ -112,6 +151,10 @@ static int restore_usr_regs(struct pt_regs *regs, struct rt_sigframe __user *sf) err |= __copy_from_user(&uregs.scratch, &(sf->uc.uc_mcontext.regs.scratch), sizeof(sf->uc.uc_mcontext.regs.scratch)); + + if (is_isa_arcv2()) + err |= restore_arcv2_regs(&(sf->uc.uc_mcontext), regs); + if (err) return -EFAULT; From d7899e123882b551bcd8e220aabae70dda93f8c2 Mon Sep 17 00:00:00 2001 From: Bumyong Lee Date: Fri, 7 May 2021 15:36:47 +0900 Subject: [PATCH 056/116] dmaengine: pl330: fix wrong usage of spinlock flags in dma_cyclc commit 4ad5dd2d7876d79507a20f026507d1a93b8fff10 upstream. flags varible which is the input parameter of pl330_prep_dma_cyclic() should not be used by spinlock_irq[save/restore] function. Signed-off-by: Jongho Park Signed-off-by: Bumyong Lee Signed-off-by: Chanho Park Link: https://lore.kernel.org/r/20210507063647.111209-1-chanho61.park@samsung.com Fixes: f6f2421c0a1c ("dmaengine: pl330: Merge dma_pl330_dmac and pl330_dmac structs") Cc: stable@vger.kernel.org Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/pl330.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index c034f506e015..a8cea236099a 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -2563,13 +2563,15 @@ static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic( for (i = 0; i < len / period_len; i++) { desc = pl330_get_desc(pch); if (!desc) { + unsigned long iflags; + dev_err(pch->dmac->ddma.dev, "%s:%d Unable to fetch desc\n", __func__, __LINE__); if (!first) return NULL; - spin_lock_irqsave(&pl330->pool_lock, flags); + spin_lock_irqsave(&pl330->pool_lock, iflags); while (!list_empty(&first->node)) { desc = list_entry(first->node.next, @@ -2579,7 +2581,7 @@ static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic( list_move_tail(&first->node, &pl330->desc_pool); - spin_unlock_irqrestore(&pl330->pool_lock, flags); + spin_unlock_irqrestore(&pl330->pool_lock, iflags); return NULL; } From ad7feefe7164892db424c45687472db803d87f79 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 10 Jun 2021 15:04:10 +0300 Subject: [PATCH 057/116] net: bridge: fix vlan tunnel dst null pointer dereference commit 58e2071742e38f29f051b709a5cca014ba51166f upstream. This patch fixes a tunnel_dst null pointer dereference due to lockless access in the tunnel egress path. When deleting a vlan tunnel the tunnel_dst pointer is set to NULL without waiting a grace period (i.e. while it's still usable) and packets egressing are dereferencing it without checking. Use READ/WRITE_ONCE to annotate the lockless use of tunnel_id, use RCU for accessing tunnel_dst and make sure it is read only once and checked in the egress path. The dst is already properly RCU protected so we don't need to do anything fancy than to make sure tunnel_id and tunnel_dst are read only once and checked in the egress path. Cc: stable@vger.kernel.org Fixes: 11538d039ac6 ("bridge: vlan dst_metadata hooks in ingress and egress paths") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_private.h | 4 ++-- net/bridge/br_vlan_tunnel.c | 38 +++++++++++++++++++++++-------------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 14ff034e561c..50a55553a25c 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -93,8 +93,8 @@ struct br_vlan_stats { }; struct br_tunnel_info { - __be64 tunnel_id; - struct metadata_dst *tunnel_dst; + __be64 tunnel_id; + struct metadata_dst __rcu *tunnel_dst; }; /** diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index 6d2c4eed2dc8..4d5100677c68 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -46,26 +46,33 @@ static struct net_bridge_vlan *br_vlan_tunnel_lookup(struct rhashtable *tbl, br_vlan_tunnel_rht_params); } +static void vlan_tunnel_info_release(struct net_bridge_vlan *vlan) +{ + struct metadata_dst *tdst = rtnl_dereference(vlan->tinfo.tunnel_dst); + + WRITE_ONCE(vlan->tinfo.tunnel_id, 0); + RCU_INIT_POINTER(vlan->tinfo.tunnel_dst, NULL); + dst_release(&tdst->dst); +} + void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan) { - if (!vlan->tinfo.tunnel_dst) + if (!rcu_access_pointer(vlan->tinfo.tunnel_dst)) return; rhashtable_remove_fast(&vg->tunnel_hash, &vlan->tnode, br_vlan_tunnel_rht_params); - vlan->tinfo.tunnel_id = 0; - dst_release(&vlan->tinfo.tunnel_dst->dst); - vlan->tinfo.tunnel_dst = NULL; + vlan_tunnel_info_release(vlan); } static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan, u32 tun_id) { - struct metadata_dst *metadata = NULL; + struct metadata_dst *metadata = rtnl_dereference(vlan->tinfo.tunnel_dst); __be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id)); int err; - if (vlan->tinfo.tunnel_dst) + if (metadata) return -EEXIST; metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY, @@ -74,8 +81,8 @@ static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg, return -EINVAL; metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE; - vlan->tinfo.tunnel_dst = metadata; - vlan->tinfo.tunnel_id = key; + rcu_assign_pointer(vlan->tinfo.tunnel_dst, metadata); + WRITE_ONCE(vlan->tinfo.tunnel_id, key); err = rhashtable_lookup_insert_fast(&vg->tunnel_hash, &vlan->tnode, br_vlan_tunnel_rht_params); @@ -84,9 +91,7 @@ static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg, return 0; out: - dst_release(&vlan->tinfo.tunnel_dst->dst); - vlan->tinfo.tunnel_dst = NULL; - vlan->tinfo.tunnel_id = 0; + vlan_tunnel_info_release(vlan); return err; } @@ -186,12 +191,15 @@ int br_handle_ingress_vlan_tunnel(struct sk_buff *skb, int br_handle_egress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_vlan *vlan) { + struct metadata_dst *tunnel_dst; + __be64 tunnel_id; int err; - if (!vlan || !vlan->tinfo.tunnel_id) + if (!vlan) return 0; - if (unlikely(!skb_vlan_tag_present(skb))) + tunnel_id = READ_ONCE(vlan->tinfo.tunnel_id); + if (!tunnel_id || unlikely(!skb_vlan_tag_present(skb))) return 0; skb_dst_drop(skb); @@ -199,7 +207,9 @@ int br_handle_egress_vlan_tunnel(struct sk_buff *skb, if (err) return err; - skb_dst_set(skb, dst_clone(&vlan->tinfo.tunnel_dst->dst)); + tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst); + if (tunnel_dst) + skb_dst_set(skb, dst_clone(&tunnel_dst->dst)); return 0; } From 42020f7f37a90d24b9551f5f7eba3f7c7c102968 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 10 Jun 2021 15:04:11 +0300 Subject: [PATCH 058/116] net: bridge: fix vlan tunnel dst refcnt when egressing commit cfc579f9d89af4ada58c69b03bcaa4887840f3b3 upstream. The egress tunnel code uses dst_clone() and directly sets the result which is wrong because the entry might have 0 refcnt or be already deleted, causing number of problems. It also triggers the WARN_ON() in dst_hold()[1] when a refcnt couldn't be taken. Fix it by using dst_hold_safe() and checking if a reference was actually taken before setting the dst. [1] dmesg WARN_ON log and following refcnt errors WARNING: CPU: 5 PID: 38 at include/net/dst.h:230 br_handle_egress_vlan_tunnel+0x10b/0x134 [bridge] Modules linked in: 8021q garp mrp bridge stp llc bonding ipv6 virtio_net CPU: 5 PID: 38 Comm: ksoftirqd/5 Kdump: loaded Tainted: G W 5.13.0-rc3+ #360 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 RIP: 0010:br_handle_egress_vlan_tunnel+0x10b/0x134 [bridge] Code: e8 85 bc 01 e1 45 84 f6 74 90 45 31 f6 85 db 48 c7 c7 a0 02 19 a0 41 0f 94 c6 31 c9 31 d2 44 89 f6 e8 64 bc 01 e1 85 db 75 02 <0f> 0b 31 c9 31 d2 44 89 f6 48 c7 c7 70 02 19 a0 e8 4b bc 01 e1 49 RSP: 0018:ffff8881003d39e8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffffffa01902a0 RBP: ffff8881040c6700 R08: 0000000000000000 R09: 0000000000000001 R10: 2ce93d0054fe0d00 R11: 54fe0d00000e0000 R12: ffff888109515000 R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000401 FS: 0000000000000000(0000) GS:ffff88822bf40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f42ba70f030 CR3: 0000000109926000 CR4: 00000000000006e0 Call Trace: br_handle_vlan+0xbc/0xca [bridge] __br_forward+0x23/0x164 [bridge] deliver_clone+0x41/0x48 [bridge] br_handle_frame_finish+0x36f/0x3aa [bridge] ? skb_dst+0x2e/0x38 [bridge] ? br_handle_ingress_vlan_tunnel+0x3e/0x1c8 [bridge] ? br_handle_frame_finish+0x3aa/0x3aa [bridge] br_handle_frame+0x2c3/0x377 [bridge] ? __skb_pull+0x33/0x51 ? vlan_do_receive+0x4f/0x36a ? br_handle_frame_finish+0x3aa/0x3aa [bridge] __netif_receive_skb_core+0x539/0x7c6 ? __list_del_entry_valid+0x16e/0x1c2 __netif_receive_skb_list_core+0x6d/0xd6 netif_receive_skb_list_internal+0x1d9/0x1fa gro_normal_list+0x22/0x3e dev_gro_receive+0x55b/0x600 ? detach_buf_split+0x58/0x140 napi_gro_receive+0x94/0x12e virtnet_poll+0x15d/0x315 [virtio_net] __napi_poll+0x2c/0x1c9 net_rx_action+0xe6/0x1fb __do_softirq+0x115/0x2d8 run_ksoftirqd+0x18/0x20 smpboot_thread_fn+0x183/0x19c ? smpboot_unregister_percpu_thread+0x66/0x66 kthread+0x10a/0x10f ? kthread_mod_delayed_work+0xb6/0xb6 ret_from_fork+0x22/0x30 ---[ end trace 49f61b07f775fd2b ]--- dst_release: dst:00000000c02d677a refcnt:-1 dst_release underflow Cc: stable@vger.kernel.org Fixes: 11538d039ac6 ("bridge: vlan dst_metadata hooks in ingress and egress paths") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_vlan_tunnel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index 4d5100677c68..adb6845ceba4 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -208,8 +208,8 @@ int br_handle_egress_vlan_tunnel(struct sk_buff *skb, return err; tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst); - if (tunnel_dst) - skb_dst_set(skb, dst_clone(&tunnel_dst->dst)); + if (tunnel_dst && dst_hold_safe(&tunnel_dst->dst)) + skb_dst_set(skb, &tunnel_dst->dst); return 0; } From 44a05a27ff5585dc1ff1b457675842f65be61246 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 15 Jun 2021 18:23:39 -0700 Subject: [PATCH 059/116] mm/slub.c: include swab.h commit 1b3865d016815cbd69a1879ca1c8a8901fda1072 upstream. Fixes build with CONFIG_SLAB_FREELIST_HARDENED=y. Hopefully. But it's the right thing to do anwyay. Fixes: 1ad53d9fa3f61 ("slub: improve bit diffusion for freelist ptr obfuscation") Link: https://bugzilla.kernel.org/show_bug.cgi?id=213417 Reported-by: Acked-by: Kees Cook Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/slub.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/slub.c b/mm/slub.c index a0cb3568b0b5..484a75296a12 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include "slab.h" From fcb34a99ed2a334292419881f65bf68aab6eb39c Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Wed, 16 Jun 2021 17:14:25 +0800 Subject: [PATCH 060/116] net: fec_ptp: add clock rate zero check commit cb3cefe3f3f8af27c6076ef7d1f00350f502055d upstream. Add clock rate zero check to fix coverity issue of "divide by 0". Fixes: commit 85bd1798b24a ("net: fec: fix spin_lock dead lock") Signed-off-by: Fugang Duan Signed-off-by: Joakim Zhang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/freescale/fec_ptp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c index e63df6455fba..40c5c09f60dc 100644 --- a/drivers/net/ethernet/freescale/fec_ptp.c +++ b/drivers/net/ethernet/freescale/fec_ptp.c @@ -586,6 +586,10 @@ void fec_ptp_init(struct platform_device *pdev) fep->ptp_caps.enable = fec_ptp_enable; fep->cycle_speed = clk_get_rate(fep->clk_ptp); + if (!fep->cycle_speed) { + fep->cycle_speed = NSEC_PER_SEC; + dev_err(&fep->pdev->dev, "clk_ptp clock rate is zero\n"); + } fep->ptp_inc = NSEC_PER_SEC / fep->cycle_speed; spin_lock_init(&fep->tmreg_lock); From ec38df244597b0b798b455aface83b84893e4939 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 5 Jun 2021 19:26:35 +0900 Subject: [PATCH 061/116] can: bcm/raw/isotp: use per module netdevice notifier commit 8d0caedb759683041d9db82069937525999ada53 upstream. syzbot is reporting hung task at register_netdevice_notifier() [1] and unregister_netdevice_notifier() [2], for cleanup_net() might perform time consuming operations while CAN driver's raw/bcm/isotp modules are calling {register,unregister}_netdevice_notifier() on each socket. Change raw/bcm/isotp modules to call register_netdevice_notifier() from module's __init function and call unregister_netdevice_notifier() from module's __exit function, as with gw/j1939 modules are doing. Link: https://syzkaller.appspot.com/bug?id=391b9498827788b3cc6830226d4ff5be87107c30 [1] Link: https://syzkaller.appspot.com/bug?id=1724d278c83ca6e6df100a2e320c10d991cf2bce [2] Link: https://lore.kernel.org/r/54a5f451-05ed-f977-8534-79e7aa2bcc8f@i-love.sakura.ne.jp Cc: linux-stable Reported-by: syzbot Reported-by: syzbot Reviewed-by: Kirill Tkhai Tested-by: syzbot Tested-by: Oliver Hartkopp Signed-off-by: Tetsuo Handa Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- net/can/bcm.c | 59 +++++++++++++++++++++++++++++++++++++----------- net/can/raw.c | 62 +++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 94 insertions(+), 27 deletions(-) diff --git a/net/can/bcm.c b/net/can/bcm.c index f691afcc5b8b..8c8b02e54432 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -125,7 +125,7 @@ struct bcm_sock { struct sock sk; int bound; int ifindex; - struct notifier_block notifier; + struct list_head notifier; struct list_head rx_ops; struct list_head tx_ops; unsigned long dropped_usr_msgs; @@ -133,6 +133,10 @@ struct bcm_sock { char procname [32]; /* inode number in decimal with \0 */ }; +static LIST_HEAD(bcm_notifier_list); +static DEFINE_SPINLOCK(bcm_notifier_lock); +static struct bcm_sock *bcm_busy_notifier; + static inline struct bcm_sock *bcm_sk(const struct sock *sk) { return (struct bcm_sock *)sk; @@ -1442,20 +1446,15 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) /* * notification handler for netdevice status changes */ -static int bcm_notifier(struct notifier_block *nb, unsigned long msg, - void *ptr) +static void bcm_notify(struct bcm_sock *bo, unsigned long msg, + struct net_device *dev) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct bcm_sock *bo = container_of(nb, struct bcm_sock, notifier); struct sock *sk = &bo->sk; struct bcm_op *op; int notify_enodev = 0; if (!net_eq(dev_net(dev), sock_net(sk))) - return NOTIFY_DONE; - - if (dev->type != ARPHRD_CAN) - return NOTIFY_DONE; + return; switch (msg) { @@ -1490,7 +1489,28 @@ static int bcm_notifier(struct notifier_block *nb, unsigned long msg, sk->sk_error_report(sk); } } +} +static int bcm_notifier(struct notifier_block *nb, unsigned long msg, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (dev->type != ARPHRD_CAN) + return NOTIFY_DONE; + if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) + return NOTIFY_DONE; + if (unlikely(bcm_busy_notifier)) /* Check for reentrant bug. */ + return NOTIFY_DONE; + + spin_lock(&bcm_notifier_lock); + list_for_each_entry(bcm_busy_notifier, &bcm_notifier_list, notifier) { + spin_unlock(&bcm_notifier_lock); + bcm_notify(bcm_busy_notifier, msg, dev); + spin_lock(&bcm_notifier_lock); + } + bcm_busy_notifier = NULL; + spin_unlock(&bcm_notifier_lock); return NOTIFY_DONE; } @@ -1510,9 +1530,9 @@ static int bcm_init(struct sock *sk) INIT_LIST_HEAD(&bo->rx_ops); /* set notifier */ - bo->notifier.notifier_call = bcm_notifier; - - register_netdevice_notifier(&bo->notifier); + spin_lock(&bcm_notifier_lock); + list_add_tail(&bo->notifier, &bcm_notifier_list); + spin_unlock(&bcm_notifier_lock); return 0; } @@ -1535,7 +1555,14 @@ static int bcm_release(struct socket *sock) /* remove bcm_ops, timer, rx_unregister(), etc. */ - unregister_netdevice_notifier(&bo->notifier); + spin_lock(&bcm_notifier_lock); + while (bcm_busy_notifier == bo) { + spin_unlock(&bcm_notifier_lock); + schedule_timeout_uninterruptible(1); + spin_lock(&bcm_notifier_lock); + } + list_del(&bo->notifier); + spin_unlock(&bcm_notifier_lock); lock_sock(sk); @@ -1750,6 +1777,10 @@ static struct pernet_operations canbcm_pernet_ops __read_mostly = { .exit = canbcm_pernet_exit, }; +static struct notifier_block canbcm_notifier = { + .notifier_call = bcm_notifier +}; + static int __init bcm_module_init(void) { int err; @@ -1763,12 +1794,14 @@ static int __init bcm_module_init(void) } register_pernet_subsys(&canbcm_pernet_ops); + register_netdevice_notifier(&canbcm_notifier); return 0; } static void __exit bcm_module_exit(void) { can_proto_unregister(&bcm_can_proto); + unregister_netdevice_notifier(&canbcm_notifier); unregister_pernet_subsys(&canbcm_pernet_ops); } diff --git a/net/can/raw.c b/net/can/raw.c index e1f26441b49a..24af08164b61 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -84,7 +84,7 @@ struct raw_sock { struct sock sk; int bound; int ifindex; - struct notifier_block notifier; + struct list_head notifier; int loopback; int recv_own_msgs; int fd_frames; @@ -96,6 +96,10 @@ struct raw_sock { struct uniqframe __percpu *uniq; }; +static LIST_HEAD(raw_notifier_list); +static DEFINE_SPINLOCK(raw_notifier_lock); +static struct raw_sock *raw_busy_notifier; + /* * Return pointer to store the extra msg flags for raw_recvmsg(). * We use the space of one unsigned int beyond the 'struct sockaddr_can' @@ -266,21 +270,16 @@ static int raw_enable_allfilters(struct net *net, struct net_device *dev, return err; } -static int raw_notifier(struct notifier_block *nb, - unsigned long msg, void *ptr) +static void raw_notify(struct raw_sock *ro, unsigned long msg, + struct net_device *dev) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct raw_sock *ro = container_of(nb, struct raw_sock, notifier); struct sock *sk = &ro->sk; if (!net_eq(dev_net(dev), sock_net(sk))) - return NOTIFY_DONE; - - if (dev->type != ARPHRD_CAN) - return NOTIFY_DONE; + return; if (ro->ifindex != dev->ifindex) - return NOTIFY_DONE; + return; switch (msg) { @@ -309,7 +308,28 @@ static int raw_notifier(struct notifier_block *nb, sk->sk_error_report(sk); break; } +} + +static int raw_notifier(struct notifier_block *nb, unsigned long msg, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (dev->type != ARPHRD_CAN) + return NOTIFY_DONE; + if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) + return NOTIFY_DONE; + if (unlikely(raw_busy_notifier)) /* Check for reentrant bug. */ + return NOTIFY_DONE; + spin_lock(&raw_notifier_lock); + list_for_each_entry(raw_busy_notifier, &raw_notifier_list, notifier) { + spin_unlock(&raw_notifier_lock); + raw_notify(raw_busy_notifier, msg, dev); + spin_lock(&raw_notifier_lock); + } + raw_busy_notifier = NULL; + spin_unlock(&raw_notifier_lock); return NOTIFY_DONE; } @@ -338,9 +358,9 @@ static int raw_init(struct sock *sk) return -ENOMEM; /* set notifier */ - ro->notifier.notifier_call = raw_notifier; - - register_netdevice_notifier(&ro->notifier); + spin_lock(&raw_notifier_lock); + list_add_tail(&ro->notifier, &raw_notifier_list); + spin_unlock(&raw_notifier_lock); return 0; } @@ -355,7 +375,14 @@ static int raw_release(struct socket *sock) ro = raw_sk(sk); - unregister_netdevice_notifier(&ro->notifier); + spin_lock(&raw_notifier_lock); + while (raw_busy_notifier == ro) { + spin_unlock(&raw_notifier_lock); + schedule_timeout_uninterruptible(1); + spin_lock(&raw_notifier_lock); + } + list_del(&ro->notifier); + spin_unlock(&raw_notifier_lock); lock_sock(sk); @@ -870,6 +897,10 @@ static const struct can_proto raw_can_proto = { .prot = &raw_proto, }; +static struct notifier_block canraw_notifier = { + .notifier_call = raw_notifier +}; + static __init int raw_module_init(void) { int err; @@ -879,6 +910,8 @@ static __init int raw_module_init(void) err = can_proto_register(&raw_can_proto); if (err < 0) printk(KERN_ERR "can: registration of raw protocol failed\n"); + else + register_netdevice_notifier(&canraw_notifier); return err; } @@ -886,6 +919,7 @@ static __init int raw_module_init(void) static __exit void raw_module_exit(void) { can_proto_unregister(&raw_can_proto); + unregister_netdevice_notifier(&canraw_notifier); } module_init(raw_module_init); From 3ba51ed2c3ac36aa947d0b250d318de6ed7cf552 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Mar 2021 14:53:37 -0700 Subject: [PATCH 062/116] inet: use bigger hash table for IP ID generation commit aa6dd211e4b1dde9d5dc25d699d35f789ae7eeba upstream. In commit 73f156a6e8c1 ("inetpeer: get rid of ip_id_count") I used a very small hash table that could be abused by patient attackers to reveal sensitive information. Switch to a dynamic sizing, depending on RAM size. Typical big hosts will now use 128x more storage (2 MB) to get a similar increase in security and reduction of hash collisions. As a bonus, use of alloc_large_system_hash() spreads allocated memory among all NUMA nodes. Fixes: 73f156a6e8c1 ("inetpeer: get rid of ip_id_count") Reported-by: Amit Klein Signed-off-by: Eric Dumazet Cc: Willy Tarreau Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/route.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 78d6bc61a1d8..81901b052907 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include #include @@ -485,8 +486,10 @@ static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); } -#define IP_IDENTS_SZ 2048u - +/* Hash tables of size 2048..262144 depending on RAM size. + * Each bucket uses 8 bytes. + */ +static u32 ip_idents_mask __read_mostly; static atomic_t *ip_idents __read_mostly; static u32 *ip_tstamps __read_mostly; @@ -496,12 +499,16 @@ static u32 *ip_tstamps __read_mostly; */ u32 ip_idents_reserve(u32 hash, int segs) { - u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; - atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; - u32 old = ACCESS_ONCE(*p_tstamp); - u32 now = (u32)jiffies; + u32 bucket, old, now = (u32)jiffies; + atomic_t *p_id; + u32 *p_tstamp; u32 delta = 0; + bucket = hash & ip_idents_mask; + p_tstamp = ip_tstamps + bucket; + p_id = ip_idents + bucket; + old = ACCESS_ONCE(*p_tstamp); + if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old); @@ -3098,18 +3105,26 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; int __init ip_rt_init(void) { + void *idents_hash; int rc = 0; int cpu; - ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); - if (!ip_idents) - panic("IP: failed to allocate ip_idents\n"); + /* For modern hosts, this will use 2 MB of memory */ + idents_hash = alloc_large_system_hash("IP idents", + sizeof(*ip_idents) + sizeof(*ip_tstamps), + 0, + 16, /* one bucket per 64 KB */ + HASH_ZERO, + NULL, + &ip_idents_mask, + 2048, + 256*1024); + + ip_idents = idents_hash; - prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents)); - ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); - if (!ip_tstamps) - panic("IP: failed to allocate ip_tstamps\n"); + ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents); for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); From 58b5e02c6ca0e2b7c87cd8023ff786ef3c0eef74 Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Tue, 8 Jun 2021 18:56:56 +0800 Subject: [PATCH 063/116] usb: dwc3: core: fix kernel panic when do reboot commit 4bf584a03eec674975ee9fe36c8583d9d470dab1 upstream. When do system reboot, it calls dwc3_shutdown and the whole debugfs for dwc3 has removed first, when the gadget tries to do deinit, and remove debugfs for its endpoints, it meets NULL pointer dereference issue when call debugfs_lookup. Fix it by removing the whole dwc3 debugfs later than dwc3_drd_exit. [ 2924.958838] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000002 .... [ 2925.030994] pstate: 60000005 (nZCv daif -PAN -UAO -TCO BTYPE=--) [ 2925.037005] pc : inode_permission+0x2c/0x198 [ 2925.041281] lr : lookup_one_len_common+0xb0/0xf8 [ 2925.045903] sp : ffff80001276ba70 [ 2925.049218] x29: ffff80001276ba70 x28: ffff0000c01f0000 x27: 0000000000000000 [ 2925.056364] x26: ffff800011791e70 x25: 0000000000000008 x24: dead000000000100 [ 2925.063510] x23: dead000000000122 x22: 0000000000000000 x21: 0000000000000001 [ 2925.070652] x20: ffff8000122c6188 x19: 0000000000000000 x18: 0000000000000000 [ 2925.077797] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000004 [ 2925.084943] x14: ffffffffffffffff x13: 0000000000000000 x12: 0000000000000030 [ 2925.092087] x11: 0101010101010101 x10: 7f7f7f7f7f7f7f7f x9 : ffff8000102b2420 [ 2925.099232] x8 : 7f7f7f7f7f7f7f7f x7 : feff73746e2f6f64 x6 : 0000000000008080 [ 2925.106378] x5 : 61c8864680b583eb x4 : 209e6ec2d263dbb7 x3 : 000074756f307065 [ 2925.113523] x2 : 0000000000000001 x1 : 0000000000000000 x0 : ffff8000122c6188 [ 2925.120671] Call trace: [ 2925.123119] inode_permission+0x2c/0x198 [ 2925.127042] lookup_one_len_common+0xb0/0xf8 [ 2925.131315] lookup_one_len_unlocked+0x34/0xb0 [ 2925.135764] lookup_positive_unlocked+0x14/0x50 [ 2925.140296] debugfs_lookup+0x68/0xa0 [ 2925.143964] dwc3_gadget_free_endpoints+0x84/0xb0 [ 2925.148675] dwc3_gadget_exit+0x28/0x78 [ 2925.152518] dwc3_drd_exit+0x100/0x1f8 [ 2925.156267] dwc3_remove+0x11c/0x120 [ 2925.159851] dwc3_shutdown+0x14/0x20 [ 2925.163432] platform_shutdown+0x28/0x38 [ 2925.167360] device_shutdown+0x15c/0x378 [ 2925.171291] kernel_restart_prepare+0x3c/0x48 [ 2925.175650] kernel_restart+0x1c/0x68 [ 2925.179316] __do_sys_reboot+0x218/0x240 [ 2925.183247] __arm64_sys_reboot+0x28/0x30 [ 2925.187262] invoke_syscall+0x48/0x100 [ 2925.191017] el0_svc_common.constprop.0+0x48/0xc8 [ 2925.195726] do_el0_svc+0x28/0x88 [ 2925.199045] el0_svc+0x20/0x30 [ 2925.202104] el0_sync_handler+0xa8/0xb0 [ 2925.205942] el0_sync+0x148/0x180 [ 2925.209270] Code: a9025bf5 2a0203f5 121f0056 370802b5 (79400660) [ 2925.215372] ---[ end trace 124254d8e485a58b ]--- [ 2925.220012] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b [ 2925.227676] Kernel Offset: disabled [ 2925.231164] CPU features: 0x00001001,20000846 [ 2925.235521] Memory Limit: none [ 2925.238580] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ]--- Fixes: 8d396bb0a5b6 ("usb: dwc3: debugfs: Add and remove endpoint dirs dynamically") Cc: Jack Pham Tested-by: Jack Pham Signed-off-by: Peter Chen Link: https://lore.kernel.org/r/20210608105656.10795-1-peter.chen@kernel.org (cherry picked from commit 2a042767814bd0edf2619f06fecd374e266ea068) Link: https://lore.kernel.org/r/20210615080847.GA10432@jackp-linux.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index f3fca4537d25..0e179274ba7c 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -1337,8 +1337,8 @@ static int dwc3_remove(struct platform_device *pdev) */ res->start -= DWC3_GLOBALS_REGS_START; - dwc3_debugfs_exit(dwc); dwc3_core_exit_mode(dwc); + dwc3_debugfs_exit(dwc); dwc3_core_exit(dwc); dwc3_ulpi_exit(dwc); From e83ca3bdb895e581a6c541515e9fd2cfb9e60ed4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 3 Apr 2018 00:22:29 -0400 Subject: [PATCH 064/116] kernfs: deal with kernfs_fill_super() failures commit 82382acec0c97b91830fff7130d0acce4ac4f3f3 upstream. make sure that info->node is initialized early, so that kernfs_kill_sb() can list_del() it safely. Signed-off-by: Al Viro Signed-off-by: Guilherme G. Piccoli Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/mount.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 5019058e0f6a..610267585f8f 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -320,6 +320,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, info->root = root; info->ns = ns; + INIT_LIST_HEAD(&info->node); sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags, &init_user_ns, info); From 68fed0725820073d37e487bfa7eeb73eb0e63478 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 14 May 2018 00:03:34 -0400 Subject: [PATCH 065/116] unfuck sysfs_mount() commit 7b745a4e4051e1bbce40e0b1c2cf636c70583aa4 upstream. new_sb is left uninitialized in case of early failures in kernfs_mount_ns(), and while IS_ERR(root) is true in all such cases, using IS_ERR(root) || !new_sb is not a solution - IS_ERR(root) is true in some cases when new_sb is true. Make sure new_sb is initialized (and matches the reality) in all cases and fix the condition for dropping kobj reference - we want it done precisely in those situations where the reference has not been transferred into a new super_block instance. Signed-off-by: Al Viro Signed-off-by: Guilherme G. Piccoli Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/mount.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 20b8f82e115b..2bbe84d9c0a8 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -28,7 +28,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, { struct dentry *root; void *ns; - bool new_sb; + bool new_sb = false; if (!(flags & MS_KERNMOUNT)) { if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) @@ -38,9 +38,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); root = kernfs_mount_ns(fs_type, flags, sysfs_root, SYSFS_MAGIC, &new_sb, ns); - if (IS_ERR(root) || !new_sb) + if (!new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); - else if (new_sb) + else if (!IS_ERR(root)) root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; return root; From 3267c5454e54c9678ebe0c483fdd56ee1b3ae9af Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Jun 2021 21:18:00 +0200 Subject: [PATCH 066/116] x86/fpu: Reset state for all signal restore failures commit efa165504943f2128d50f63de0c02faf6dcceb0d upstream. If access_ok() or fpregs_soft_set() fails in __fpu__restore_sig() then the function just returns but does not clear the FPU state as it does for all other fatal failures. Clear the FPU state for these failures as well. Fixes: 72a671ced66d ("x86, fpu: Unify signal handling code paths for x86 and x86_64 kernels") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/87mtryyhhz.ffs@nanos.tec.linutronix.de Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/fpu/signal.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index d99a8ee9e185..86a231338bbf 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -272,6 +272,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) int state_size = fpu_kernel_xstate_size; u64 xfeatures = 0; int fx_only = 0; + int ret = 0; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -281,15 +282,21 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) return 0; } - if (!access_ok(VERIFY_READ, buf, size)) - return -EACCES; + if (!access_ok(VERIFY_READ, buf, size)) { + ret = -EACCES; + goto out_err; + } fpu__initialize(fpu); - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_set(current, NULL, - 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) != 0; + if (!static_cpu_has(X86_FEATURE_FPU)) { + ret = fpregs_soft_set(current, NULL, + 0, sizeof(struct user_i387_ia32_struct), + NULL, buf) != 0; + if (ret) + goto out_err; + return 0; + } if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; @@ -349,6 +356,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpu__restore(fpu); local_bh_enable(); + /* Failure is already handled */ return err; } else { /* @@ -356,13 +364,14 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) * state to the registers directly (with exceptions handled). */ user_fpu_begin(); - if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) { - fpu__clear(fpu); - return -1; - } + if (!copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) + return 0; + ret = -1; } - return 0; +out_err: + fpu__clear(fpu); + return ret; } static inline int xstate_sigframe_size(void) From 94fc53e655e465787d104ef7a4bd8970f7cd8bec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 21 Jun 2021 13:36:35 +0200 Subject: [PATCH 067/116] drm/nouveau: wait for moving fence after pinning v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 17b11f71795abdce46f62a808f906857e525cea8 upstream. We actually need to wait for the moving fence after pinning the BO to make sure that the pin is completed. v2: grab the lock while waiting Signed-off-by: Christian König Reviewed-by: Daniel Vetter References: https://lore.kernel.org/dri-devel/20210621151758.2347474-1-daniel.vetter@ffwll.ch/ CC: stable@kernel.org Link: https://patchwork.freedesktop.org/patch/msgid/20210622114506.106349-1-christian.koenig@amd.com Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/nouveau/nouveau_prime.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_prime.c b/drivers/gpu/drm/nouveau/nouveau_prime.c index 1fefc93af1d7..bbfce7b9d03e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_prime.c +++ b/drivers/gpu/drm/nouveau/nouveau_prime.c @@ -98,7 +98,22 @@ int nouveau_gem_prime_pin(struct drm_gem_object *obj) if (ret) return -EINVAL; - return 0; + ret = ttm_bo_reserve(&nvbo->bo, false, false, NULL); + if (ret) + goto error; + + if (nvbo->bo.moving) + ret = dma_fence_wait(nvbo->bo.moving, true); + + ttm_bo_unreserve(&nvbo->bo); + if (ret) + goto error; + + return ret; + +error: + nouveau_bo_unpin(nvbo); + return ret; } void nouveau_gem_prime_unpin(struct drm_gem_object *obj) From 206c3f8c6996861e645b846a3d2f3b8b34838c71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 21 Jun 2021 13:43:05 +0200 Subject: [PATCH 068/116] drm/radeon: wait for moving fence after pinning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4b41726aae563273bb4b4a9462ba51ce4d372f78 upstream. We actually need to wait for the moving fence after pinning the BO to make sure that the pin is completed. Signed-off-by: Christian König Reviewed-by: Daniel Vetter References: https://lore.kernel.org/dri-devel/20210621151758.2347474-1-daniel.vetter@ffwll.ch/ CC: stable@kernel.org Link: https://patchwork.freedesktop.org/patch/msgid/20210622114506.106349-2-christian.koenig@amd.com Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/radeon_prime.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_prime.c b/drivers/gpu/drm/radeon/radeon_prime.c index 7110d403322c..c138e07f51a3 100644 --- a/drivers/gpu/drm/radeon/radeon_prime.c +++ b/drivers/gpu/drm/radeon/radeon_prime.c @@ -92,9 +92,19 @@ int radeon_gem_prime_pin(struct drm_gem_object *obj) /* pin buffer into GTT */ ret = radeon_bo_pin(bo, RADEON_GEM_DOMAIN_GTT, NULL); - if (likely(ret == 0)) - bo->prime_shared_count++; - + if (unlikely(ret)) + goto error; + + if (bo->tbo.moving) { + ret = dma_fence_wait(bo->tbo.moving, false); + if (unlikely(ret)) { + radeon_bo_unpin(bo); + goto error; + } + } + + bo->prime_shared_count++; +error: radeon_bo_unreserve(bo); return ret; } From af115fa3d6fc530f10f3045d889ee4488c756675 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 14 May 2021 11:26:37 +0100 Subject: [PATCH 069/116] ARM: 9081/1: fix gcc-10 thumb2-kernel regression commit dad7b9896a5dbac5da8275d5a6147c65c81fb5f2 upstream. When building the kernel wtih gcc-10 or higher using the CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y flag, the compiler picks a slightly different set of registers for the inline assembly in cpu_init() that subsequently results in a corrupt kernel stack as well as remaining in FIQ mode. If a banked register is used for the last argument, the wrong version of that register gets loaded into CPSR_c. When building in Arm mode, the arguments are passed as immediate values and the bug cannot happen. This got introduced when Daniel reworked the FIQ handling and was technically always broken, but happened to work with both clang and gcc before gcc-10 as long as they picked one of the lower registers. This is probably an indication that still very few people build the kernel in Thumb2 mode. Marek pointed out the problem on IRC, Arnd narrowed it down to this inline assembly and Russell pinpointed the exact bug. Change the constraints to force the final mode switch to use a non-banked register for the argument to ensure that the correct constant gets loaded. Another alternative would be to always use registers for the constant arguments to avoid the #ifdef that has now become more complex. Cc: # v3.18+ Cc: Daniel Thompson Reported-by: Marek Vasut Acked-by: Ard Biesheuvel Fixes: c0e7f7ee717e ("ARM: 8150/3: fiq: Replace default FIQ handler") Signed-off-by: Arnd Bergmann Signed-off-by: Russell King Signed-off-by: Greg Kroah-Hartman --- arch/arm/kernel/setup.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index a6d27284105a..ac4ffd97ae82 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -547,9 +547,11 @@ void notrace cpu_init(void) * In Thumb-2, msr with an immediate value is not allowed. */ #ifdef CONFIG_THUMB2_KERNEL -#define PLC "r" +#define PLC_l "l" +#define PLC_r "r" #else -#define PLC "I" +#define PLC_l "I" +#define PLC_r "I" #endif /* @@ -571,15 +573,15 @@ void notrace cpu_init(void) "msr cpsr_c, %9" : : "r" (stk), - PLC (PSR_F_BIT | PSR_I_BIT | IRQ_MODE), + PLC_r (PSR_F_BIT | PSR_I_BIT | IRQ_MODE), "I" (offsetof(struct stack, irq[0])), - PLC (PSR_F_BIT | PSR_I_BIT | ABT_MODE), + PLC_r (PSR_F_BIT | PSR_I_BIT | ABT_MODE), "I" (offsetof(struct stack, abt[0])), - PLC (PSR_F_BIT | PSR_I_BIT | UND_MODE), + PLC_r (PSR_F_BIT | PSR_I_BIT | UND_MODE), "I" (offsetof(struct stack, und[0])), - PLC (PSR_F_BIT | PSR_I_BIT | FIQ_MODE), + PLC_r (PSR_F_BIT | PSR_I_BIT | FIQ_MODE), "I" (offsetof(struct stack, fiq[0])), - PLC (PSR_F_BIT | PSR_I_BIT | SVC_MODE) + PLC_l (PSR_F_BIT | PSR_I_BIT | SVC_MODE) : "r14"); #endif } From 8e64fac312b258681988d0330cad911fa27c980f Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 28 Apr 2021 18:23:50 -0700 Subject: [PATCH 070/116] Makefile: Move -Wno-unused-but-set-variable out of GCC only block commit 885480b084696331bea61a4f7eba10652999a9c1 upstream. Currently, -Wunused-but-set-variable is only supported by GCC so it is disabled unconditionally in a GCC only block (it is enabled with W=1). clang currently has its implementation for this warning in review so preemptively move this statement out of the GCC only block and wrap it with cc-disable-warning so that both compilers function the same. Cc: stable@vger.kernel.org Link: https://reviews.llvm.org/D100581 Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Tested-by: Nick Desaulniers Signed-off-by: Masahiro Yamada [nc: Backport, workaround lack of e2079e93f562 in older branches] Signed-off-by: Nathan Chancellor Signed-off-by: Greg Kroah-Hartman --- Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile b/Makefile index bc9833fdca1a..e2977b8f8e6a 100644 --- a/Makefile +++ b/Makefile @@ -716,12 +716,11 @@ KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare) # See modpost pattern 2 KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,) KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior) -else +endif # These warnings generated too much noise in a regular build. # Use make W=1 to enable them (see scripts/Makefile.extrawarn) KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable) -endif KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable) ifdef CONFIG_FRAME_POINTER From f43d0bcb387fdd87c516bd58ce4fd01c0f689035 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 9 Apr 2021 12:21:28 -0700 Subject: [PATCH 071/116] MIPS: generic: Update node names to avoid unit addresses commit e607ff630c6053ecc67502677c0e50053d7892d4 upstream. With the latest mkimage from U-Boot 2021.04, the generic defconfigs no longer build, failing with: /usr/bin/mkimage: verify_header failed for FIT Image support with exit code 1 This is expected after the linked U-Boot commits because '@' is forbidden in the node names due to the way that libfdt treats nodes with the same prefix but different unit addresses. Switch the '@' in the node name to '-'. Drop the unit addresses from the hash and kernel child nodes because there is only one node so they do not need to have a number to differentiate them. Cc: stable@vger.kernel.org Link: https://source.denx.de/u-boot/u-boot/-/commit/79af75f7776fc20b0d7eb6afe1e27c00fdb4b9b4 Link: https://source.denx.de/u-boot/u-boot/-/commit/3f04db891a353f4b127ed57279279f851c6b4917 Suggested-by: Simon Glass Signed-off-by: Nathan Chancellor Reviewed-by: Tom Rini Signed-off-by: Thomas Bogendoerfer [nathan: Backport to 4.14, only apply to .its.S files that exist] Signed-off-by: Nathan Chancellor Signed-off-by: Greg Kroah-Hartman --- arch/mips/generic/board-boston.its.S | 10 +++++----- arch/mips/generic/board-ni169445.its.S | 10 +++++----- arch/mips/generic/vmlinux.its.S | 10 +++++----- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/mips/generic/board-boston.its.S b/arch/mips/generic/board-boston.its.S index a7f51f97b910..c45ad2759421 100644 --- a/arch/mips/generic/board-boston.its.S +++ b/arch/mips/generic/board-boston.its.S @@ -1,22 +1,22 @@ / { images { - fdt@boston { + fdt-boston { description = "img,boston Device Tree"; data = /incbin/("boot/dts/img/boston.dtb"); type = "flat_dt"; arch = "mips"; compression = "none"; - hash@0 { + hash { algo = "sha1"; }; }; }; configurations { - conf@boston { + conf-boston { description = "Boston Linux kernel"; - kernel = "kernel@0"; - fdt = "fdt@boston"; + kernel = "kernel"; + fdt = "fdt-boston"; }; }; }; diff --git a/arch/mips/generic/board-ni169445.its.S b/arch/mips/generic/board-ni169445.its.S index e4cb4f95a8cc..0a2e8f7a8526 100644 --- a/arch/mips/generic/board-ni169445.its.S +++ b/arch/mips/generic/board-ni169445.its.S @@ -1,22 +1,22 @@ / { images { - fdt@ni169445 { + fdt-ni169445 { description = "NI 169445 device tree"; data = /incbin/("boot/dts/ni/169445.dtb"); type = "flat_dt"; arch = "mips"; compression = "none"; - hash@0 { + hash { algo = "sha1"; }; }; }; configurations { - conf@ni169445 { + conf-ni169445 { description = "NI 169445 Linux Kernel"; - kernel = "kernel@0"; - fdt = "fdt@ni169445"; + kernel = "kernel"; + fdt = "fdt-ni169445"; }; }; }; diff --git a/arch/mips/generic/vmlinux.its.S b/arch/mips/generic/vmlinux.its.S index 1a08438fd893..3e254676540f 100644 --- a/arch/mips/generic/vmlinux.its.S +++ b/arch/mips/generic/vmlinux.its.S @@ -6,7 +6,7 @@ #address-cells = ; images { - kernel@0 { + kernel { description = KERNEL_NAME; data = /incbin/(VMLINUX_BINARY); type = "kernel"; @@ -15,18 +15,18 @@ compression = VMLINUX_COMPRESSION; load = /bits/ ADDR_BITS ; entry = /bits/ ADDR_BITS ; - hash@0 { + hash { algo = "sha1"; }; }; }; configurations { - default = "conf@default"; + default = "conf-default"; - conf@default { + conf-default { description = "Generic Linux kernel"; - kernel = "kernel@0"; + kernel = "kernel"; }; }; }; From d4f08316ad339b7c3b0fd55c2947f4c320ee32d0 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 10 Jul 2018 09:58:03 +0100 Subject: [PATCH 072/116] arm64: perf: Disable PMU while processing counter overflows commit 3cce50dfec4a5b0414c974190940f47dd32c6dee upstream. The arm64 PMU updates the event counters and reprograms the counters in the overflow IRQ handler without disabling the PMU. This could potentially cause skews in for group counters, where the overflowed counters may potentially loose some event counts, while they are reprogrammed. To prevent this, disable the PMU while we process the counter overflows and enable it right back when we are done. This patch also moves the PMU stop/start routines to avoid a forward declaration. Suggested-by: Mark Rutland Cc: Will Deacon Acked-by: Mark Rutland Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon Signed-off-by: Aman Priyadarshi Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/perf_event.c | 50 +++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 53df84b2a07f..4ee1228d29eb 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -670,6 +670,28 @@ static void armv8pmu_disable_event(struct perf_event *event) raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } +static void armv8pmu_start(struct arm_pmu *cpu_pmu) +{ + unsigned long flags; + struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); + + raw_spin_lock_irqsave(&events->pmu_lock, flags); + /* Enable all counters */ + armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E); + raw_spin_unlock_irqrestore(&events->pmu_lock, flags); +} + +static void armv8pmu_stop(struct arm_pmu *cpu_pmu) +{ + unsigned long flags; + struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); + + raw_spin_lock_irqsave(&events->pmu_lock, flags); + /* Disable all counters */ + armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E); + raw_spin_unlock_irqrestore(&events->pmu_lock, flags); +} + static irqreturn_t armv8pmu_handle_irq(int irq_num, void *dev) { u32 pmovsr; @@ -695,6 +717,11 @@ static irqreturn_t armv8pmu_handle_irq(int irq_num, void *dev) */ regs = get_irq_regs(); + /* + * Stop the PMU while processing the counter overflows + * to prevent skews in group events. + */ + armv8pmu_stop(cpu_pmu); for (idx = 0; idx < cpu_pmu->num_events; ++idx) { struct perf_event *event = cpuc->events[idx]; struct hw_perf_event *hwc; @@ -719,6 +746,7 @@ static irqreturn_t armv8pmu_handle_irq(int irq_num, void *dev) if (perf_event_overflow(event, &data, regs)) cpu_pmu->disable(event); } + armv8pmu_start(cpu_pmu); /* * Handle the pending perf events. @@ -732,28 +760,6 @@ static irqreturn_t armv8pmu_handle_irq(int irq_num, void *dev) return IRQ_HANDLED; } -static void armv8pmu_start(struct arm_pmu *cpu_pmu) -{ - unsigned long flags; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - raw_spin_lock_irqsave(&events->pmu_lock, flags); - /* Enable all counters */ - armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); -} - -static void armv8pmu_stop(struct arm_pmu *cpu_pmu) -{ - unsigned long flags; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - raw_spin_lock_irqsave(&events->pmu_lock, flags); - /* Disable all counters */ - armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); -} - static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, struct perf_event *event) { From e9695e4d26d210b81c9c8a11f6983c1190a3a9cc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 22 Jun 2021 17:35:18 +0200 Subject: [PATCH 073/116] Revert "PCI: PM: Do not read power state in pci_enable_device_flags()" [ Upstream commit 4d6035f9bf4ea12776322746a216e856dfe46698 ] Revert commit 4514d991d992 ("PCI: PM: Do not read power state in pci_enable_device_flags()") that is reported to cause PCI device initialization issues on some systems. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=213481 Link: https://lore.kernel.org/linux-acpi/YNDoGICcg0V8HhpQ@eldamar.lan Reported-by: Michael Reported-by: Salvatore Bonaccorso Fixes: 4514d991d992 ("PCI: PM: Do not read power state in pci_enable_device_flags()") Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- drivers/pci/pci.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 1993e5e28ea7..c847b5554db6 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1378,11 +1378,21 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags) int err; int i, bars = 0; - if (atomic_inc_return(&dev->enable_cnt) > 1) { - pci_update_current_state(dev, dev->current_state); - return 0; /* already enabled */ + /* + * Power state could be unknown at this point, either due to a fresh + * boot or a device removal call. So get the current power state + * so that things like MSI message writing will behave as expected + * (e.g. if the device really is in D0 at enable time). + */ + if (dev->pm_cap) { + u16 pmcsr; + pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); + dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK); } + if (atomic_inc_return(&dev->enable_cnt) > 1) + return 0; /* already enabled */ + bridge = pci_upstream_bridge(dev); if (bridge) pci_enable_bridge(bridge); From a7abf2f48669fafe7324bf0ad19bd0d9c88d0ce1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 17 May 2021 16:47:17 +0200 Subject: [PATCH 074/116] mac80211: remove warning in ieee80211_get_sband() [ Upstream commit 0ee4d55534f82a0624701d0bb9fc2304d4529086 ] Syzbot reports that it's possible to hit this from userspace, by trying to add a station before any other connection setup has been done. Instead of trying to catch this in some other way simply remove the warning, that will appropriately reject the call from userspace. Reported-by: syzbot+7716dbc401d9a437890d@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20210517164715.f537da276d17.Id05f40ec8761d6a8cc2df87f1aa09c651988a586@changeid Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- net/mac80211/ieee80211_i.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 790c771e8108..0d4f7258b243 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1393,7 +1393,7 @@ ieee80211_get_sband(struct ieee80211_sub_if_data *sdata) rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); - if (WARN_ON_ONCE(!chanctx_conf)) { + if (!chanctx_conf) { rcu_read_unlock(); return NULL; } From cb44de1ae03374b9d5c5b87247cef35f0eefdc04 Mon Sep 17 00:00:00 2001 From: Du Cheng Date: Wed, 28 Apr 2021 14:39:41 +0800 Subject: [PATCH 075/116] cfg80211: call cfg80211_leave_ocb when switching away from OCB [ Upstream commit a64b6a25dd9f984ed05fade603a00e2eae787d2f ] If the userland switches back-and-forth between NL80211_IFTYPE_OCB and NL80211_IFTYPE_ADHOC via send_msg(NL80211_CMD_SET_INTERFACE), there is a chance where the cleanup cfg80211_leave_ocb() is not called. This leads to initialization of in-use memory (e.g. init u.ibss while in-use by u.ocb) due to a shared struct/union within ieee80211_sub_if_data: struct ieee80211_sub_if_data { ... union { struct ieee80211_if_ap ap; struct ieee80211_if_vlan vlan; struct ieee80211_if_managed mgd; struct ieee80211_if_ibss ibss; // <- shares address struct ieee80211_if_mesh mesh; struct ieee80211_if_ocb ocb; // <- shares address struct ieee80211_if_mntr mntr; struct ieee80211_if_nan nan; } u; ... } Therefore add handling of otype == NL80211_IFTYPE_OCB, during cfg80211_change_iface() to perform cleanup when leaving OCB mode. link to syzkaller bug: https://syzkaller.appspot.com/bug?id=0612dbfa595bf4b9b680ff7b4948257b8e3732d5 Reported-by: syzbot+105896fac213f26056f9@syzkaller.appspotmail.com Signed-off-by: Du Cheng Link: https://lore.kernel.org/r/20210428063941.105161-1-ducheng2@gmail.com Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- net/wireless/util.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/wireless/util.c b/net/wireless/util.c index b3895a8a48ab..bf4dd297a4db 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1041,6 +1041,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, case NL80211_IFTYPE_MESH_POINT: /* mesh should be handled? */ break; + case NL80211_IFTYPE_OCB: + cfg80211_leave_ocb(rdev, dev); + break; default: break; } From a1440a9ab87ae6d38a25b740966cc8eb55b6076e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 9 Jun 2021 16:13:06 +0200 Subject: [PATCH 076/116] mac80211: drop multicast fragments [ Upstream commit a9799541ca34652d9996e45f80e8e03144c12949 ] These are not permitted by the spec, just drop them. Link: https://lore.kernel.org/r/20210609161305.23def022b750.Ibd6dd3cdce573dae262fcdc47f8ac52b883a9c50@changeid Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- net/mac80211/rx.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 6b4fd56800f7..ac2c52709e1c 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2014,17 +2014,15 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) sc = le16_to_cpu(hdr->seq_ctrl); frag = sc & IEEE80211_SCTL_FRAG; - if (is_multicast_ether_addr(hdr->addr1)) { - I802_DEBUG_INC(rx->local->dot11MulticastReceivedFrameCount); - goto out_no_led; - } - if (rx->sta) cache = &rx->sta->frags; if (likely(!ieee80211_has_morefrags(fc) && frag == 0)) goto out; + if (is_multicast_ether_addr(hdr->addr1)) + return RX_DROP_MONITOR; + I802_DEBUG_INC(rx->local->rx_handlers_fragments); if (skb_linearize(rx->skb)) @@ -2150,7 +2148,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) out: ieee80211_led_rx(rx->local); - out_no_led: if (rx->sta) rx->sta->rx_stats.packets++; return RX_CONTINUE; From e1df54a22b5ced83a6fa49cc05d2912e6607a720 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Thu, 10 Jun 2021 09:41:36 +0800 Subject: [PATCH 077/116] ping: Check return value of function 'ping_queue_rcv_skb' [ Upstream commit 9d44fa3e50cc91691896934d106c86e4027e61ca ] Function 'ping_queue_rcv_skb' not always return success, which will also return fail. If not check the wrong return value of it, lead to function `ping_rcv` return success. Signed-off-by: Zheng Yongjun Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv4/ping.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 186fdf0922d2..aab141c4a389 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -978,6 +978,7 @@ bool ping_rcv(struct sk_buff *skb) struct sock *sk; struct net *net = dev_net(skb->dev); struct icmphdr *icmph = icmp_hdr(skb); + bool rc = false; /* We assume the packet has already been checked by icmp_rcv */ @@ -992,14 +993,15 @@ bool ping_rcv(struct sk_buff *skb) struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); pr_debug("rcv on socket %p\n", sk); - if (skb2) - ping_queue_rcv_skb(sk, skb2); + if (skb2 && !ping_queue_rcv_skb(sk, skb2)) + rc = true; sock_put(sk); - return true; } - pr_debug("no socket, dropping\n"); - return false; + if (!rc) + pr_debug("no socket, dropping\n"); + + return rc; } EXPORT_SYMBOL_GPL(ping_rcv); From c5c9b1fd9b9e1767a691aca98f2b125e8b9d5162 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Jun 2021 07:44:11 -0700 Subject: [PATCH 078/116] inet: annotate date races around sk->sk_txhash [ Upstream commit b71eaed8c04f72a919a9c44e83e4ee254e69e7f3 ] UDP sendmsg() path can be lockless, it is possible for another thread to re-connect an change sk->sk_txhash under us. There is no serious impact, but we can use READ_ONCE()/WRITE_ONCE() pair to document the race. BUG: KCSAN: data-race in __ip4_datagram_connect / skb_set_owner_w write to 0xffff88813397920c of 4 bytes by task 30997 on cpu 1: sk_set_txhash include/net/sock.h:1937 [inline] __ip4_datagram_connect+0x69e/0x710 net/ipv4/datagram.c:75 __ip6_datagram_connect+0x551/0x840 net/ipv6/datagram.c:189 ip6_datagram_connect+0x2a/0x40 net/ipv6/datagram.c:272 inet_dgram_connect+0xfd/0x180 net/ipv4/af_inet.c:580 __sys_connect_file net/socket.c:1837 [inline] __sys_connect+0x245/0x280 net/socket.c:1854 __do_sys_connect net/socket.c:1864 [inline] __se_sys_connect net/socket.c:1861 [inline] __x64_sys_connect+0x3d/0x50 net/socket.c:1861 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff88813397920c of 4 bytes by task 31039 on cpu 0: skb_set_hash_from_sk include/net/sock.h:2211 [inline] skb_set_owner_w+0x118/0x220 net/core/sock.c:2101 sock_alloc_send_pskb+0x452/0x4e0 net/core/sock.c:2359 sock_alloc_send_skb+0x2d/0x40 net/core/sock.c:2373 __ip6_append_data+0x1743/0x21a0 net/ipv6/ip6_output.c:1621 ip6_make_skb+0x258/0x420 net/ipv6/ip6_output.c:1983 udpv6_sendmsg+0x160a/0x16b0 net/ipv6/udp.c:1527 inet6_sendmsg+0x5f/0x80 net/ipv6/af_inet6.c:642 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg net/socket.c:674 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2350 ___sys_sendmsg net/socket.c:2404 [inline] __sys_sendmmsg+0x315/0x4b0 net/socket.c:2490 __do_sys_sendmmsg net/socket.c:2519 [inline] __se_sys_sendmmsg net/socket.c:2516 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2516 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0xbca3c43d -> 0xfdb309e0 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 31039 Comm: syz-executor.2 Not tainted 5.13.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/sock.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 55d16db84ea4..70fe85bee4e5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1744,7 +1744,8 @@ static inline u32 net_tx_rndhash(void) static inline void sk_set_txhash(struct sock *sk) { - sk->sk_txhash = net_tx_rndhash(); + /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */ + WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); } static inline void sk_rethink_txhash(struct sock *sk) @@ -2018,9 +2019,12 @@ static inline void sock_poll_wait(struct file *filp, static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk) { - if (sk->sk_txhash) { + /* This pairs with WRITE_ONCE() in sk_set_txhash() */ + u32 txhash = READ_ONCE(sk->sk_txhash); + + if (txhash) { skb->l4_hash = 1; - skb->hash = sk->sk_txhash; + skb->hash = txhash; } } From 8afbbbbca1f02f925f128f7d7801dbfc84eaaf14 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Sat, 12 Jun 2021 17:51:22 +0300 Subject: [PATCH 079/116] net: caif: fix memory leak in ldisc_open [ Upstream commit 58af3d3d54e87bfc1f936e16c04ade3369d34011 ] Syzbot reported memory leak in tty_init_dev(). The problem was in unputted tty in ldisc_open() static int ldisc_open(struct tty_struct *tty) { ... ser->tty = tty_kref_get(tty); ... result = register_netdevice(dev); if (result) { rtnl_unlock(); free_netdev(dev); return -ENODEV; } ... } Ser pointer is netdev private_data, so after free_netdev() this pointer goes away with unputted tty reference. So, fix it by adding tty_kref_put() before freeing netdev. Reported-and-tested-by: syzbot+f303e045423e617d2cad@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/caif/caif_serial.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c index ce76ed50a1a2..1516d621e040 100644 --- a/drivers/net/caif/caif_serial.c +++ b/drivers/net/caif/caif_serial.c @@ -360,6 +360,7 @@ static int ldisc_open(struct tty_struct *tty) rtnl_lock(); result = register_netdevice(dev); if (result) { + tty_kref_put(tty); rtnl_unlock(); free_netdev(dev); return -ENODEV; From 3d419f6ded7f726cf337a3e383481d8e74783556 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Jun 2021 06:42:01 -0700 Subject: [PATCH 080/116] net/packet: annotate accesses to po->bind [ Upstream commit c7d2ef5dd4b03ed0ee1d13bc0c55f9cf62d49bd6 ] tpacket_snd(), packet_snd(), packet_getname() and packet_seq_show() can read po->num without holding a lock. This means other threads can change po->num at the same time. KCSAN complained about this known fact [1] Add READ_ONCE()/WRITE_ONCE() to address the issue. [1] BUG: KCSAN: data-race in packet_do_bind / packet_sendmsg write to 0xffff888131a0dcc0 of 2 bytes by task 24714 on cpu 0: packet_do_bind+0x3ab/0x7e0 net/packet/af_packet.c:3181 packet_bind+0xc3/0xd0 net/packet/af_packet.c:3255 __sys_bind+0x200/0x290 net/socket.c:1637 __do_sys_bind net/socket.c:1648 [inline] __se_sys_bind net/socket.c:1646 [inline] __x64_sys_bind+0x3d/0x50 net/socket.c:1646 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff888131a0dcc0 of 2 bytes by task 24719 on cpu 1: packet_snd net/packet/af_packet.c:2899 [inline] packet_sendmsg+0x317/0x3570 net/packet/af_packet.c:3040 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg net/socket.c:674 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2350 ___sys_sendmsg net/socket.c:2404 [inline] __sys_sendmsg+0x1ed/0x270 net/socket.c:2433 __do_sys_sendmsg net/socket.c:2442 [inline] __se_sys_sendmsg net/socket.c:2440 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2440 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000 -> 0x1200 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 24719 Comm: syz-executor.5 Not tainted 5.13.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/packet/af_packet.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b62ec43ed54f..6f55942619d4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2694,7 +2694,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) } if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); - proto = po->num; + proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) @@ -2907,7 +2907,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); - proto = po->num; + proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) @@ -3177,7 +3177,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, /* prevents packet_notifier() from calling * register_prot_hook() */ - po->num = 0; + WRITE_ONCE(po->num, 0); __unregister_prot_hook(sk, true); rcu_read_lock(); dev_curr = po->prot_hook.dev; @@ -3187,7 +3187,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, } BUG_ON(po->running); - po->num = proto; + WRITE_ONCE(po->num, proto); po->prot_hook.type = proto; if (unlikely(unlisted)) { @@ -3534,7 +3534,7 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr, sll->sll_family = AF_PACKET; sll->sll_ifindex = po->ifindex; - sll->sll_protocol = po->num; + sll->sll_protocol = READ_ONCE(po->num); sll->sll_pkttype = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex); @@ -4429,7 +4429,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, was_running = po->running; num = po->num; if (was_running) { - po->num = 0; + WRITE_ONCE(po->num, 0); __unregister_prot_hook(sk, false); } spin_unlock(&po->bind_lock); @@ -4464,7 +4464,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, spin_lock(&po->bind_lock); if (was_running) { - po->num = num; + WRITE_ONCE(po->num, num); register_prot_hook(sk); } spin_unlock(&po->bind_lock); @@ -4635,7 +4635,7 @@ static int packet_seq_show(struct seq_file *seq, void *v) s, refcount_read(&s->sk_refcnt), s->sk_type, - ntohs(po->num), + ntohs(READ_ONCE(po->num)), po->ifindex, po->running, atomic_read(&s->sk_rmem_alloc), From f0cc4253846222c30aed8b992bd9b59c0ab00871 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Jun 2021 06:42:02 -0700 Subject: [PATCH 081/116] net/packet: annotate accesses to po->ifindex [ Upstream commit e032f7c9c7cefffcfb79b9fc16c53011d2d9d11f ] Like prior patch, we need to annotate lockless accesses to po->ifindex For instance, packet_getname() is reading po->ifindex (twice) while another thread is able to change po->ifindex. KCSAN reported: BUG: KCSAN: data-race in packet_do_bind / packet_getname write to 0xffff888143ce3cbc of 4 bytes by task 25573 on cpu 1: packet_do_bind+0x420/0x7e0 net/packet/af_packet.c:3191 packet_bind+0xc3/0xd0 net/packet/af_packet.c:3255 __sys_bind+0x200/0x290 net/socket.c:1637 __do_sys_bind net/socket.c:1648 [inline] __se_sys_bind net/socket.c:1646 [inline] __x64_sys_bind+0x3d/0x50 net/socket.c:1646 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff888143ce3cbc of 4 bytes by task 25578 on cpu 0: packet_getname+0x5b/0x1a0 net/packet/af_packet.c:3525 __sys_getsockname+0x10e/0x1a0 net/socket.c:1887 __do_sys_getsockname net/socket.c:1902 [inline] __se_sys_getsockname net/socket.c:1899 [inline] __x64_sys_getsockname+0x3e/0x50 net/socket.c:1899 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000000 -> 0x00000001 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 25578 Comm: syz-executor.5 Not tainted 5.13.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/packet/af_packet.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 6f55942619d4..50ca70b3c175 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3193,11 +3193,11 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, if (unlikely(unlisted)) { dev_put(dev); po->prot_hook.dev = NULL; - po->ifindex = -1; + WRITE_ONCE(po->ifindex, -1); packet_cached_dev_reset(po); } else { po->prot_hook.dev = dev; - po->ifindex = dev ? dev->ifindex : 0; + WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0); packet_cached_dev_assign(po, dev); } } @@ -3512,7 +3512,7 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, uaddr->sa_family = AF_PACKET; memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data)); rcu_read_lock(); - dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex); + dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); rcu_read_unlock(); @@ -3528,16 +3528,18 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr, struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr); + int ifindex; if (peer) return -EOPNOTSUPP; + ifindex = READ_ONCE(po->ifindex); sll->sll_family = AF_PACKET; - sll->sll_ifindex = po->ifindex; + sll->sll_ifindex = ifindex; sll->sll_protocol = READ_ONCE(po->num); sll->sll_pkttype = 0; rcu_read_lock(); - dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex); + dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (dev) { sll->sll_hatype = dev->type; sll->sll_halen = dev->addr_len; @@ -4117,7 +4119,7 @@ static int packet_notifier(struct notifier_block *this, } if (msg == NETDEV_UNREGISTER) { packet_cached_dev_reset(po); - po->ifindex = -1; + WRITE_ONCE(po->ifindex, -1); if (po->prot_hook.dev) dev_put(po->prot_hook.dev); po->prot_hook.dev = NULL; @@ -4636,7 +4638,7 @@ static int packet_seq_show(struct seq_file *seq, void *v) refcount_read(&s->sk_refcnt), s->sk_type, ntohs(READ_ONCE(po->num)), - po->ifindex, + READ_ONCE(po->ifindex), po->running, atomic_read(&s->sk_rmem_alloc), from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)), From a7743a9559b600729d7fe511749fb093cc342ff9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Jun 2021 12:53:03 -0700 Subject: [PATCH 082/116] r8152: Avoid memcpy() over-reading of ETH_SS_STATS [ Upstream commit 99718abdc00e86e4f286dd836408e2834886c16e ] In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally reading across neighboring array fields. The memcpy() is copying the entire structure, not just the first array. Adjust the source argument so the compiler can do appropriate bounds checking. Signed-off-by: Kees Cook Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/usb/r8152.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index f9c531a6ce06..8da3c891c9e8 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -4640,7 +4640,7 @@ static void rtl8152_get_strings(struct net_device *dev, u32 stringset, u8 *data) { switch (stringset) { case ETH_SS_STATS: - memcpy(data, *rtl8152_gstrings, sizeof(rtl8152_gstrings)); + memcpy(data, rtl8152_gstrings, sizeof(rtl8152_gstrings)); break; } } From c844b7c40a02968e52ab9a5b9d2ae93592970d64 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Jun 2021 12:53:33 -0700 Subject: [PATCH 083/116] sh_eth: Avoid memcpy() over-reading of ETH_SS_STATS [ Upstream commit 224004fbb033600715dbd626bceec10bfd9c58bc ] In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally reading across neighboring array fields. The memcpy() is copying the entire structure, not just the first array. Adjust the source argument so the compiler can do appropriate bounds checking. Signed-off-by: Kees Cook Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/renesas/sh_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index dab1597287b9..36f1019809ea 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -2197,7 +2197,7 @@ static void sh_eth_get_strings(struct net_device *ndev, u32 stringset, u8 *data) { switch (stringset) { case ETH_SS_STATS: - memcpy(data, *sh_eth_gstrings_stats, + memcpy(data, sh_eth_gstrings_stats, sizeof(sh_eth_gstrings_stats)); break; } From 45d17208cb47604c6d3c6260c96ba87aa9e11058 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Jun 2021 12:53:59 -0700 Subject: [PATCH 084/116] r8169: Avoid memcpy() over-reading of ETH_SS_STATS [ Upstream commit da5ac772cfe2a03058b0accfac03fad60c46c24d ] In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), memmove(), and memset(), avoid intentionally reading across neighboring array fields. The memcpy() is copying the entire structure, not just the first array. Adjust the source argument so the compiler can do appropriate bounds checking. Signed-off-by: Kees Cook Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/realtek/r8169.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 530b8da11960..191531a03415 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -2355,7 +2355,7 @@ static void rtl8169_get_strings(struct net_device *dev, u32 stringset, u8 *data) { switch(stringset) { case ETH_SS_STATS: - memcpy(data, *rtl8169_gstrings, sizeof(rtl8169_gstrings)); + memcpy(data, rtl8169_gstrings, sizeof(rtl8169_gstrings)); break; } } From be0cfcf7e6988dbff80edec0d0654878f4d6f955 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Jun 2021 10:09:53 -0700 Subject: [PATCH 085/116] net: qed: Fix memcpy() overflow of qed_dcbx_params() [ Upstream commit 1c200f832e14420fa770193f9871f4ce2df00d07 ] The source (&dcbx_info->operational.params) and dest (&p_hwfn->p_dcbx_info->set.config.params) are both struct qed_dcbx_params (560 bytes), not struct qed_dcbx_admin_params (564 bytes), which is used as the memcpy() size. However it seems that struct qed_dcbx_operational_params (dcbx_info->operational)'s layout matches struct qed_dcbx_admin_params (p_hwfn->p_dcbx_info->set.config)'s 4 byte difference (3 padding, 1 byte for "valid"). On the assumption that the size is wrong (rather than the source structure type), adjust the memcpy() size argument to be 4 bytes smaller and add a BUILD_BUG_ON() to validate any changes to the structure sizes. Signed-off-by: Kees Cook Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/qlogic/qed/qed_dcbx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c index d62dccb85539..1ee58a24afe3 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c @@ -1259,9 +1259,11 @@ int qed_dcbx_get_config_params(struct qed_hwfn *p_hwfn, p_hwfn->p_dcbx_info->set.ver_num |= DCBX_CONFIG_VERSION_STATIC; p_hwfn->p_dcbx_info->set.enabled = dcbx_info->operational.enabled; + BUILD_BUG_ON(sizeof(dcbx_info->operational.params) != + sizeof(p_hwfn->p_dcbx_info->set.config.params)); memcpy(&p_hwfn->p_dcbx_info->set.config.params, &dcbx_info->operational.params, - sizeof(struct qed_dcbx_admin_params)); + sizeof(p_hwfn->p_dcbx_info->set.config.params)); p_hwfn->p_dcbx_info->set.config.valid = true; memcpy(params, &p_hwfn->p_dcbx_info->set, sizeof(struct qed_dcbx_set)); From 15bec9922c359516340eb7338534fb576fb7ca6d Mon Sep 17 00:00:00 2001 From: Esben Haabendal Date: Fri, 18 Jun 2021 12:52:38 +0200 Subject: [PATCH 086/116] net: ll_temac: Avoid ndo_start_xmit returning NETDEV_TX_BUSY [ Upstream commit f6396341194234e9b01cd7538bc2c6ac4501ab14 ] As documented in Documentation/networking/driver.rst, the ndo_start_xmit method must not return NETDEV_TX_BUSY under any normal circumstances, and as recommended, we simply stop the tx queue in advance, when there is a risk that the next xmit would cause a NETDEV_TX_BUSY return. Signed-off-by: Esben Haabendal Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/xilinx/ll_temac_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index 2241f9897092..939de185bc6b 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -736,6 +736,11 @@ temac_start_xmit(struct sk_buff *skb, struct net_device *ndev) /* Kick off the transfer */ lp->dma_out(lp, TX_TAILDESC_PTR, tail_p); /* DMA start */ + if (temac_check_tx_bd_space(lp, MAX_SKB_FRAGS + 1)) { + netdev_info(ndev, "%s -> netif_stop_queue\n", __func__); + netif_stop_queue(ndev); + } + return NETDEV_TX_OK; } From c6dd378a470acd35323497cee36b539ff546138d Mon Sep 17 00:00:00 2001 From: Fabien Dessenne Date: Thu, 17 Jun 2021 16:46:29 +0200 Subject: [PATCH 087/116] pinctrl: stm32: fix the reported number of GPIO lines per bank [ Upstream commit 67e2996f72c71ebe4ac2fcbcf77e54479bb7aa11 ] Each GPIO bank supports a variable number of lines which is usually 16, but is less in some cases : this is specified by the last argument of the "gpio-ranges" bank node property. Report to the framework, the actual number of lines, so the libgpiod gpioinfo command lists the actually existing GPIO lines. Fixes: 1dc9d289154b ("pinctrl: stm32: add possibility to use gpio-ranges to declare bank range") Signed-off-by: Fabien Dessenne Link: https://lore.kernel.org/r/20210617144629.2557693-1-fabien.dessenne@foss.st.com Signed-off-by: Linus Walleij Signed-off-by: Sasha Levin --- drivers/pinctrl/stm32/pinctrl-stm32.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c index 072bd11074c6..b38e82a868df 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32.c @@ -956,7 +956,7 @@ static int stm32_gpiolib_register_bank(struct stm32_pinctrl *pctl, struct resource res; struct reset_control *rstc; int npins = STM32_GPIO_PINS_PER_BANK; - int bank_nr, err; + int bank_nr, err, i = 0; rstc = of_reset_control_get_exclusive(np, NULL); if (!IS_ERR(rstc)) @@ -985,9 +985,14 @@ static int stm32_gpiolib_register_bank(struct stm32_pinctrl *pctl, of_property_read_string(np, "st,bank-name", &bank->gpio_chip.label); - if (!of_parse_phandle_with_fixed_args(np, "gpio-ranges", 3, 0, &args)) { + if (!of_parse_phandle_with_fixed_args(np, "gpio-ranges", 3, i, &args)) { bank_nr = args.args[1] / STM32_GPIO_PINS_PER_BANK; bank->gpio_chip.base = args.args[1]; + + npins = args.args[2]; + while (!of_parse_phandle_with_fixed_args(np, "gpio-ranges", 3, + ++i, &args)) + npins += args.args[2]; } else { bank_nr = pctl->nbanks; bank->gpio_chip.base = bank_nr * STM32_GPIO_PINS_PER_BANK; From 49e3def271f65f893ca37158d902ad168838bcaf Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Thu, 24 Jun 2021 18:39:33 -0700 Subject: [PATCH 088/116] nilfs2: fix memory leak in nilfs_sysfs_delete_device_group [ Upstream commit 8fd0c1b0647a6bda4067ee0cd61e8395954b6f28 ] My local syzbot instance hit memory leak in nilfs2. The problem was in missing kobject_put() in nilfs_sysfs_delete_device_group(). kobject_del() does not call kobject_cleanup() for passed kobject and it leads to leaking duped kobject name if kobject_put() was not called. Fail log: BUG: memory leak unreferenced object 0xffff8880596171e0 (size 8): comm "syz-executor379", pid 8381, jiffies 4294980258 (age 21.100s) hex dump (first 8 bytes): 6c 6f 6f 70 30 00 00 00 loop0... backtrace: kstrdup+0x36/0x70 mm/util.c:60 kstrdup_const+0x53/0x80 mm/util.c:83 kvasprintf_const+0x108/0x190 lib/kasprintf.c:48 kobject_set_name_vargs+0x56/0x150 lib/kobject.c:289 kobject_add_varg lib/kobject.c:384 [inline] kobject_init_and_add+0xc9/0x160 lib/kobject.c:473 nilfs_sysfs_create_device_group+0x150/0x800 fs/nilfs2/sysfs.c:999 init_nilfs+0xe26/0x12b0 fs/nilfs2/the_nilfs.c:637 Link: https://lkml.kernel.org/r/20210612140559.20022-1-paskripkin@gmail.com Fixes: da7141fb78db ("nilfs2: add /sys/fs/nilfs2/ group") Signed-off-by: Pavel Skripkin Acked-by: Ryusuke Konishi Cc: Michael L. Semon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/nilfs2/sysfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 490303e3d517..e9903bceb2bf 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -1064,6 +1064,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs) nilfs_sysfs_delete_superblock_group(nilfs); nilfs_sysfs_delete_segctor_group(nilfs); kobject_del(&nilfs->ns_dev_kobj); + kobject_put(&nilfs->ns_dev_kobj); kfree(nilfs->ns_dev_subgroups); } From 8ab0e49cc8eeabae4f31805e979173bb00ee472e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 24 May 2021 11:09:12 +0200 Subject: [PATCH 089/116] i2c: robotfuzz-osif: fix control-request directions commit 4ca070ef0dd885616ef294d269a9bf8e3b258e1a upstream. The direction of the pipe argument must match the request-type direction bit or control requests may fail depending on the host-controller-driver implementation. Control transfers without a data stage are treated as OUT requests by the USB stack and should be using usb_sndctrlpipe(). Failing to do so will now trigger a warning. Fix the OSIFI2C_SET_BIT_RATE and OSIFI2C_STOP requests which erroneously used the osif_usb_read() helper and set the IN direction bit. Reported-by: syzbot+9d7dadd15b8819d73f41@syzkaller.appspotmail.com Fixes: 83e53a8f120f ("i2c: Add bus driver for for OSIF USB i2c device.") Cc: stable@vger.kernel.org # 3.14 Signed-off-by: Johan Hovold Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-robotfuzz-osif.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-robotfuzz-osif.c b/drivers/i2c/busses/i2c-robotfuzz-osif.c index 9c0f52b7ff7e..b9b4758c6be7 100644 --- a/drivers/i2c/busses/i2c-robotfuzz-osif.c +++ b/drivers/i2c/busses/i2c-robotfuzz-osif.c @@ -89,7 +89,7 @@ static int osif_xfer(struct i2c_adapter *adapter, struct i2c_msg *msgs, } } - ret = osif_usb_read(adapter, OSIFI2C_STOP, 0, 0, NULL, 0); + ret = osif_usb_write(adapter, OSIFI2C_STOP, 0, 0, NULL, 0); if (ret) { dev_err(&adapter->dev, "failure sending STOP\n"); return -EREMOTEIO; @@ -159,7 +159,7 @@ static int osif_probe(struct usb_interface *interface, * Set bus frequency. The frequency is: * 120,000,000 / ( 16 + 2 * div * 4^prescale). * Using dev = 52, prescale = 0 give 100KHz */ - ret = osif_usb_read(&priv->adapter, OSIFI2C_SET_BIT_RATE, 52, 0, + ret = osif_usb_write(&priv->adapter, OSIFI2C_SET_BIT_RATE, 52, 0, NULL, 0); if (ret) { dev_err(&interface->dev, "failure sending bit rate"); From 313e82bbefb3d1d926858b58092f5d50f41d924d Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 30 Jun 2021 09:21:52 -0400 Subject: [PATCH 090/116] Linux 4.14.238 Tested-by: Linux Kernel Functional Testing Tested-by: Guenter Roeck Tested-by: Hulk Robot Signed-off-by: Sasha Levin --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e2977b8f8e6a..5442918651e0 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 4 PATCHLEVEL = 14 -SUBLEVEL = 237 +SUBLEVEL = 238 EXTRAVERSION = NAME = Petit Gorille From 951fe4bf532512fe8e88408d329a64119b25a854 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 5 Apr 2018 16:25:30 -0700 Subject: [PATCH 091/116] include/linux/mmdebug.h: make VM_WARN* non-rvals [ Upstream commit 91241681c62a5a690c88eb2aca027f094125eaac ] At present the construct if (VM_WARN(...)) will compile OK with CONFIG_DEBUG_VM=y and will fail with CONFIG_DEBUG_VM=n. The reason is that VM_{WARN,BUG}* have always been special wrt. {WARN/BUG}* and never generate any code when DEBUG_VM is disabled. So we cannot really use it in conditionals. We considered changing things so that this construct works in both cases but that might cause unwanted code generation with CONFIG_DEBUG_VM=n. It is safer and simpler to make the build fail in both cases. [akpm@linux-foundation.org: changelog] Signed-off-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- include/linux/mmdebug.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 57b0030d3800..2ad72d2c8cc5 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -37,10 +37,10 @@ void dump_mm(const struct mm_struct *mm); BUG(); \ } \ } while (0) -#define VM_WARN_ON(cond) WARN_ON(cond) -#define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) -#define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) -#define VM_WARN(cond, format...) WARN(cond, format) +#define VM_WARN_ON(cond) (void)WARN_ON(cond) +#define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) +#define VM_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format) +#define VM_WARN(cond, format...) (void)WARN(cond, format) #else #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) From 37a4a68cd12bfa404bea4dea71cbef35cd63614e Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 18 Dec 2020 14:01:31 -0800 Subject: [PATCH 092/116] mm: add VM_WARN_ON_ONCE_PAGE() macro [ Upstream commit a4055888629bc0467d12d912cd7c90acdf3d9b12 part ] Add VM_WARN_ON_ONCE_PAGE() macro. Link: https://lkml.kernel.org/r/1604283436-18880-3-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alex Shi Acked-by: Michal Hocko Acked-by: Hugh Dickins Acked-by: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Note on stable backport: original commit was titled mm/memcg: warning on !memcg after readahead page charged which included uses of this macro in mm/memcontrol.c: here omitted. Signed-off-by: Hugh Dickins Signed-off-by: Sasha Levin --- include/linux/mmdebug.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 2ad72d2c8cc5..5d0767cb424a 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -37,6 +37,18 @@ void dump_mm(const struct mm_struct *mm); BUG(); \ } \ } while (0) +#define VM_WARN_ON_ONCE_PAGE(cond, page) ({ \ + static bool __section(".data.once") __warned; \ + int __ret_warn_once = !!(cond); \ + \ + if (unlikely(__ret_warn_once && !__warned)) { \ + dump_page(page, "VM_WARN_ON_ONCE_PAGE(" __stringify(cond)")");\ + __warned = true; \ + WARN_ON(1); \ + } \ + unlikely(__ret_warn_once); \ +}) + #define VM_WARN_ON(cond) (void)WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) #define VM_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format) @@ -48,6 +60,7 @@ void dump_mm(const struct mm_struct *mm); #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond) #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) #endif From a369974d1547fc41bdab5e19186303c0ac31dc5a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:17:56 -0800 Subject: [PATCH 093/116] mm/rmap: remove unneeded semicolon in page_not_mapped() [ Upstream commit e0af87ff7afcde2660be44302836d2d5618185af ] Remove extra semicolon without any functional change intended. Link: https://lkml.kernel.org/r/20210127093425.39640-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index 8bd2ddd8febd..e2506b6adb6a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1671,7 +1671,7 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) static int page_not_mapped(struct page *page) { return !page_mapped(page); -}; +} /** * try_to_munlock - try to munlock a page From 1decdcdf8ac3ea56b6001b3c60f35c7b15daabb3 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 25 Feb 2021 17:18:03 -0800 Subject: [PATCH 094/116] mm/rmap: use page_not_mapped in try_to_unmap() [ Upstream commit b7e188ec98b1644ff70a6d3624ea16aadc39f5e0 ] page_mapcount_is_zero() calculates accurately how many mappings a hugepage has in order to check against 0 only. This is a waste of cpu time. We can do this via page_not_mapped() to save some possible atomic_read cycles. Remove the function page_mapcount_is_zero() as it's not used anymore and move page_not_mapped() above try_to_unmap() to avoid identifier undeclared compilation error. Link: https://lkml.kernel.org/r/20210130084904.35307-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- mm/rmap.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index e2506b6adb6a..e6a556fec9d1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1624,9 +1624,9 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) return is_vma_temporary_stack(vma); } -static int page_mapcount_is_zero(struct page *page) +static int page_not_mapped(struct page *page) { - return !total_mapcount(page); + return !page_mapped(page); } /** @@ -1644,7 +1644,7 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)flags, - .done = page_mapcount_is_zero, + .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, }; @@ -1668,11 +1668,6 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) return !page_mapcount(page) ? true : false; } -static int page_not_mapped(struct page *page) -{ - return !page_mapped(page); -} - /** * try_to_munlock - try to munlock a page * @page: the page to be munlocked From 97cd3badbd3432cf40143f2553a3c1ab38346847 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 15 Jun 2021 18:23:53 -0700 Subject: [PATCH 095/116] mm/thp: try_to_unmap() use TTU_SYNC for safe splitting [ Upstream commit 732ed55823fc3ad998d43b86bf771887bcc5ec67 ] Stressing huge tmpfs often crashed on unmap_page()'s VM_BUG_ON_PAGE (!unmap_success): with dump_page() showing mapcount:1, but then its raw struct page output showing _mapcount ffffffff i.e. mapcount 0. And even if that particular VM_BUG_ON_PAGE(!unmap_success) is removed, it is immediately followed by a VM_BUG_ON_PAGE(compound_mapcount(head)), and further down an IS_ENABLED(CONFIG_DEBUG_VM) total_mapcount BUG(): all indicative of some mapcount difficulty in development here perhaps. But the !CONFIG_DEBUG_VM path handles the failures correctly and silently. I believe the problem is that once a racing unmap has cleared pte or pmd, try_to_unmap_one() may skip taking the page table lock, and emerge from try_to_unmap() before the racing task has reached decrementing mapcount. Instead of abandoning the unsafe VM_BUG_ON_PAGE(), and the ones that follow, use PVMW_SYNC in try_to_unmap_one() in this case: adding TTU_SYNC to the options, and passing that from unmap_page(). When CONFIG_DEBUG_VM, or for non-debug too? Consensus is to do the same for both: the slight overhead added should rarely matter, except perhaps if splitting sparsely-populated multiply-mapped shmem. Once confident that bugs are fixed, TTU_SYNC here can be removed, and the race tolerated. Link: https://lkml.kernel.org/r/c1e95853-8bcd-d8fd-55fa-e7f2488e78f@google.com Fixes: fec89c109f3a ("thp: rewrite freeze_page()/unfreeze_page() with generic rmap walkers") Signed-off-by: Hugh Dickins Cc: Alistair Popple Cc: Jan Kara Cc: Jue Wang Cc: Kirill A. Shutemov Cc: "Matthew Wilcox (Oracle)" Cc: Miaohe Lin Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Peter Xu Cc: Ralph Campbell Cc: Shakeel Butt Cc: Wang Yugui Cc: Yang Shi Cc: Zi Yan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Note on stable backport: upstream TTU_SYNC 0x10 takes the value which 5.11 commit 013339df116c ("mm/rmap: always do TTU_IGNORE_ACCESS") freed. It is very tempting to backport that commit (as 5.10 already did) and make no change here; but on reflection, good as that commit is, I'm reluctant to include any possible side-effect of it in this series. Signed-off-by: Hugh Dickins Signed-off-by: Sasha Levin --- include/linux/rmap.h | 3 ++- mm/huge_memory.c | 2 +- mm/page_vma_mapped.c | 11 +++++++++++ mm/rmap.c | 17 ++++++++++++++++- 4 files changed, 30 insertions(+), 3 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d7d6d4eb1794..91ccae946716 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -98,7 +98,8 @@ enum ttu_flags { * do a final flush if necessary */ TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock: * caller holds it */ - TTU_SPLIT_FREEZE = 0x100, /* freeze pte under splitting thp */ + TTU_SPLIT_FREEZE = 0x100, /* freeze pte under splitting thp */ + TTU_SYNC = 0x200, /* avoid racy checks with PVMW_SYNC */ }; #ifdef CONFIG_MMU diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 513f0cf173ad..5705ccff3e7f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2324,7 +2324,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void unmap_page(struct page *page) { enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | - TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; + TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC; bool unmap_success; VM_BUG_ON_PAGE(!PageHead(page), page); diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index e00d985a51c5..31879f2175d0 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -207,6 +207,17 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) pvmw->ptl = NULL; } } else if (!pmd_present(pmde)) { + /* + * If PVMW_SYNC, take and drop THP pmd lock so that we + * cannot return prematurely, while zap_huge_pmd() has + * cleared *pmd but not decremented compound_mapcount(). + */ + if ((pvmw->flags & PVMW_SYNC) && + PageTransCompound(pvmw->page)) { + spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); + + spin_unlock(ptl); + } return false; } if (!map_pte(pvmw)) diff --git a/mm/rmap.c b/mm/rmap.c index e6a556fec9d1..b6571c739723 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1344,6 +1344,15 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long start = address, end; enum ttu_flags flags = (enum ttu_flags)arg; + /* + * When racing against e.g. zap_pte_range() on another cpu, + * in between its ptep_get_and_clear_full() and page_remove_rmap(), + * try_to_unmap() may return false when it is about to become true, + * if page table locking is skipped: use TTU_SYNC to wait for that. + */ + if (flags & TTU_SYNC) + pvmw.flags = PVMW_SYNC; + /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) return true; @@ -1665,7 +1674,13 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) else rmap_walk(page, &rwc); - return !page_mapcount(page) ? true : false; + /* + * When racing against e.g. zap_pte_range() on another cpu, + * in between its ptep_get_and_clear_full() and page_remove_rmap(), + * try_to_unmap() may return false when it is about to become true, + * if page table locking is skipped: use TTU_SYNC to wait for that. + */ + return !page_mapcount(page); } /** From 4dfa0d6f482311db9e89f53da73d121c47ca2a7d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 15 Jun 2021 18:23:56 -0700 Subject: [PATCH 096/116] mm/thp: fix vma_address() if virtual address below file offset [ Upstream commit 494334e43c16d63b878536a26505397fce6ff3a2 ] Running certain tests with a DEBUG_VM kernel would crash within hours, on the total_mapcount BUG() in split_huge_page_to_list(), while trying to free up some memory by punching a hole in a shmem huge page: split's try_to_unmap() was unable to find all the mappings of the page (which, on a !DEBUG_VM kernel, would then keep the huge page pinned in memory). When that BUG() was changed to a WARN(), it would later crash on the VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma) in mm/internal.h:vma_address(), used by rmap_walk_file() for try_to_unmap(). vma_address() is usually correct, but there's a wraparound case when the vm_start address is unusually low, but vm_pgoff not so low: vma_address() chooses max(start, vma->vm_start), but that decides on the wrong address, because start has become almost ULONG_MAX. Rewrite vma_address() to be more careful about vm_pgoff; move the VM_BUG_ON_VMA() out of it, returning -EFAULT for errors, so that it can be safely used from page_mapped_in_vma() and page_address_in_vma() too. Add vma_address_end() to apply similar care to end address calculation, in page_vma_mapped_walk() and page_mkclean_one() and try_to_unmap_one(); though it raises a question of whether callers would do better to supply pvmw->end to page_vma_mapped_walk() - I chose not, for a smaller patch. An irritation is that their apparent generality breaks down on KSM pages, which cannot be located by the page->index that page_to_pgoff() uses: as commit 4b0ece6fa016 ("mm: migrate: fix remove_migration_pte() for ksm pages") once discovered. I dithered over the best thing to do about that, and have ended up with a VM_BUG_ON_PAGE(PageKsm) in both vma_address() and vma_address_end(); though the only place in danger of using it on them was try_to_unmap_one(). Sidenote: vma_address() and vma_address_end() now use compound_nr() on a head page, instead of thp_size(): to make the right calculation on a hugetlbfs page, whether or not THPs are configured. try_to_unmap() is used on hugetlbfs pages, but perhaps the wrong calculation never mattered. Link: https://lkml.kernel.org/r/caf1c1a3-7cfb-7f8f-1beb-ba816e932825@google.com Fixes: a8fa41ad2f6f ("mm, rmap: check all VMAs that PTE-mapped THP can be part of") Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: Alistair Popple Cc: Jan Kara Cc: Jue Wang Cc: "Matthew Wilcox (Oracle)" Cc: Miaohe Lin Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Peter Xu Cc: Ralph Campbell Cc: Shakeel Butt Cc: Wang Yugui Cc: Yang Shi Cc: Zi Yan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Note on stable backport: fixed up conflicts on intervening thp_size(), and mmu_notifier_range initializations; substitute for compound_nr(). Signed-off-by: Hugh Dickins Signed-off-by: Sasha Levin --- mm/internal.h | 53 ++++++++++++++++++++++++++++++++------------ mm/page_vma_mapped.c | 16 +++++-------- mm/rmap.c | 14 ++++++------ 3 files changed, 52 insertions(+), 31 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index a182506242c4..97c8e896cd2f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -330,27 +330,52 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /* - * At what user virtual address is page expected in @vma? + * At what user virtual address is page expected in vma? + * Returns -EFAULT if all of the page is outside the range of vma. + * If page is a compound head, the entire compound page is considered. */ static inline unsigned long -__vma_address(struct page *page, struct vm_area_struct *vma) +vma_address(struct page *page, struct vm_area_struct *vma) { - pgoff_t pgoff = page_to_pgoff(page); - return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + pgoff_t pgoff; + unsigned long address; + + VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ + pgoff = page_to_pgoff(page); + if (pgoff >= vma->vm_pgoff) { + address = vma->vm_start + + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + /* Check for address beyond vma (or wrapped through 0?) */ + if (address < vma->vm_start || address >= vma->vm_end) + address = -EFAULT; + } else if (PageHead(page) && + pgoff + (1UL << compound_order(page)) - 1 >= vma->vm_pgoff) { + /* Test above avoids possibility of wrap to 0 on 32-bit */ + address = vma->vm_start; + } else { + address = -EFAULT; + } + return address; } +/* + * Then at what user virtual address will none of the page be found in vma? + * Assumes that vma_address() already returned a good starting address. + * If page is a compound head, the entire compound page is considered. + */ static inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) +vma_address_end(struct page *page, struct vm_area_struct *vma) { - unsigned long start, end; - - start = __vma_address(page, vma); - end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1); - - /* page should be within @vma mapping range */ - VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma); - - return max(start, vma->vm_start); + pgoff_t pgoff; + unsigned long address; + + VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ + pgoff = page_to_pgoff(page) + (1UL << compound_order(page)); + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + /* Check for address beyond vma (or wrapped through 0?) */ + if (address < vma->vm_start || address > vma->vm_end) + address = vma->vm_end; + return address; } #else /* !CONFIG_MMU */ diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 31879f2175d0..340207ba3743 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -223,18 +223,18 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (!map_pte(pvmw)) goto next_pte; while (1) { + unsigned long end; + if (check_pte(pvmw)) return true; next_pte: /* Seek to next pte only makes sense for THP */ if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) return not_found(pvmw); + end = vma_address_end(pvmw->page, pvmw->vma); do { pvmw->address += PAGE_SIZE; - if (pvmw->address >= pvmw->vma->vm_end || - pvmw->address >= - __vma_address(pvmw->page, pvmw->vma) + - hpage_nr_pages(pvmw->page) * PAGE_SIZE) + if (pvmw->address >= end) return not_found(pvmw); /* Did we cross page table boundary? */ if (pvmw->address % PMD_SIZE == 0) { @@ -272,14 +272,10 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) .vma = vma, .flags = PVMW_SYNC, }; - unsigned long start, end; - - start = __vma_address(page, vma); - end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1); - if (unlikely(end < vma->vm_start || start >= vma->vm_end)) + pvmw.address = vma_address(page, vma); + if (pvmw.address == -EFAULT) return 0; - pvmw.address = max(start, vma->vm_start); if (!page_vma_mapped_walk(&pvmw)) return 0; page_vma_mapped_walk_done(&pvmw); diff --git a/mm/rmap.c b/mm/rmap.c index b6571c739723..bebe29a2c5f2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -686,7 +686,6 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { - unsigned long address; if (PageAnon(page)) { struct anon_vma *page__anon_vma = page_anon_vma(page); /* @@ -701,10 +700,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return -EFAULT; } else return -EFAULT; - address = __vma_address(page, vma); - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) - return -EFAULT; - return address; + + return vma_address(page, vma); } pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) @@ -896,7 +893,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, * We have to assume the worse case ie pmd for invalidation. Note that * the page can not be free from this function. */ - end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); + end = vma_address_end(page, vma); mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); while (page_vma_mapped_walk(&pvmw)) { @@ -1374,7 +1371,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ - end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); + end = PageKsm(page) ? + address + PAGE_SIZE : vma_address_end(page, vma); if (PageHuge(page)) { /* * If sharing is possible, start and end will be adjusted @@ -1777,6 +1775,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); + VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) @@ -1831,6 +1830,7 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, pgoff_start, pgoff_end) { unsigned long address = vma_address(page, vma); + VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) From d5d912c4c36f97112dd1545bfbfc71c06201d345 Mon Sep 17 00:00:00 2001 From: Jue Wang Date: Tue, 15 Jun 2021 18:24:00 -0700 Subject: [PATCH 097/116] mm/thp: fix page_address_in_vma() on file THP tails [ Upstream commit 31657170deaf1d8d2f6a1955fbc6fa9d228be036 ] Anon THP tails were already supported, but memory-failure may need to use page_address_in_vma() on file THP tails, which its page->mapping check did not permit: fix it. hughd adds: no current usage is known to hit the issue, but this does fix a subtle trap in a general helper: best fixed in stable sooner than later. Link: https://lkml.kernel.org/r/a0d9b53-bf5d-8bab-ac5-759dc61819c1@google.com Fixes: 800d8c63b2e9 ("shmem: add huge pages support") Signed-off-by: Jue Wang Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Yang Shi Acked-by: Kirill A. Shutemov Cc: Alistair Popple Cc: Jan Kara Cc: Miaohe Lin Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Peter Xu Cc: Ralph Campbell Cc: Shakeel Butt Cc: Wang Yugui Cc: Zi Yan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- mm/rmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index bebe29a2c5f2..8ed8ec113d5a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -695,11 +695,11 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) if (!vma->anon_vma || !page__anon_vma || vma->anon_vma->root != page__anon_vma->root) return -EFAULT; - } else if (page->mapping) { - if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) - return -EFAULT; - } else + } else if (!vma->vm_file) { + return -EFAULT; + } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) { return -EFAULT; + } return vma_address(page, vma); } From b5acf9a91826ed8a5f9141a7e0c7420740a8894e Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 15 Jun 2021 18:24:07 -0700 Subject: [PATCH 098/116] mm: thp: replace DEBUG_VM BUG with VM_WARN when unmap fails for split [ Upstream commit 504e070dc08f757bccaed6d05c0f53ecbfac8a23 ] When debugging the bug reported by Wang Yugui [1], try_to_unmap() may fail, but the first VM_BUG_ON_PAGE() just checks page_mapcount() however it may miss the failure when head page is unmapped but other subpage is mapped. Then the second DEBUG_VM BUG() that check total mapcount would catch it. This may incur some confusion. As this is not a fatal issue, so consolidate the two DEBUG_VM checks into one VM_WARN_ON_ONCE_PAGE(). [1] https://lore.kernel.org/linux-mm/20210412180659.B9E3.409509F4@e16-tech.com/ Link: https://lkml.kernel.org/r/d0f0db68-98b8-ebfb-16dc-f29df24cf012@google.com Signed-off-by: Yang Shi Reviewed-by: Zi Yan Acked-by: Kirill A. Shutemov Signed-off-by: Hugh Dickins Cc: Alistair Popple Cc: Jan Kara Cc: Jue Wang Cc: "Matthew Wilcox (Oracle)" Cc: Miaohe Lin Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Peter Xu Cc: Ralph Campbell Cc: Shakeel Butt Cc: Wang Yugui Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Note on stable backport: fixed up variables, split_queue_lock, tree_lock in split_huge_page_to_list(), and conflict on ttu_flags in unmap_page(). Signed-off-by: Hugh Dickins Signed-off-by: Sasha Levin --- mm/huge_memory.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5705ccff3e7f..972893908bcd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2325,15 +2325,15 @@ static void unmap_page(struct page *page) { enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC; - bool unmap_success; VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) ttu_flags |= TTU_SPLIT_FREEZE; - unmap_success = try_to_unmap(page, ttu_flags); - VM_BUG_ON_PAGE(!unmap_success, page); + try_to_unmap(page, ttu_flags); + + VM_WARN_ON_ONCE_PAGE(page_mapped(page), page); } static void remap_page(struct page *page) @@ -2586,7 +2586,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; - int count, mapcount, extra_pins, ret; + int extra_pins, ret; bool mlocked; unsigned long flags; pgoff_t end; @@ -2648,7 +2648,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) mlocked = PageMlocked(page); unmap_page(head); - VM_BUG_ON_PAGE(compound_mapcount(head), head); /* Make sure the page is not on per-CPU pagevec as it takes pin */ if (mlocked) @@ -2674,9 +2673,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&pgdata->split_queue_lock); - count = page_count(head); - mapcount = total_mapcount(head); - if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { + if (page_ref_freeze(head, 1 + extra_pins)) { if (!list_empty(page_deferred_list(head))) { pgdata->split_queue_len--; list_del(page_deferred_list(head)); @@ -2692,16 +2689,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } else ret = 0; } else { - if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { - pr_alert("total_mapcount: %u, page_count(): %u\n", - mapcount, count); - if (PageTail(page)) - dump_page(head, NULL); - dump_page(page, "total_mapcount(head) > 0"); - BUG(); - } spin_unlock(&pgdata->split_queue_lock); -fail: if (mapping) +fail: + if (mapping) spin_unlock(&mapping->tree_lock); spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); remap_page(head); From 66c488875de24d10c6e2bd26e226a4bf39ae3e1e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 24 Jun 2021 18:39:01 -0700 Subject: [PATCH 099/116] mm: page_vma_mapped_walk(): use page for pvmw->page [ Upstream commit f003c03bd29e6f46fef1b9a8e8d636ac732286d5 ] Patch series "mm: page_vma_mapped_walk() cleanup and THP fixes". I've marked all of these for stable: many are merely cleanups, but I think they are much better before the main fix than after. This patch (of 11): page_vma_mapped_walk() cleanup: sometimes the local copy of pvwm->page was used, sometimes pvmw->page itself: use the local copy "page" throughout. Link: https://lkml.kernel.org/r/589b358c-febc-c88e-d4c2-7834b37fa7bf@google.com Link: https://lkml.kernel.org/r/88e67645-f467-c279-bf5e-af4b5c6b13eb@google.com Signed-off-by: Hugh Dickins Reviewed-by: Alistair Popple Acked-by: Kirill A. Shutemov Reviewed-by: Peter Xu Cc: Yang Shi Cc: Wang Yugui Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Zi Yan Cc: Will Deacon Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- mm/page_vma_mapped.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 340207ba3743..3cff784019c1 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -150,7 +150,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (pvmw->pte) goto next_pte; - if (unlikely(PageHuge(pvmw->page))) { + if (unlikely(PageHuge(page))) { /* when pud is not present, pte will be NULL */ pvmw->pte = huge_pte_offset(mm, pvmw->address, PAGE_SIZE << compound_order(page)); @@ -212,8 +212,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) * cannot return prematurely, while zap_huge_pmd() has * cleared *pmd but not decremented compound_mapcount(). */ - if ((pvmw->flags & PVMW_SYNC) && - PageTransCompound(pvmw->page)) { + if ((pvmw->flags & PVMW_SYNC) && PageTransCompound(page)) { spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); spin_unlock(ptl); @@ -229,9 +228,9 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) return true; next_pte: /* Seek to next pte only makes sense for THP */ - if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) + if (!PageTransHuge(page) || PageHuge(page)) return not_found(pvmw); - end = vma_address_end(pvmw->page, pvmw->vma); + end = vma_address_end(page, pvmw->vma); do { pvmw->address += PAGE_SIZE; if (pvmw->address >= end) From 084d41a8294988b1f124e22837f85cb6391ae82b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 24 Jun 2021 18:39:04 -0700 Subject: [PATCH 100/116] mm: page_vma_mapped_walk(): settle PageHuge on entry [ Upstream commit 6d0fd5987657cb0c9756ce684e3a74c0f6351728 ] page_vma_mapped_walk() cleanup: get the hugetlbfs PageHuge case out of the way at the start, so no need to worry about it later. Link: https://lkml.kernel.org/r/e31a483c-6d73-a6bb-26c5-43c3b880a2@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Reviewed-by: Peter Xu Cc: Alistair Popple Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Wang Yugui Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- mm/page_vma_mapped.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 3cff784019c1..bdb63aafc737 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -147,10 +147,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (pvmw->pmd && !pvmw->pte) return not_found(pvmw); - if (pvmw->pte) - goto next_pte; - if (unlikely(PageHuge(page))) { + /* The only possible mapping was handled on last iteration */ + if (pvmw->pte) + return not_found(pvmw); + /* when pud is not present, pte will be NULL */ pvmw->pte = huge_pte_offset(mm, pvmw->address, PAGE_SIZE << compound_order(page)); @@ -163,6 +164,9 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) return not_found(pvmw); return true; } + + if (pvmw->pte) + goto next_pte; restart: pgd = pgd_offset(mm, pvmw->address); if (!pgd_present(*pgd)) @@ -228,7 +232,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) return true; next_pte: /* Seek to next pte only makes sense for THP */ - if (!PageTransHuge(page) || PageHuge(page)) + if (!PageTransHuge(page)) return not_found(pvmw); end = vma_address_end(page, pvmw->vma); do { From 3d98b8080cffa74d1a833a1ca639985573668627 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 24 Jun 2021 18:39:07 -0700 Subject: [PATCH 101/116] mm: page_vma_mapped_walk(): use pmde for *pvmw->pmd [ Upstream commit 3306d3119ceacc43ea8b141a73e21fea68eec30c ] page_vma_mapped_walk() cleanup: re-evaluate pmde after taking lock, then use it in subsequent tests, instead of repeatedly dereferencing pointer. Link: https://lkml.kernel.org/r/53fbc9d-891e-46b2-cb4b-468c3b19238e@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Reviewed-by: Peter Xu Cc: Alistair Popple Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Wang Yugui Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- mm/page_vma_mapped.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index bdb63aafc737..8a6af4007c7e 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -186,18 +186,19 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) pmde = READ_ONCE(*pvmw->pmd); if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { pvmw->ptl = pmd_lock(mm, pvmw->pmd); - if (likely(pmd_trans_huge(*pvmw->pmd))) { + pmde = *pvmw->pmd; + if (likely(pmd_trans_huge(pmde))) { if (pvmw->flags & PVMW_MIGRATION) return not_found(pvmw); - if (pmd_page(*pvmw->pmd) != page) + if (pmd_page(pmde) != page) return not_found(pvmw); return true; - } else if (!pmd_present(*pvmw->pmd)) { + } else if (!pmd_present(pmde)) { if (thp_migration_supported()) { if (!(pvmw->flags & PVMW_MIGRATION)) return not_found(pvmw); - if (is_migration_entry(pmd_to_swp_entry(*pvmw->pmd))) { - swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd); + if (is_migration_entry(pmd_to_swp_entry(pmde))) { + swp_entry_t entry = pmd_to_swp_entry(pmde); if (migration_entry_to_page(entry) != page) return not_found(pvmw); From 43d40057fdc5df5f0809ee2a13d436be9adcbc96 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 24 Jun 2021 18:39:10 -0700 Subject: [PATCH 102/116] mm: page_vma_mapped_walk(): prettify PVMW_MIGRATION block [ Upstream commit e2e1d4076c77b3671cf8ce702535ae7dee3acf89 ] page_vma_mapped_walk() cleanup: rearrange the !pmd_present() block to follow the same "return not_found, return not_found, return true" pattern as the block above it (note: returning not_found there is never premature, since existence or prior existence of huge pmd guarantees good alignment). Link: https://lkml.kernel.org/r/378c8650-1488-2edf-9647-32a53cf2e21@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Reviewed-by: Peter Xu Cc: Alistair Popple Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Wang Yugui Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- mm/page_vma_mapped.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 8a6af4007c7e..92d7f574b8ab 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -193,24 +193,22 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (pmd_page(pmde) != page) return not_found(pvmw); return true; - } else if (!pmd_present(pmde)) { - if (thp_migration_supported()) { - if (!(pvmw->flags & PVMW_MIGRATION)) - return not_found(pvmw); - if (is_migration_entry(pmd_to_swp_entry(pmde))) { - swp_entry_t entry = pmd_to_swp_entry(pmde); + } + if (!pmd_present(pmde)) { + swp_entry_t entry; - if (migration_entry_to_page(entry) != page) - return not_found(pvmw); - return true; - } - } - return not_found(pvmw); - } else { - /* THP pmd was split under us: handle on pte level */ - spin_unlock(pvmw->ptl); - pvmw->ptl = NULL; + if (!thp_migration_supported() || + !(pvmw->flags & PVMW_MIGRATION)) + return not_found(pvmw); + entry = pmd_to_swp_entry(pmde); + if (!is_migration_entry(entry) || + migration_entry_to_page(entry) != page) + return not_found(pvmw); + return true; } + /* THP pmd was split under us: handle on pte level */ + spin_unlock(pvmw->ptl); + pvmw->ptl = NULL; } else if (!pmd_present(pmde)) { /* * If PVMW_SYNC, take and drop THP pmd lock so that we From 1c1ea4e4397022a89be0699239db1c90cfa5ba48 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 24 Jun 2021 18:39:14 -0700 Subject: [PATCH 103/116] mm: page_vma_mapped_walk(): crossing page table boundary [ Upstream commit 448282487483d6fa5b2eeeafaa0acc681e544a9c ] page_vma_mapped_walk() cleanup: adjust the test for crossing page table boundary - I believe pvmw->address is always page-aligned, but nothing else here assumed that; and remember to reset pvmw->pte to NULL after unmapping the page table, though I never saw any bug from that. Link: https://lkml.kernel.org/r/799b3f9c-2a9e-dfef-5d89-26e9f76fd97@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: Alistair Popple Cc: Matthew Wilcox Cc: Peter Xu Cc: Ralph Campbell Cc: Wang Yugui Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- mm/page_vma_mapped.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 92d7f574b8ab..2463ba78959b 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -239,16 +239,16 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (pvmw->address >= end) return not_found(pvmw); /* Did we cross page table boundary? */ - if (pvmw->address % PMD_SIZE == 0) { - pte_unmap(pvmw->pte); + if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) { if (pvmw->ptl) { spin_unlock(pvmw->ptl); pvmw->ptl = NULL; } + pte_unmap(pvmw->pte); + pvmw->pte = NULL; goto restart; - } else { - pvmw->pte++; } + pvmw->pte++; } while (pte_none(*pvmw->pte)); if (!pvmw->ptl) { From 72b2b0d093c5a3cf6513c5f4c2aecb6ad8faff2e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 24 Jun 2021 18:39:17 -0700 Subject: [PATCH 104/116] mm: page_vma_mapped_walk(): add a level of indentation [ Upstream commit b3807a91aca7d21c05d5790612e49969117a72b9 ] page_vma_mapped_walk() cleanup: add a level of indentation to much of the body, making no functional change in this commit, but reducing the later diff when this is all converted to a loop. [hughd@google.com: : page_vma_mapped_walk(): add a level of indentation fix] Link: https://lkml.kernel.org/r/7f817555-3ce1-c785-e438-87d8efdcaf26@google.com Link: https://lkml.kernel.org/r/efde211-f3e2-fe54-977-ef481419e7f3@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: Alistair Popple Cc: Matthew Wilcox Cc: Peter Xu Cc: Ralph Campbell Cc: Wang Yugui Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- mm/page_vma_mapped.c | 105 ++++++++++++++++++++++--------------------- 1 file changed, 55 insertions(+), 50 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 2463ba78959b..911c6dbe85f9 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -168,62 +168,67 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (pvmw->pte) goto next_pte; restart: - pgd = pgd_offset(mm, pvmw->address); - if (!pgd_present(*pgd)) - return false; - p4d = p4d_offset(pgd, pvmw->address); - if (!p4d_present(*p4d)) - return false; - pud = pud_offset(p4d, pvmw->address); - if (!pud_present(*pud)) - return false; - pvmw->pmd = pmd_offset(pud, pvmw->address); - /* - * Make sure the pmd value isn't cached in a register by the - * compiler and used as a stale value after we've observed a - * subsequent update. - */ - pmde = READ_ONCE(*pvmw->pmd); - if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { - pvmw->ptl = pmd_lock(mm, pvmw->pmd); - pmde = *pvmw->pmd; - if (likely(pmd_trans_huge(pmde))) { - if (pvmw->flags & PVMW_MIGRATION) - return not_found(pvmw); - if (pmd_page(pmde) != page) - return not_found(pvmw); - return true; - } - if (!pmd_present(pmde)) { - swp_entry_t entry; + { + pgd = pgd_offset(mm, pvmw->address); + if (!pgd_present(*pgd)) + return false; + p4d = p4d_offset(pgd, pvmw->address); + if (!p4d_present(*p4d)) + return false; + pud = pud_offset(p4d, pvmw->address); + if (!pud_present(*pud)) + return false; - if (!thp_migration_supported() || - !(pvmw->flags & PVMW_MIGRATION)) - return not_found(pvmw); - entry = pmd_to_swp_entry(pmde); - if (!is_migration_entry(entry) || - migration_entry_to_page(entry) != page) - return not_found(pvmw); - return true; - } - /* THP pmd was split under us: handle on pte level */ - spin_unlock(pvmw->ptl); - pvmw->ptl = NULL; - } else if (!pmd_present(pmde)) { + pvmw->pmd = pmd_offset(pud, pvmw->address); /* - * If PVMW_SYNC, take and drop THP pmd lock so that we - * cannot return prematurely, while zap_huge_pmd() has - * cleared *pmd but not decremented compound_mapcount(). + * Make sure the pmd value isn't cached in a register by the + * compiler and used as a stale value after we've observed a + * subsequent update. */ - if ((pvmw->flags & PVMW_SYNC) && PageTransCompound(page)) { - spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); + pmde = READ_ONCE(*pvmw->pmd); + + if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { + pvmw->ptl = pmd_lock(mm, pvmw->pmd); + pmde = *pvmw->pmd; + if (likely(pmd_trans_huge(pmde))) { + if (pvmw->flags & PVMW_MIGRATION) + return not_found(pvmw); + if (pmd_page(pmde) != page) + return not_found(pvmw); + return true; + } + if (!pmd_present(pmde)) { + swp_entry_t entry; + + if (!thp_migration_supported() || + !(pvmw->flags & PVMW_MIGRATION)) + return not_found(pvmw); + entry = pmd_to_swp_entry(pmde); + if (!is_migration_entry(entry) || + migration_entry_to_page(entry) != page) + return not_found(pvmw); + return true; + } + /* THP pmd was split under us: handle on pte level */ + spin_unlock(pvmw->ptl); + pvmw->ptl = NULL; + } else if (!pmd_present(pmde)) { + /* + * If PVMW_SYNC, take and drop THP pmd lock so that we + * cannot return prematurely, while zap_huge_pmd() has + * cleared *pmd but not decremented compound_mapcount(). + */ + if ((pvmw->flags & PVMW_SYNC) && + PageTransCompound(page)) { + spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); - spin_unlock(ptl); + spin_unlock(ptl); + } + return false; } - return false; + if (!map_pte(pvmw)) + goto next_pte; } - if (!map_pte(pvmw)) - goto next_pte; while (1) { unsigned long end; From ca054d41da1b96b2c3f1f556efabf19c54abff50 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 24 Jun 2021 18:39:20 -0700 Subject: [PATCH 105/116] mm: page_vma_mapped_walk(): use goto instead of while (1) [ Upstream commit 474466301dfd8b39a10c01db740645f3f7ae9a28 ] page_vma_mapped_walk() cleanup: add a label this_pte, matching next_pte, and use "goto this_pte", in place of the "while (1)" loop at the end. Link: https://lkml.kernel.org/r/a52b234a-851-3616-2525-f42736e8934@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: Alistair Popple Cc: Matthew Wilcox Cc: Peter Xu Cc: Ralph Campbell Cc: Wang Yugui Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- mm/page_vma_mapped.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 911c6dbe85f9..f6c750539a6b 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -138,6 +138,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) { struct mm_struct *mm = pvmw->vma->vm_mm; struct page *page = pvmw->page; + unsigned long end; pgd_t *pgd; p4d_t *p4d; pud_t *pud; @@ -228,10 +229,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) } if (!map_pte(pvmw)) goto next_pte; - } - while (1) { - unsigned long end; - +this_pte: if (check_pte(pvmw)) return true; next_pte: @@ -260,6 +258,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) pvmw->ptl = pte_lockptr(mm, pvmw->pmd); spin_lock(pvmw->ptl); } + goto this_pte; } } From 329d4fb943b042db257f56b00a9da70631d36b3d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 24 Jun 2021 18:39:23 -0700 Subject: [PATCH 106/116] mm: page_vma_mapped_walk(): get vma_address_end() earlier [ Upstream commit a765c417d876cc635f628365ec9aa6f09470069a ] page_vma_mapped_walk() cleanup: get THP's vma_address_end() at the start, rather than later at next_pte. It's a little unnecessary overhead on the first call, but makes for a simpler loop in the following commit. Link: https://lkml.kernel.org/r/4542b34d-862f-7cb4-bb22-e0df6ce830a2@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: Alistair Popple Cc: Matthew Wilcox Cc: Peter Xu Cc: Ralph Campbell Cc: Wang Yugui Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- mm/page_vma_mapped.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index f6c750539a6b..96d4c4738590 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -166,6 +166,15 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) return true; } + /* + * Seek to next pte only makes sense for THP. + * But more important than that optimization, is to filter out + * any PageKsm page: whose page->index misleads vma_address() + * and vma_address_end() to disaster. + */ + end = PageTransCompound(page) ? + vma_address_end(page, pvmw->vma) : + pvmw->address + PAGE_SIZE; if (pvmw->pte) goto next_pte; restart: @@ -233,10 +242,6 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (check_pte(pvmw)) return true; next_pte: - /* Seek to next pte only makes sense for THP */ - if (!PageTransHuge(page)) - return not_found(pvmw); - end = vma_address_end(page, pvmw->vma); do { pvmw->address += PAGE_SIZE; if (pvmw->address >= end) From 3a5f1cdac2f698a9c708429aa23c3bccd5c1ccee Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 24 Jun 2021 18:39:26 -0700 Subject: [PATCH 107/116] mm/thp: fix page_vma_mapped_walk() if THP mapped by ptes [ Upstream commit a9a7504d9beaf395481faa91e70e2fd08f7a3dde ] Running certain tests with a DEBUG_VM kernel would crash within hours, on the total_mapcount BUG() in split_huge_page_to_list(), while trying to free up some memory by punching a hole in a shmem huge page: split's try_to_unmap() was unable to find all the mappings of the page (which, on a !DEBUG_VM kernel, would then keep the huge page pinned in memory). Crash dumps showed two tail pages of a shmem huge page remained mapped by pte: ptes in a non-huge-aligned vma of a gVisor process, at the end of a long unmapped range; and no page table had yet been allocated for the head of the huge page to be mapped into. Although designed to handle these odd misaligned huge-page-mapped-by-pte cases, page_vma_mapped_walk() falls short by returning false prematurely when !pmd_present or !pud_present or !p4d_present or !pgd_present: there are cases when a huge page may span the boundary, with ptes present in the next. Restructure page_vma_mapped_walk() as a loop to continue in these cases, while keeping its layout much as before. Add a step_forward() helper to advance pvmw->address across those boundaries: originally I tried to use mm's standard p?d_addr_end() macros, but hit the same crash 512 times less often: because of the way redundant levels are folded together, but folded differently in different configurations, it was just too difficult to use them correctly; and step_forward() is simpler anyway. Link: https://lkml.kernel.org/r/fedb8632-1798-de42-f39e-873551d5bc81@google.com Fixes: ace71a19cec5 ("mm: introduce page_vma_mapped_walk()") Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: Alistair Popple Cc: Matthew Wilcox Cc: Peter Xu Cc: Ralph Campbell Cc: Wang Yugui Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- mm/page_vma_mapped.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 96d4c4738590..16adeef76d00 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -110,6 +110,13 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) return true; } +static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) +{ + pvmw->address = (pvmw->address + size) & ~(size - 1); + if (!pvmw->address) + pvmw->address = ULONG_MAX; +} + /** * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at * @pvmw->address @@ -178,16 +185,22 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (pvmw->pte) goto next_pte; restart: - { + do { pgd = pgd_offset(mm, pvmw->address); - if (!pgd_present(*pgd)) - return false; + if (!pgd_present(*pgd)) { + step_forward(pvmw, PGDIR_SIZE); + continue; + } p4d = p4d_offset(pgd, pvmw->address); - if (!p4d_present(*p4d)) - return false; + if (!p4d_present(*p4d)) { + step_forward(pvmw, P4D_SIZE); + continue; + } pud = pud_offset(p4d, pvmw->address); - if (!pud_present(*pud)) - return false; + if (!pud_present(*pud)) { + step_forward(pvmw, PUD_SIZE); + continue; + } pvmw->pmd = pmd_offset(pud, pvmw->address); /* @@ -234,7 +247,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) spin_unlock(ptl); } - return false; + step_forward(pvmw, PMD_SIZE); + continue; } if (!map_pte(pvmw)) goto next_pte; @@ -264,7 +278,9 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) spin_lock(pvmw->ptl); } goto this_pte; - } + } while (pvmw->address < end); + + return false; } /** From fc308458ef456f488d4de30d27eefa0904835d53 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 24 Jun 2021 18:39:30 -0700 Subject: [PATCH 108/116] mm/thp: another PVMW_SYNC fix in page_vma_mapped_walk() [ Upstream commit a7a69d8ba88d8dcee7ef00e91d413a4bd003a814 ] Aha! Shouldn't that quick scan over pte_none()s make sure that it holds ptlock in the PVMW_SYNC case? That too might have been responsible for BUGs or WARNs in split_huge_page_to_list() or its unmap_page(), though I've never seen any. Link: https://lkml.kernel.org/r/1bdf384c-8137-a149-2a1e-475a4791c3c@google.com Link: https://lore.kernel.org/linux-mm/20210412180659.B9E3.409509F4@e16-tech.com/ Fixes: ace71a19cec5 ("mm: introduce page_vma_mapped_walk()") Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Tested-by: Wang Yugui Cc: Alistair Popple Cc: Matthew Wilcox Cc: Peter Xu Cc: Ralph Campbell Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- mm/page_vma_mapped.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 16adeef76d00..a612daef5f00 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -271,6 +271,10 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) goto restart; } pvmw->pte++; + if ((pvmw->flags & PVMW_SYNC) && !pvmw->ptl) { + pvmw->ptl = pte_lockptr(mm, pvmw->pmd); + spin_lock(pvmw->ptl); + } } while (pte_none(*pvmw->pte)); if (!pvmw->ptl) { From c5bb56066fac7d7fdd51f3e8127a9704386ba694 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 24 Jun 2021 18:39:52 -0700 Subject: [PATCH 109/116] mm, futex: fix shared futex pgoff on shmem huge page [ Upstream commit fe19bd3dae3d15d2fbfdb3de8839a6ea0fe94264 ] If more than one futex is placed on a shmem huge page, it can happen that waking the second wakes the first instead, and leaves the second waiting: the key's shared.pgoff is wrong. When 3.11 commit 13d60f4b6ab5 ("futex: Take hugepages into account when generating futex_key"), the only shared huge pages came from hugetlbfs, and the code added to deal with its exceptional page->index was put into hugetlb source. Then that was missed when 4.8 added shmem huge pages. page_to_pgoff() is what others use for this nowadays: except that, as currently written, it gives the right answer on hugetlbfs head, but nonsense on hugetlbfs tails. Fix that by calling hugetlbfs-specific hugetlb_basepage_index() on PageHuge tails as well as on head. Yes, it's unconventional to declare hugetlb_basepage_index() there in pagemap.h, rather than in hugetlb.h; but I do not expect anything but page_to_pgoff() ever to need it. [akpm@linux-foundation.org: give hugetlb_basepage_index() prototype the correct scope] Link: https://lkml.kernel.org/r/b17d946b-d09-326e-b42a-52884c36df32@google.com Fixes: 800d8c63b2e9 ("shmem: add huge pages support") Reported-by: Neel Natu Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Thomas Gleixner Cc: "Kirill A. Shutemov" Cc: Zhang Yi Cc: Mel Gorman Cc: Mike Kravetz Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Darren Hart Cc: Davidlohr Bueso Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Note on stable backport: leave redundant #include in kernel/futex.c, to avoid conflict over the header files included. Signed-off-by: Hugh Dickins Signed-off-by: Sasha Levin --- include/linux/hugetlb.h | 16 ---------------- include/linux/pagemap.h | 13 +++++++------ kernel/futex.c | 2 +- mm/hugetlb.c | 5 +---- 4 files changed, 9 insertions(+), 27 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index fe0ec0a29db7..d2b5cc8ce54f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -467,17 +467,6 @@ static inline int hstate_index(struct hstate *h) return h - hstates; } -pgoff_t __basepage_index(struct page *page); - -/* Return page->index in PAGE_SIZE units */ -static inline pgoff_t basepage_index(struct page *page) -{ - if (!PageCompound(page)) - return page->index; - - return __basepage_index(page); -} - extern int dissolve_free_huge_page(struct page *page); extern int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); @@ -572,11 +561,6 @@ static inline int hstate_index(struct hstate *h) return 0; } -static inline pgoff_t basepage_index(struct page *page) -{ - return page->index; -} - static inline int dissolve_free_huge_page(struct page *page) { return 0; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e08b5339023c..84c7fc7f63e7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -399,7 +399,7 @@ static inline struct page *read_mapping_page(struct address_space *mapping, } /* - * Get index of the page with in radix-tree + * Get index of the page within radix-tree (but not for hugetlb pages). * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE) */ static inline pgoff_t page_to_index(struct page *page) @@ -418,15 +418,16 @@ static inline pgoff_t page_to_index(struct page *page) return pgoff; } +extern pgoff_t hugetlb_basepage_index(struct page *page); + /* - * Get the offset in PAGE_SIZE. - * (TODO: hugepage should have ->index in PAGE_SIZE) + * Get the offset in PAGE_SIZE (even for hugetlb pages). + * (TODO: hugetlb pages should have ->index in PAGE_SIZE) */ static inline pgoff_t page_to_pgoff(struct page *page) { - if (unlikely(PageHeadHuge(page))) - return page->index << compound_order(page); - + if (unlikely(PageHuge(page))) + return hugetlb_basepage_index(page); return page_to_index(page); } diff --git a/kernel/futex.c b/kernel/futex.c index af1d9a993988..e282c083df59 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -719,7 +719,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.i_seq = get_inode_sequence_number(inode); - key->shared.pgoff = basepage_index(tail); + key->shared.pgoff = page_to_pgoff(tail); rcu_read_unlock(); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0dc181290d1f..c765fd01f0aa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1403,15 +1403,12 @@ int PageHeadHuge(struct page *page_head) return get_compound_page_dtor(page_head) == free_huge_page; } -pgoff_t __basepage_index(struct page *page) +pgoff_t hugetlb_basepage_index(struct page *page) { struct page *page_head = compound_head(page); pgoff_t index = page_index(page_head); unsigned long compound_idx; - if (!PageHuge(page_head)) - return page_index(page); - if (compound_order(page_head) >= MAX_ORDER) compound_idx = page_to_pfn(page) - page_to_pfn(page_head); else From 4164c07e5062cba5e6666df81d7fce1e4f54c317 Mon Sep 17 00:00:00 2001 From: ManYi Li Date: Fri, 11 Jun 2021 17:44:02 +0800 Subject: [PATCH 110/116] scsi: sr: Return appropriate error code when disk is ejected [ Upstream commit 7dd753ca59d6c8cc09aa1ed24f7657524803c7f3 ] Handle a reported media event code of 3. This indicates that the media has been removed from the drive and user intervention is required to proceed. Return DISK_EVENT_EJECT_REQUEST in that case. Link: https://lore.kernel.org/r/20210611094402.23884-1-limanyi@uniontech.com Signed-off-by: ManYi Li Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/sr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 5be3d6b7991b..a46fbe2d2ee6 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -216,6 +216,8 @@ static unsigned int sr_get_events(struct scsi_device *sdev) return DISK_EVENT_EJECT_REQUEST; else if (med->media_event_code == 2) return DISK_EVENT_MEDIA_CHANGE; + else if (med->media_event_code == 3) + return DISK_EVENT_EJECT_REQUEST; return 0; } From fb2479ddfb0b1eae0d60868aa444aace4ce6287c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Fri, 11 Jun 2021 14:34:50 +0200 Subject: [PATCH 111/116] drm/nouveau: fix dma_address check for CPU/GPU sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit d330099115597bbc238d6758a4930e72b49ea9ba ] AGP for example doesn't have a dma_address array. Signed-off-by: Christian König Acked-by: Alex Deucher Link: https://patchwork.freedesktop.org/patch/msgid/20210614110517.1624-1-christian.koenig@amd.com Signed-off-by: Sasha Levin --- drivers/gpu/drm/nouveau/nouveau_bo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index e427f80344c4..a2d770acd10a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -450,7 +450,7 @@ nouveau_bo_sync_for_device(struct nouveau_bo *nvbo) struct ttm_dma_tt *ttm_dma = (struct ttm_dma_tt *)nvbo->bo.ttm; int i; - if (!ttm_dma) + if (!ttm_dma || !ttm_dma->dma_address) return; /* Don't waste time looping if the object is coherent */ @@ -470,7 +470,7 @@ nouveau_bo_sync_for_cpu(struct nouveau_bo *nvbo) struct ttm_dma_tt *ttm_dma = (struct ttm_dma_tt *)nvbo->bo.ttm; int i; - if (!ttm_dma) + if (!ttm_dma || !ttm_dma->dma_address) return; /* Don't waste time looping if the object is coherent */ From d63af6c931f73b4597e37816ed4e77ee690ada82 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 8 Oct 2017 12:12:16 -0400 Subject: [PATCH 112/116] kfifo: DECLARE_KIFO_PTR(fifo, u64) does not work on arm 32 bit commit 8a866fee3909c49738e1c4429a8d2b9bf27e015d upstream. If you try to store u64 in a kfifo (or a struct with u64 members), then the buf member of __STRUCT_KFIFO_PTR will cause 4 bytes padding due to alignment (note that struct __kfifo is 20 bytes on 32 bit). That in turn causes the __is_kfifo_ptr() to fail, which is caught by kfifo_alloc(), which now returns EINVAL. So, ensure that __is_kfifo_ptr() compares to the right structure. Signed-off-by: Sean Young Acked-by: Stefani Seibold Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Matthew Weber Signed-off-by: Greg Kroah-Hartman --- include/linux/kfifo.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 41eb6fdf87a8..86b5fb08e96c 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -113,7 +113,8 @@ struct kfifo_rec_ptr_2 __STRUCT_KFIFO_PTR(unsigned char, 2, void); * array is a part of the structure and the fifo type where the array is * outside of the fifo structure. */ -#define __is_kfifo_ptr(fifo) (sizeof(*fifo) == sizeof(struct __kfifo)) +#define __is_kfifo_ptr(fifo) \ + (sizeof(*fifo) == sizeof(STRUCT_KFIFO_PTR(typeof(*(fifo)->type)))) /** * DECLARE_KFIFO_PTR - macro to declare a fifo pointer object From 5f7c8a41b8a96709c165f41cc793c8a0a6eee160 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 24 Jun 2021 18:39:45 -0700 Subject: [PATCH 113/116] kthread_worker: split code for canceling the delayed work timer commit 34b3d5344719d14fd2185b2d9459b3abcb8cf9d8 upstream. Patch series "kthread_worker: Fix race between kthread_mod_delayed_work() and kthread_cancel_delayed_work_sync()". This patchset fixes the race between kthread_mod_delayed_work() and kthread_cancel_delayed_work_sync() including proper return value handling. This patch (of 2): Simple code refactoring as a preparation step for fixing a race between kthread_mod_delayed_work() and kthread_cancel_delayed_work_sync(). It does not modify the existing behavior. Link: https://lkml.kernel.org/r/20210610133051.15337-2-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Cc: Martin Liu Cc: Minchan Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Oleg Nesterov Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/kthread.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/kernel/kthread.c b/kernel/kthread.c index fd6f9322312a..478400ed9fe9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -978,6 +978,33 @@ void kthread_flush_work(struct kthread_work *work) } EXPORT_SYMBOL_GPL(kthread_flush_work); +/* + * Make sure that the timer is neither set nor running and could + * not manipulate the work list_head any longer. + * + * The function is called under worker->lock. The lock is temporary + * released but the timer can't be set again in the meantime. + */ +static void kthread_cancel_delayed_work_timer(struct kthread_work *work, + unsigned long *flags) +{ + struct kthread_delayed_work *dwork = + container_of(work, struct kthread_delayed_work, work); + struct kthread_worker *worker = work->worker; + + /* + * del_timer_sync() must be called to make sure that the timer + * callback is not running. The lock must be temporary released + * to avoid a deadlock with the callback. In the meantime, + * any queuing is blocked by setting the canceling counter. + */ + work->canceling++; + spin_unlock_irqrestore(&worker->lock, *flags); + del_timer_sync(&dwork->timer); + spin_lock_irqsave(&worker->lock, *flags); + work->canceling--; +} + /* * This function removes the work from the worker queue. Also it makes sure * that it won't get queued later via the delayed work's timer. @@ -992,23 +1019,8 @@ static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, unsigned long *flags) { /* Try to cancel the timer if exists. */ - if (is_dwork) { - struct kthread_delayed_work *dwork = - container_of(work, struct kthread_delayed_work, work); - struct kthread_worker *worker = work->worker; - - /* - * del_timer_sync() must be called to make sure that the timer - * callback is not running. The lock must be temporary released - * to avoid a deadlock with the callback. In the meantime, - * any queuing is blocked by setting the canceling counter. - */ - work->canceling++; - spin_unlock_irqrestore(&worker->lock, *flags); - del_timer_sync(&dwork->timer); - spin_lock_irqsave(&worker->lock, *flags); - work->canceling--; - } + if (is_dwork) + kthread_cancel_delayed_work_timer(work, flags); /* * Try to remove the work from a worker list. It might either From 5f0185cd37347267ff06dd61cd0131b27f164ac5 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 24 Jun 2021 18:39:48 -0700 Subject: [PATCH 114/116] kthread: prevent deadlock when kthread_mod_delayed_work() races with kthread_cancel_delayed_work_sync() commit 5fa54346caf67b4b1b10b1f390316ae466da4d53 upstream. The system might hang with the following backtrace: schedule+0x80/0x100 schedule_timeout+0x48/0x138 wait_for_common+0xa4/0x134 wait_for_completion+0x1c/0x2c kthread_flush_work+0x114/0x1cc kthread_cancel_work_sync.llvm.16514401384283632983+0xe8/0x144 kthread_cancel_delayed_work_sync+0x18/0x2c xxxx_pm_notify+0xb0/0xd8 blocking_notifier_call_chain_robust+0x80/0x194 pm_notifier_call_chain_robust+0x28/0x4c suspend_prepare+0x40/0x260 enter_state+0x80/0x3f4 pm_suspend+0x60/0xdc state_store+0x108/0x144 kobj_attr_store+0x38/0x88 sysfs_kf_write+0x64/0xc0 kernfs_fop_write_iter+0x108/0x1d0 vfs_write+0x2f4/0x368 ksys_write+0x7c/0xec It is caused by the following race between kthread_mod_delayed_work() and kthread_cancel_delayed_work_sync(): CPU0 CPU1 Context: Thread A Context: Thread B kthread_mod_delayed_work() spin_lock() __kthread_cancel_work() spin_unlock() del_timer_sync() kthread_cancel_delayed_work_sync() spin_lock() __kthread_cancel_work() spin_unlock() del_timer_sync() spin_lock() work->canceling++ spin_unlock spin_lock() queue_delayed_work() // dwork is put into the worker->delayed_work_list spin_unlock() kthread_flush_work() // flush_work is put at the tail of the dwork wait_for_completion() Context: IRQ kthread_delayed_work_timer_fn() spin_lock() list_del_init(&work->node); spin_unlock() BANG: flush_work is not longer linked and will never get proceed. The problem is that kthread_mod_delayed_work() checks work->canceling flag before canceling the timer. A simple solution is to (re)check work->canceling after __kthread_cancel_work(). But then it is not clear what should be returned when __kthread_cancel_work() removed the work from the queue (list) and it can't queue it again with the new @delay. The return value might be used for reference counting. The caller has to know whether a new work has been queued or an existing one was replaced. The proper solution is that kthread_mod_delayed_work() will remove the work from the queue (list) _only_ when work->canceling is not set. The flag must be checked after the timer is stopped and the remaining operations can be done under worker->lock. Note that kthread_mod_delayed_work() could remove the timer and then bail out. It is fine. The other canceling caller needs to cancel the timer as well. The important thing is that the queue (list) manipulation is done atomically under worker->lock. Link: https://lkml.kernel.org/r/20210610133051.15337-3-pmladek@suse.com Fixes: 9a6b06c8d9a220860468a ("kthread: allow to modify delayed kthread work") Signed-off-by: Petr Mladek Reported-by: Martin Liu Cc: Cc: Minchan Kim Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Oleg Nesterov Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/kthread.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/kernel/kthread.c b/kernel/kthread.c index 478400ed9fe9..7dd2c8a797d7 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1006,8 +1006,11 @@ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, } /* - * This function removes the work from the worker queue. Also it makes sure - * that it won't get queued later via the delayed work's timer. + * This function removes the work from the worker queue. + * + * It is called under worker->lock. The caller must make sure that + * the timer used by delayed work is not running, e.g. by calling + * kthread_cancel_delayed_work_timer(). * * The work might still be in use when this function finishes. See the * current_work proceed by the worker. @@ -1015,13 +1018,8 @@ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, * Return: %true if @work was pending and successfully canceled, * %false if @work was not pending */ -static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, - unsigned long *flags) +static bool __kthread_cancel_work(struct kthread_work *work) { - /* Try to cancel the timer if exists. */ - if (is_dwork) - kthread_cancel_delayed_work_timer(work, flags); - /* * Try to remove the work from a worker list. It might either * be from worker->work_list or from worker->delayed_work_list. @@ -1074,11 +1072,23 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker, /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); - /* Do not fight with another command that is canceling this work. */ + /* + * Temporary cancel the work but do not fight with another command + * that is canceling the work as well. + * + * It is a bit tricky because of possible races with another + * mod_delayed_work() and cancel_delayed_work() callers. + * + * The timer must be canceled first because worker->lock is released + * when doing so. But the work can be removed from the queue (list) + * only when it can be queued again so that the return value can + * be used for reference counting. + */ + kthread_cancel_delayed_work_timer(work, &flags); if (work->canceling) goto out; + ret = __kthread_cancel_work(work); - ret = __kthread_cancel_work(work, true, &flags); fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: @@ -1100,7 +1110,10 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); - ret = __kthread_cancel_work(work, is_dwork, &flags); + if (is_dwork) + kthread_cancel_delayed_work_timer(work, &flags); + + ret = __kthread_cancel_work(work); if (worker->current_work != work) goto out_fast; From c75310b5e17d0369fec2ab28c748fccf1c2b626f Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 23 Jun 2021 15:09:13 +0200 Subject: [PATCH 115/116] xen/events: reset active flag for lateeoi events later commit 3de218ff39b9e3f0d453fe3154f12a174de44b25 upstream. In order to avoid a race condition for user events when changing cpu affinity reset the active flag only when EOI-ing the event. This is working fine as all user events are lateeoi events. Note that lateeoi_ack_mask_dynirq() is not modified as there is no explicit call to xen_irq_lateeoi() expected later. Cc: stable@vger.kernel.org Reported-by: Julien Grall Fixes: b6622798bc50b62 ("xen/events: avoid handling the same event on two cpus at the same time") Tested-by: Julien Grall Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210623130913.9405-1-jgross@suse.com Signed-off-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman --- drivers/xen/events/events_base.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index b370144682ed..a2f8130e18fe 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -524,6 +524,9 @@ static void xen_irq_lateeoi_locked(struct irq_info *info, bool spurious) } info->eoi_time = 0; + + /* is_active hasn't been reset yet, do it now. */ + smp_store_release(&info->is_active, 0); do_unmask(info, EVT_MASK_REASON_EOI_PENDING); } @@ -1780,10 +1783,22 @@ static void lateeoi_ack_dynirq(struct irq_data *data) struct irq_info *info = info_for_irq(data->irq); evtchn_port_t evtchn = info ? info->evtchn : 0; - if (VALID_EVTCHN(evtchn)) { - do_mask(info, EVT_MASK_REASON_EOI_PENDING); - ack_dynirq(data); - } + if (!VALID_EVTCHN(evtchn)) + return; + + do_mask(info, EVT_MASK_REASON_EOI_PENDING); + + if (unlikely(irqd_is_setaffinity_pending(data)) && + likely(!irqd_irq_disabled(data))) { + do_mask(info, EVT_MASK_REASON_TEMPORARY); + + clear_evtchn(evtchn); + + irq_move_masked_irq(data); + + do_unmask(info, EVT_MASK_REASON_TEMPORARY); + } else + clear_evtchn(evtchn); } static void lateeoi_mask_ack_dynirq(struct irq_data *data) From 4e68c9b0763ff55eaa69d6e519f07515f1c9037b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 11 Jul 2021 12:48:13 +0200 Subject: [PATCH 116/116] Linux 4.14.239 Link: https://lore.kernel.org/r/20210709131627.928131764@linuxfoundation.org Tested-by: Jon Hunter Tested-by: Linux Kernel Functional Testing Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5442918651e0..3bb379664a96 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 4 PATCHLEVEL = 14 -SUBLEVEL = 238 +SUBLEVEL = 239 EXTRAVERSION = NAME = Petit Gorille