Merge branch 'openwrt:master' into master
Commit 39ac71810a
302 changed files with 20809 additions and 6846 deletions
@@ -124,13 +124,6 @@ ifneq ($(filter libtool-abiver,$(PKG_FIXUP)),)
Hooks/Configure/Post += set_libtool_abiver
endif

ifneq ($(filter libtool-ucxx,$(PKG_FIXUP)),)
PKG_BUILD_DEPENDS += libtool
ifeq ($(filter no-autoreconf,$(PKG_FIXUP)),)
Hooks/Configure/Pre += autoreconf_target
endif
endif

ifneq ($(filter autoreconf,$(PKG_FIXUP)),)
ifeq ($(filter autoreconf,$(Hooks/Configure/Pre)),)
Hooks/Configure/Pre += autoreconf_target

@@ -166,12 +159,6 @@ ifneq ($(filter libtool,$(HOST_FIXUP)),)
endif
endif

ifneq ($(filter libtool-ucxx,$(HOST_FIXUP)),)
ifeq ($(filter no-autoreconf,$(HOST_FIXUP)),)
Hooks/HostConfigure/Pre += autoreconf_host
endif
endif

ifneq ($(filter autoreconf,$(HOST_FIXUP)),)
ifeq ($(filter autoreconf,$(Hooks/HostConfigure/Pre)),)
Hooks/HostConfigure/Pre += autoreconf_host
@@ -1,2 +1,2 @@
LINUX_VERSION-5.10 = .175
LINUX_KERNEL_HASH-5.10.175 = e277562e28f234e36665ae12b7585f9557a83a86bc4a8de8840a305af6307bce
LINUX_VERSION-5.10 = .176
LINUX_KERNEL_HASH-5.10.176 = ce072c60ba04173e05b2a1de3fefdeba5ac8b28b1958d92d21bdbf9b736ef793

@@ -1,2 +1,2 @@
LINUX_VERSION-5.15 = .102
LINUX_KERNEL_HASH-5.15.102 = 441cddfb970b97759eebdb9b142673662ce0770500e3ae8bcd4b90af369b01e6
LINUX_VERSION-5.15 = .104
LINUX_KERNEL_HASH-5.15.104 = 71c532ce09992e470f3259ffeb38d2b5bba990c243a559e4726a57412bd36b54
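For reference, LINUX_KERNEL_HASH is simply the SHA-256 checksum of the corresponding kernel source tarball; assuming the tarball has already been fetched into the build tree's dl/ directory, a quick check along these lines should reproduce the new 5.10.176 value:

sha256sum dl/linux-5.10.176.tar.xz
# expected: ce072c60ba04173e05b2a1de3fefdeba5ac8b28b1958d92d21bdbf9b736ef793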
@@ -88,6 +88,18 @@ ziking,cpe46b|\
zyxel,nbg6616)
ubootenv_add_uci_config "/dev/mtd1" "0x0" "0x10000" "0x10000"
;;
aruba,ap-105|\
aruba,ap-175|\
dongwon,dw02-412h-64m|\
dongwon,dw02-412h-128m|\
glinet,gl-ar300m-lite|\
glinet,gl-ar300m-nand|\
glinet,gl-ar300m-nor|\
glinet,gl-ar300m16)
idx="$(find_mtd_index u-boot-env)"
[ -n "$idx" ] && \
ubootenv_add_uci_config "/dev/mtd$idx" "0x0" "0x10000" "0x10000"
;;
buffalo,wzr-hp-ag300h)
ubootenv_add_uci_config "/dev/mtd3" "0x0" "0x10000" "0x10000"
;;

@@ -99,16 +111,6 @@ linksys,ea4500-v3)
domywifi,dw33d)
ubootenv_add_uci_config "/dev/mtd4" "0x0" "0x10000" "0x10000"
;;
dongwon,dw02-412h-64m|\
dongwon,dw02-412h-128m|\
glinet,gl-ar300m-lite|\
glinet,gl-ar300m-nand|\
glinet,gl-ar300m-nor|\
glinet,gl-ar300m16)
idx="$(find_mtd_index u-boot-env)"
[ -n "$idx" ] && \
ubootenv_add_uci_config "/dev/mtd$idx" "0x0" "0x10000" "0x10000"
;;
glinet,gl-ar150)
ubootenv_add_uci_config "/dev/mtd1" "0x0" "0x8000" "0x10000"
;;
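For context on the hunks above: ubootenv_add_uci_config takes the environment device, offset, environment size and erase-sector size, and emits a section of /etc/config/ubootenv that fw_printenv/fw_setenv consume. A rough sketch of the resulting section for one of these boards is shown below; the option names are assumed from the uboot-envtools defaults rather than taken from this diff:

config ubootenv
	option dev '/dev/mtd1'
	option offset '0x0'
	option envsize '0x10000'
	option secsize '0x10000'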
@@ -19,7 +19,8 @@ alfa-network,r36m-e4g|\
alfa-network,tube-e4g|\
engenius,epg600|\
engenius,esr600h|\
sitecom,wlr-4100-v1-002)
sitecom,wlr-4100-v1-002|\
zyxel,keenetic-lite-iii-a)
ubootenv_add_uci_config "/dev/mtd1" "0x0" "0x1000" "0x1000"
;;
arcadyan,we420223-99)
@@ -6,9 +6,9 @@ PKG_RELEASE:=1

PKG_SOURCE_PROTO:=git
PKG_SOURCE_URL=$(PROJECT_GIT)/project/firmware/qca-wireless.git
PKG_SOURCE_DATE:=2023-03-20
PKG_SOURCE_VERSION:=f9cece02724b8ca2c1a166a46f0afa89e632d431
PKG_MIRROR_HASH:=89c20798c7ec83114aa69467f2467fe32cbb74ebeca277c60a033af960ca6c04
PKG_SOURCE_DATE:=2023-03-27
PKG_SOURCE_VERSION:=ccd7e460cc798d90148a10539b6d94a5fd761004
PKG_MIRROR_HASH:=e51d28c741aeb0867493a7bfc801b8b1977c942ed5d51d62c1aa8729c91cce32

PKG_FLAGS:=nonshared
@@ -0,0 +1,25 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 24 Mar 2023 13:04:17 +0100
Subject: [PATCH] wifi: mac80211: fix invalid drv_sta_pre_rcu_remove calls for
non-uploaded sta

Avoid potential data corruption issues caused by uninitialized driver
private data structures.

Reported-by: Brian Coverstone <brian@mainsequence.net>
Fixes: 6a9d1b91f34d ("mac80211: add pre-RCU-sync sta removal driver operation")
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---

--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1241,7 +1241,8 @@ static int __must_check __sta_info_destr
list_del_rcu(&sta->list);
sta->removed = true;

- drv_sta_pre_rcu_remove(local, sta->sdata, sta);
+ if (sta->uploaded)
+ drv_sta_pre_rcu_remove(local, sta->sdata, sta);

if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
rcu_access_pointer(sdata->u.vlan.sta) == sta)
@@ -0,0 +1,50 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Sun, 26 Mar 2023 17:11:34 +0200
Subject: [PATCH] wifi: mac80211: fix receiving mesh packets in forwarding=0
networks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When forwarding is set to 0, frames are typically sent with ttl=1.
Move the ttl decrement check below the check for local receive in order to
fix packet drops.

Reported-by: Thomas Hühn <thomas.huehn@hs-nordhausen.de>
Reported-by: Nick Hainke <vincent@systemli.org>
Fixes: 986e43b19ae9 ("wifi: mac80211: fix receiving A-MSDU frames on mesh interfaces")
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---

--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2828,14 +2828,6 @@ ieee80211_rx_mesh_data(struct ieee80211_
if (sdata->crypto_tx_tailroom_needed_cnt)
tailroom = IEEE80211_ENCRYPT_TAILROOM;

- if (!--mesh_hdr->ttl) {
- if (multicast)
- goto rx_accept;
-
- IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_ttl);
- return RX_DROP_MONITOR;
- }
-
if (mesh_hdr->flags & MESH_FLAGS_AE) {
struct mesh_path *mppath;
char *proxied_addr;
@@ -2874,6 +2866,14 @@ ieee80211_rx_mesh_data(struct ieee80211_
if (ether_addr_equal(sdata->vif.addr, eth->h_dest))
goto rx_accept;

+ if (!--mesh_hdr->ttl) {
+ if (multicast)
+ goto rx_accept;
+
+ IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_ttl);
+ return RX_DROP_MONITOR;
+ }
+
if (!ifmsh->mshcfg.dot11MeshForwarding) {
if (is_multicast_ether_addr(eth->h_dest))
goto rx_accept;
@@ -9,7 +9,7 @@ include $(TOPDIR)/rules.mk

PKG_NAME:=util-linux
PKG_VERSION:=2.38.1
PKG_RELEASE:=1
PKG_RELEASE:=2

PKG_SOURCE:=$(PKG_NAME)-$(PKG_VERSION).tar.xz
PKG_SOURCE_URL:=@KERNEL/linux/utils/$(PKG_NAME)/v2.38

@@ -415,6 +415,17 @@ define Package/rename/description
expression in their name by replacement
endef

define Package/rev
$(call Package/util-linux/Default)
TITLE:=Reverse lines characterwise
endef

define Package/rev/description
rev utility copies the specified files to the standard output, reversing the
order of characters in every line. If no files are specified, the standard
input is read.
endef

define Package/partx-utils
$(call Package/util-linux/Default)
TITLE:=inform kernel about the presence and numbering of on-disk partitions
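As a usage note on the new rev package defined above, rev simply mirrors each input line, so a minimal smoke test after installation could be:

printf 'abc\n123\n' | rev
# cba
# 321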
@@ -804,6 +815,11 @@ define Package/rename/install
$(INSTALL_BIN) $(PKG_INSTALL_DIR)/usr/bin/rename $(1)/usr/bin/
endef

define Package/rev/install
$(INSTALL_DIR) $(1)/usr/bin
$(INSTALL_BIN) $(PKG_INSTALL_DIR)/usr/bin/rev $(1)/usr/bin/
endef

define Package/partx-utils/install
$(INSTALL_DIR) $(1)/usr/sbin
$(INSTALL_BIN) $(PKG_INSTALL_DIR)/usr/sbin/partx $(1)/usr/sbin/

@@ -904,6 +920,7 @@ $(eval $(call BuildPackage,namei))
$(eval $(call BuildPackage,nsenter))
$(eval $(call BuildPackage,prlimit))
$(eval $(call BuildPackage,rename))
$(eval $(call BuildPackage,rev))
$(eval $(call BuildPackage,partx-utils))
$(eval $(call BuildPackage,script-utils))
$(eval $(call BuildPackage,setterm))
rules.mk
@@ -252,6 +252,7 @@ TARGET_NM:=$(TARGET_CROSS)gcc-nm
TARGET_CC:=$(TARGET_CROSS)gcc
TARGET_CXX:=$(TARGET_CROSS)g++
KPATCH:=$(SCRIPT_DIR)/patch-kernel.sh
FILECMD:=$(STAGING_DIR_HOST)/bin/file
SED:=$(STAGING_DIR_HOST)/bin/sed -i -e
ESED:=$(STAGING_DIR_HOST)/bin/sed -E -i -e
MKHASH:=$(STAGING_DIR_HOST)/bin/mkhash
@@ -51,7 +51,7 @@ define Device/meraki_mx60
IMAGES := sysupgrade.bin
DTB_SIZE := 20480
IMAGE_SIZE := 1021m
KERNEL := kernel-bin | gzip | dtb | MuImage-initramfs gzip
KERNEL := kernel-bin | libdeflate-gzip | dtb | MuImage-initramfs gzip
IMAGE/sysupgrade.bin := sysupgrade-tar | append-metadata
UBINIZE_OPTS := -E 5
DEVICE_COMPAT_VERSION := 2.0

@@ -70,7 +70,7 @@ define Device/netgear_wndap6x0
IMAGE_SIZE := 27392k
IMAGES := sysupgrade.bin factory.img
KERNEL_SIZE := 6080k
KERNEL := dtb | kernel-bin | gzip | MuImage-initramfs gzip
KERNEL := dtb | kernel-bin | libdeflate-gzip | MuImage-initramfs gzip
IMAGE/sysupgrade.bin := sysupgrade-tar | append-metadata
IMAGE/factory.img := append-kernel | pad-to $$$$(KERNEL_SIZE) | append-ubi
UBINIZE_OPTS := -E 5

@@ -114,7 +114,7 @@ define Device/netgear_wndr4700
# CHECK_DNI_FIRMWARE_ROOTFS_INTEGRITY in do_chk_dniimg()
KERNEL := kernel-bin | lzma -d16 | uImage lzma | pad-offset $$(BLOCKSIZE) 64 | \
append-uImage-fakehdr filesystem | dtb | create-uImage-dtb | prepend-dtb
KERNEL_INITRAMFS := kernel-bin | gzip | dtb | MuImage-initramfs gzip
KERNEL_INITRAMFS := kernel-bin | libdeflate-gzip | dtb | MuImage-initramfs gzip
IMAGE/factory.img := append-kernel | pad-to $$$$(KERNEL_SIZE) | append-ubi | \
netgear-dni | check-size
IMAGE/sysupgrade.bin := sysupgrade-tar | append-metadata
@@ -12,14 +12,14 @@ define Device/wd_mybooklive
SUPPORTED_DEVICES += mbl wd,mybooklive-duo
BLOCKSIZE := 1k
DTB_SIZE := 16384
KERNEL := kernel-bin | dtb | gzip | uImage gzip
KERNEL_INITRAMFS := kernel-bin | gzip | dtb | MuImage-initramfs gzip
KERNEL := kernel-bin | dtb | libdeflate-gzip | uImage gzip
KERNEL_INITRAMFS := kernel-bin | libdeflate-gzip | dtb | MuImage-initramfs gzip
IMAGES := factory.img.gz sysupgrade.img.gz
ARTIFACTS := apollo3g.dtb
DEVICE_DTB := apollo3g.dtb
FILESYSTEMS := ext4 squashfs
IMAGE/factory.img.gz := boot-script | boot-img | hdd-img | gzip
IMAGE/sysupgrade.img.gz := boot-script | boot-img | hdd-img | gzip | append-metadata
IMAGE/factory.img.gz := boot-script | boot-img | hdd-img | libdeflate-gzip
IMAGE/sysupgrade.img.gz := boot-script | boot-img | hdd-img | libdeflate-gzip | append-metadata
ARTIFACT/apollo3g.dtb := export-dtb
endef
target/linux/ath79/dts/ar7161_aruba_ap-175.dts (new file, 244 lines)
@@ -0,0 +1,244 @@
// SPDX-License-Identifier: GPL-2.0-or-later OR MIT

#include "ar7100.dtsi"

#include <dt-bindings/gpio/gpio.h>
#include <dt-bindings/input/input.h>

/ {
compatible = "aruba,ap-175", "qca,ar7161";
model = "Aruba AP-175";

chosen {
bootargs = "console=ttyS0,115200";
};

aliases {
led-boot = &led_power_amber;
led-failsafe = &led_power_amber;
led-upgrade = &led_power_amber;
label-mac-device = &eth0;
};

leds {
compatible = "gpio-leds";

/* These internal LEDs cannot be seen when case is closed */
internal_2g_green {
label = "green:internal_2g";
gpios = <&gpio 3 GPIO_ACTIVE_HIGH>;
};

internal_5g_green {
label = "green:internal_5g";
gpios = <&gpio 4 GPIO_ACTIVE_HIGH>;
};

/* These external LEDs are visible from the bottom panel */

led_power_amber: power_amber {
label = "amber:power";
gpios = <&gpio_ext 5 GPIO_ACTIVE_HIGH>;
panic-indicator;
};

r1_act_blue {
label = "blue:r1_act";
gpios = <&gpio_ext 0 GPIO_ACTIVE_HIGH>;
linux,default-trigger = "phy1tpt";
};

r1_rssi1_blue {
label = "blue:r1_rssi1";
gpios = <&gpio_ext 1 GPIO_ACTIVE_HIGH>;
};

r1_rssi2_blue {
label = "blue:r1_rssi2";
gpios = <&gpio_ext 2 GPIO_ACTIVE_HIGH>;
};

r1_rssi3_blue {
label = "blue:r1_rssi3";
gpios = <&gpio_ext 3 GPIO_ACTIVE_HIGH>;
};

r1_rssi4_blue {
label = "blue:r1_rssi4";
gpios = <&gpio_ext 4 GPIO_ACTIVE_HIGH>;
};

r0_act_amber {
label = "amber:r0_act";
gpios = <&gpio_ext 8 GPIO_ACTIVE_HIGH>;
linux,default-trigger = "phy0tpt";
};

r0_rssi1_amber {
label = "amber:r0_rssi1";
gpios = <&gpio_ext 9 GPIO_ACTIVE_HIGH>;
};

r0_rssi2_amber {
label = "amber:r0_rssi2";
gpios = <&gpio_ext 10 GPIO_ACTIVE_HIGH>;
};

r0_rssi3_amber {
label = "amber:r0_rssi3";
gpios = <&gpio_ext 11 GPIO_ACTIVE_HIGH>;
};

r0_rssi4_amber {
label = "amber:r0_rssi4";
gpios = <&gpio_ext 12 GPIO_ACTIVE_HIGH>;
};
};

keys {
compatible = "gpio-keys";

reset {
label = "reset";
linux,code = <KEY_RESTART>;
gpios = <&gpio 6 GPIO_ACTIVE_LOW>;
};
};

i2c0: i2c {
compatible = "i2c-gpio";
i2c-gpio,delay-us = <10>;
i2c-gpio,timeout-ms = <1>;
sda-gpios = <&gpio 1 (GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN)>;
scl-gpios = <&gpio 2 (GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN)>;

#address-cells = <1>;
#size-cells = <0>;
};
};

&pcie0 {
status = "okay";

ath9k0: wifi@0,11 {
compatible = "pci168c,0029";
nvmem-cells = <&macaddr_hwinfo_1c>;
nvmem-cell-names = "mac-address";
mac-address-increment = <1>;
reg = <0x8800 0 0 0 0>;
#gpio-cells = <2>;
gpio-controller;
};

ath9k1: wifi@0,12 {
compatible = "pci168c,0029";
nvmem-cells = <&macaddr_hwinfo_1c>;
nvmem-cell-names = "mac-address";
mac-address-increment = <2>;
reg = <0x9000 0 0 0 0>;
#gpio-cells = <2>;
gpio-controller;
};
};

&mdio0 {
status = "okay";

phy1: ethernet-phy@1 {
reg = <0x1>;
};
};

&eth0 {
status = "okay";
nvmem-cells = <&macaddr_hwinfo_1c>;
nvmem-cell-names = "mac-address";

phy-mode = "rgmii";
phy-handle = <&phy1>;
};

&spi {
status = "okay";

flash@0 {
compatible = "jedec,spi-nor";
reg = <0>;
spi-max-frequency = <25000000>;

partitions {
compatible = "fixed-partitions";
#address-cells = <1>;
#size-cells = <1>;

partition@0 {
label = "u-boot";
reg = <0x000000 0x40000>;
read-only;
};

partition@40000 {
label = "firmware";
reg = <0x40000 0xfa0000>;
compatible = "denx,uimage";
};

hwinfo: partition@fe0000 {
label = "hwinfo";
reg = <0xfe0000 0x10000>;
read-only;
};

partition@ff0000 {
label = "u-boot-env";
reg = <0xff0000 0x10000>;
read-only;
};
};
};
};

&hwinfo {
compatible = "nvmem-cells";
#address-cells = <1>;
#size-cells = <1>;

macaddr_hwinfo_1c: macaddr@1c {
reg = <0x1c 0x6>;
};
};

&i2c0 {
gpio_ext: gpio@21 {
status = "okay";

compatible = "ti,tca6416";
reg = <0x21>;

#address-cells = <1>;
#size-cells = <0>;

gpio-controller;
#gpio-cells = <2>;
};

temp-sensor@4a {
compatible = "national,lm75";
reg = <0x4a>;
};

eeprom@50 { /* 24lc2561 */
compatible = "atmel,24c256","at24";
#address-cells = <1>;
#size-cells = <0>;
reg = <0x50>;
size = <256>;
};

ds1374c: rtc@68 {
status = "okay";

compatible = "dallas,ds1374";
reg = <0x68>;
};
};
@@ -139,7 +139,8 @@
ath9k0: wifi@0,11 {
compatible = "pci168c,0029";
reg = <0x8800 0 0 0 0>;
qca,no-eeprom;
nvmem-cells = <&macaddr_lan>, <&cal_art_1000>;
nvmem-cell-names = "mac-address-ascii", "calibration";
#gpio-cells = <2>;
gpio-controller;
};

@@ -147,7 +148,9 @@
ath9k1: wifi@0,12 {
compatible = "pci168c,0029";
reg = <0x9000 0 0 0 0>;
qca,no-eeprom;
nvmem-cells = <&macaddr_wan>, <&cal_art_5000>;
nvmem-cell-names = "mac-address-ascii", "calibration";
mac-address-increment = <1>;
#gpio-cells = <2>;
gpio-controller;
};

@@ -184,9 +187,28 @@
};

partition@660000 {
compatible = "nvmem-cells";
label = "caldata";
reg = <0x660000 0x010000>;
read-only;
#address-cells = <1>;
#size-cells = <1>;

cal_art_1000: cal@1000 {
reg = <0x1000 0xeb8>;
};

cal_art_5000: cal@5000 {
reg = <0x5000 0xeb8>;
};

macaddr_lan: macaddr@ffa0 {
reg = <0xffa0 0x11>;
};

macaddr_wan: macaddr@ffb4 {
reg = <0xffb4 0x11>;
};
};

fwconcat1: partition@670000 {

@@ -202,6 +224,9 @@

pll-data = <0x11110000 0x00001099 0x00991099>;

nvmem-cells = <&macaddr_lan>;
nvmem-cell-names = "mac-address-ascii";

fixed-link {
speed = <1000>;
full-duplex;

@@ -213,5 +238,9 @@

pll-data = <0x11110000 0x00001099 0x00991099>;

nvmem-cells = <&macaddr_wan>;
nvmem-cell-names = "mac-address-ascii";

phy-handle = <&phy4>;
};
@@ -19,6 +19,7 @@ ath79_setup_interfaces()
araknis,an-700-ap-i-ac|\
arduino,yun|\
aruba,ap-105|\
aruba,ap-175|\
asus,rp-ac51|\
asus,rp-ac66|\
avm,fritz1750e|\

@@ -634,7 +635,6 @@ ath79_setup_macs()
dlink,dir-629-a1)
wan_mac=$(mtd_get_mac_text "mfcdata" 0x6a)
;;
dlink,dir-825-b1|\
trendnet,tew-673gru)
lan_mac=$(mtd_get_mac_text "caldata" 0xffa0)
wan_mac=$(mtd_get_mac_text "caldata" 0xffb4)
@@ -124,7 +124,6 @@ case "$FIRMWARE" in
buffalo,wzr-hp-ag300h)
caldata_extract "art" 0x1000 0xeb8
;;
dlink,dir-825-b1|\
trendnet,tew-673gru)
caldata_extract "caldata" 0x1000 0xeb8
ath9k_patch_mac_crc $(mtd_get_mac_text "caldata" 0xffa0) 0x20c

@@ -143,7 +142,6 @@ case "$FIRMWARE" in
buffalo,wzr-hp-ag300h)
caldata_extract "art" 0x5000 0xeb8
;;
dlink,dir-825-b1|\
trendnet,tew-673gru)
caldata_extract "caldata" 0x5000 0xeb8
ath9k_patch_mac_crc $(macaddr_add $(mtd_get_mac_text "caldata" 0xffb4) 1) 0x20c
@@ -395,6 +395,15 @@ define Device/aruba_ap-105
endef
TARGET_DEVICES += aruba_ap-105

define Device/aruba_ap-175
SOC := ar7161
DEVICE_VENDOR := Aruba
DEVICE_MODEL := AP-175
IMAGE_SIZE := 16000k
DEVICE_PACKAGES := kmod-gpio-pca953x kmod-hwmon-lm75 kmod-i2c-gpio kmod-rtc-ds1374
endef
TARGET_DEVICES += aruba_ap-175

define Device/asus_pl-ac56
SOC := qca9563
DEVICE_VENDOR := ASUS
@@ -14,7 +14,7 @@ Signed-off-by: Phil Elwell <phil@raspberrypi.org>

--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -2716,6 +2716,11 @@ static int lan78xx_reset(struct lan78xx_
@@ -2691,6 +2691,11 @@ static int lan78xx_reset(struct lan78xx_
int ret;
u32 buf;
u8 sig;

@@ -26,7 +26,7 @@ Signed-off-by: Phil Elwell <phil@raspberrypi.org>

ret = lan78xx_read_reg(dev, HW_CFG, &buf);
if (ret < 0)
@@ -2797,6 +2802,10 @@ static int lan78xx_reset(struct lan78xx_
@@ -2772,6 +2777,10 @@ static int lan78xx_reset(struct lan78xx_

buf |= HW_CFG_MEF_;

@@ -37,7 +37,7 @@ Signed-off-by: Phil Elwell <phil@raspberrypi.org>
ret = lan78xx_write_reg(dev, HW_CFG, buf);
if (ret < 0)
return ret;
@@ -2895,6 +2904,9 @@ static int lan78xx_reset(struct lan78xx_
@@ -2870,6 +2879,9 @@ static int lan78xx_reset(struct lan78xx_
buf |= MAC_CR_AUTO_DUPLEX_ | MAC_CR_AUTO_SPEED_;
}
}
@@ -15,7 +15,7 @@ Signed-off-by: Phil Elwell <phil@raspberrypi.org>

--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -2960,6 +2960,22 @@ static int lan78xx_open(struct net_devic
@@ -2935,6 +2935,22 @@ static int lan78xx_open(struct net_devic

netif_dbg(dev, ifup, dev->net, "phy initialised successfully");
@@ -37,7 +37,7 @@ Signed-off-by: Dave Stevenson <dave.stevenson@raspberrypi.org>
static int lan78xx_read_reg(struct lan78xx_net *dev, u32 index, u32 *data)
{
u32 *buf;
@@ -3288,8 +3297,14 @@ static int lan78xx_bind(struct lan78xx_n
@@ -3263,8 +3272,14 @@ static int lan78xx_bind(struct lan78xx_n
if (DEFAULT_RX_CSUM_ENABLE)
dev->net->features |= NETIF_F_RXCSUM;
@@ -16,7 +16,7 @@ Signed-off-by: Phil Elwell <phil@raspberrypi.org>

--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -2277,6 +2277,22 @@ static int lan78xx_phy_init(struct lan78
@@ -2252,6 +2252,22 @@ static int lan78xx_phy_init(struct lan78
mii_adv_to_linkmode_adv_t(fc, mii_adv);
linkmode_or(phydev->advertising, fc, phydev->advertising);

@@ -39,7 +39,7 @@ Signed-off-by: Phil Elwell <phil@raspberrypi.org>
if (phydev->mdio.dev.of_node) {
u32 reg;
int len;
@@ -2969,22 +2985,6 @@ static int lan78xx_open(struct net_devic
@@ -2944,22 +2960,6 @@ static int lan78xx_open(struct net_devic

netif_dbg(dev, ifup, dev->net, "phy initialised successfully");
@@ -28,7 +28,7 @@ See: https://github.com/raspberrypi/linux/issues/2447
static int lan78xx_read_reg(struct lan78xx_net *dev, u32 index, u32 *data)
{
u32 *buf;
@@ -4148,7 +4153,13 @@ static int lan78xx_probe(struct usb_inte
@@ -4123,7 +4128,13 @@ static int lan78xx_probe(struct usb_inte
netdev->max_mtu = MAX_SINGLE_PACKET_SIZE;
netif_set_gso_max_size(netdev, MAX_SINGLE_PACKET_SIZE - MAX_HEADER);
@@ -15,7 +15,7 @@ Signed-off-by: Phil Elwell <phil@raspberrypi.org>

--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -2282,7 +2282,7 @@ static int lan78xx_phy_init(struct lan78
@@ -2257,7 +2257,7 @@ static int lan78xx_phy_init(struct lan78
mii_adv_to_linkmode_adv_t(fc, mii_adv);
linkmode_or(phydev->advertising, fc, phydev->advertising);
@@ -12,7 +12,7 @@ in both dwc_otg and in ipv6 processing.

--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -3527,7 +3527,7 @@ static int rx_submit(struct lan78xx_net
@@ -3502,7 +3502,7 @@ static int rx_submit(struct lan78xx_net
size_t size = dev->rx_urb_size;
int ret = 0;
@@ -25,7 +25,7 @@ Signed-off-by: Naushir Patuck <naush@raspberrypi.com>

--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17530,6 +17530,14 @@ T: git git://linuxtv.org/media_tree.git
@@ -17538,6 +17538,14 @@ T: git git://linuxtv.org/media_tree.git
F: Documentation/devicetree/bindings/media/i2c/sony,imx412.yaml
F: drivers/media/i2c/imx412.c
@@ -132,7 +132,7 @@ Signed-off-by: David Plowman <david.plowman@raspberrypi.com>
+...
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17544,6 +17544,7 @@ M: Raspberry Pi Kernel Maintenance <kern
@@ -17552,6 +17552,7 @@ M: Raspberry Pi Kernel Maintenance <kern
L: linux-media@vger.kernel.org
S: Maintained
T: git git://linuxtv.org/media_tree.git
@@ -21,7 +21,7 @@ Signed-off-by: Phil Elwell <phil@raspberrypi.com>

--- a/drivers/net/phy/smsc.c
+++ b/drivers/net/phy/smsc.c
@@ -230,12 +230,12 @@ static int lan87xx_read_status(struct ph
@@ -223,12 +223,12 @@ static int lan87xx_read_status(struct ph
if (rc < 0)
return rc;
@@ -132,7 +132,7 @@ Signed-off-by: Lee Jackson <info@arducam.com>
+...
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17548,6 +17548,14 @@ F: Documentation/devicetree/bindings/med
@@ -17556,6 +17556,14 @@ F: Documentation/devicetree/bindings/med
F: Documentation/devicetree/bindings/media/i2c/imx477.yaml
F: drivers/media/i2c/imx477.c
@@ -18,8 +18,8 @@ Signed-off-by: Phil Elwell <phil@raspberrypi.com>

--- a/drivers/net/phy/smsc.c
+++ b/drivers/net/phy/smsc.c
@@ -220,6 +220,8 @@ static int lan87xx_read_status(struct ph
int err = genphy_read_status(phydev);
@@ -213,6 +213,8 @@ static int lan87xx_read_status(struct ph
return err;

if (!phydev->link && priv->energy_enable && phydev->irq == PHY_POLL) {
+ int energy_detected;

@@ -27,7 +27,7 @@ Signed-off-by: Phil Elwell <phil@raspberrypi.com>
/* Disable EDPD to wake up PHY */
int rc = phy_read(phydev, MII_LAN83C185_CTRL_STATUS);
if (rc < 0)
@@ -235,7 +237,7 @@ static int lan87xx_read_status(struct ph
@@ -228,7 +230,7 @@ static int lan87xx_read_status(struct ph
*/
read_poll_timeout(phy_read, rc,
rc & MII_LAN83C185_ENERGYON || rc < 0,

@@ -36,7 +36,7 @@ Signed-off-by: Phil Elwell <phil@raspberrypi.com>
MII_LAN83C185_CTRL_STATUS);
if (rc < 0)
return rc;
@@ -245,10 +247,16 @@ static int lan87xx_read_status(struct ph
@@ -238,10 +240,16 @@ static int lan87xx_read_status(struct ph
if (rc < 0)
return rc;
@@ -1,4 +1,3 @@
CONFIG_ADM6996_PHY=y
CONFIG_ARCH_BINFMT_ELF_STATE=y
CONFIG_ARCH_CLOCKSOURCE_DATA=y
CONFIG_ARCH_DISCARD_MEMBLOCK=y

@@ -1,4 +1,3 @@
CONFIG_ADM6996_PHY=y
CONFIG_ARCH_32BIT_OFF_T=y
CONFIG_ARCH_HIBERNATION_POSSIBLE=y
CONFIG_ARCH_KEEP_MEMBLOCK=y

@@ -1,8 +1,8 @@
CONFIG_ADM6996_PHY=y
CONFIG_B44=y
CONFIG_B44_PCI=y
CONFIG_B44_PCICORE_AUTOSELECT=y
CONFIG_B44_PCI_AUTOSELECT=y
# CONFIG_BCM47XX_BCMA is not set
# CONFIG_BCMA is not set
# CONFIG_MTD_NAND is not set
# CONFIG_SSB_DRIVER_GIGE is not set

@@ -1,20 +1,16 @@
# CONFIG_ADM6996_PHY is not set
# CONFIG_BCM47XX_SSB is not set
CONFIG_BGMAC=y
CONFIG_BGMAC_BCMA=y
CONFIG_BOUNCE=y
CONFIG_CPU_HAS_DIEI=y
CONFIG_CPU_HAS_RIXI=y
# CONFIG_CPU_MIPS32_R1 is not set
# CONFIG_CPU_MIPSR1 is not set
CONFIG_CPU_MIPS32_R2=y
CONFIG_CPU_MIPSR2=y
# CONFIG_FIXED_PHY is not set
CONFIG_CPU_SUPPORTS_MSA=y
# CONFIG_GPIO_WDT is not set
CONFIG_HIGHMEM=y
CONFIG_KMAP_LOCAL=y
CONFIG_MIPS_SPRAM=y
# CONFIG_SSB is not set
# CONFIG_SSB_DRIVER_EXTIF is not set
# CONFIG_SSB_DRIVER_GIGE is not set
# CONFIG_SSB_DRIVER_MIPS is not set
# CONFIG_SSB_EMBEDDED is not set
# CONFIG_SSB_PCICORE_HOSTMODE is not set
# CONFIG_SSB_SERIAL is not set
# CONFIG_SSB_SFLASH is not set
CONFIG_TARGET_ISA_REV=2
@@ -1,76 +0,0 @@
From 327dabbd0111910a7d174b0b812d608d6b67bead Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Mon, 8 Aug 2022 23:05:25 +0200
Subject: [PATCH] bgmac: fix *initial* chip reset to support BCM5358
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While bringing hardware up we should perform a full reset including the
switch bit (BGMAC_BCMA_IOCTL_SW_RESET aka SICF_SWRST). It's what
specification says and what reference driver does.

This seems to be critical for the BCM5358. Without this hardware doesn't
get initialized properly and doesn't seem to transmit or receive any
packets.

Originally bgmac was calling bgmac_chip_reset() before setting
"has_robosw" property which resulted in expected behaviour. That has
changed as a side effect of adding platform device support which
regressed BCM5358 support.

Fixes: f6a95a24957a ("net: ethernet: bgmac: Add platform device support")
Cc: Jon Mason <jdmason@kudzu.us>
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
---
drivers/net/ethernet/broadcom/bgmac.c | 8 ++++++--
drivers/net/ethernet/broadcom/bgmac.h | 2 ++
2 files changed, 8 insertions(+), 2 deletions(-)

--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -891,13 +891,13 @@ static void bgmac_chip_reset_idm_config(

if (iost & BGMAC_BCMA_IOST_ATTACHED) {
flags = BGMAC_BCMA_IOCTL_SW_CLKEN;
- if (!bgmac->has_robosw)
+ if (bgmac->in_init || !bgmac->has_robosw)
flags |= BGMAC_BCMA_IOCTL_SW_RESET;
}
bgmac_clk_enable(bgmac, flags);
}

- if (iost & BGMAC_BCMA_IOST_ATTACHED && !bgmac->has_robosw)
+ if (iost & BGMAC_BCMA_IOST_ATTACHED && (bgmac->in_init || !bgmac->has_robosw))
bgmac_idm_write(bgmac, BCMA_IOCTL,
bgmac_idm_read(bgmac, BCMA_IOCTL) &
~BGMAC_BCMA_IOCTL_SW_RESET);
@@ -1502,6 +1502,8 @@ int bgmac_enet_probe(struct bgmac *bgmac
struct net_device *net_dev = bgmac->net_dev;
int err;

+ bgmac->in_init = true;
+
bgmac_chip_intrs_off(bgmac);

net_dev->irq = bgmac->irq;
@@ -1562,6 +1564,8 @@ int bgmac_enet_probe(struct bgmac *bgmac
bgmac->b53_device = &bgmac_b53_dev;
}

+ bgmac->in_init = false;
+
err = register_netdev(bgmac->net_dev);
if (err) {
dev_err(bgmac->dev, "Cannot register net device\n");
--- a/drivers/net/ethernet/broadcom/bgmac.h
+++ b/drivers/net/ethernet/broadcom/bgmac.h
@@ -475,6 +475,8 @@ struct bgmac {
int irq;
u32 int_mask;

+ bool in_init;
+
/* Current MAC state */
int mac_speed;
int mac_duplex;
@@ -47,9 +47,7 @@ platform_do_upgrade() {
case "$(board_name)" in
comtrend,vr-3032u|\
huawei,hg253s-v2|\
netgear,dgnd3700-v2|\
sercomm,h500-s-lowi|\
sercomm,h500-s-vfes)
netgear,dgnd3700-v2)
cfe_jffs2_nand_upgrade "$1"
;;
*)
@@ -30,7 +30,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>

--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -354,6 +354,7 @@ struct napi_struct {
@@ -356,6 +356,7 @@ struct napi_struct {
struct list_head dev_list;
struct hlist_node napi_hash_node;
unsigned int napi_id;

@@ -38,7 +38,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
};

enum {
@@ -364,6 +365,7 @@ enum {
@@ -366,6 +367,7 @@ enum {
NAPI_STATE_LISTED, /* NAPI added to system lists */
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */

@@ -46,7 +46,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
};

enum {
@@ -374,6 +376,7 @@ enum {
@@ -376,6 +378,7 @@ enum {
NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED),
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),

@@ -54,7 +54,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
};

enum gro_result {
@@ -504,20 +507,7 @@ static inline bool napi_complete(struct
@@ -506,20 +509,7 @@ static inline bool napi_complete(struct
*/
void napi_disable(struct napi_struct *n);

@@ -76,7 +76,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>

/**
* napi_synchronize - wait until NAPI is not running
@@ -1863,6 +1853,8 @@ enum netdev_ml_priv_type {
@@ -1865,6 +1855,8 @@ enum netdev_ml_priv_type {
*
* @wol_enabled: Wake-on-LAN is enabled
*

@@ -85,7 +85,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
* @net_notifier_list: List of per-net netdev notifier block
* that follow this device when it is moved
* to another network namespace.
@@ -2182,6 +2174,7 @@ struct net_device {
@@ -2184,6 +2176,7 @@ struct net_device {
struct lock_class_key *qdisc_running_key;
bool proto_down;
unsigned wol_enabled:1;

@@ -46,7 +46,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
+ == ==================================
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -498,6 +498,8 @@ static inline bool napi_complete(struct
@@ -500,6 +500,8 @@ static inline bool napi_complete(struct
return napi_complete_done(n, 0);
}

@@ -27,7 +27,7 @@ Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>

--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -366,6 +366,7 @@ enum {
@@ -368,6 +368,7 @@ enum {
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/

@@ -35,7 +35,7 @@ Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
};

enum {
@@ -377,6 +378,7 @@ enum {
@@ -379,6 +380,7 @@ enum {
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
@@ -58,7 +58,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -848,6 +848,27 @@ typedef u16 (*select_queue_fallback_t)(s
@@ -850,6 +850,27 @@ typedef u16 (*select_queue_fallback_t)(s
struct sk_buff *skb,
struct net_device *sb_dev);

@@ -86,7 +86,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
enum tc_setup_type {
TC_SETUP_QDISC_MQPRIO,
TC_SETUP_CLSU32,
@@ -1294,6 +1315,8 @@ struct netdev_net_notifier {
@@ -1296,6 +1317,8 @@ struct netdev_net_notifier {
* struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
* If a device is paired with a peer device, return the peer instance.
* The caller must be under RCU read context.

@@ -95,7 +95,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
*/
struct net_device_ops {
int (*ndo_init)(struct net_device *dev);
@@ -1502,6 +1525,8 @@ struct net_device_ops {
@@ -1504,6 +1527,8 @@ struct net_device_ops {
int (*ndo_tunnel_ctl)(struct net_device *dev,
struct ip_tunnel_parm *p, int cmd);
struct net_device * (*ndo_get_peer_dev)(struct net_device *dev);

@@ -104,7 +104,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
};

/**
@@ -2849,6 +2874,8 @@ void dev_remove_offload(struct packet_of
@@ -2851,6 +2876,8 @@ void dev_remove_offload(struct packet_of

int dev_get_iflink(const struct net_device *dev);
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);

@@ -28,7 +28,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -850,11 +850,18 @@ typedef u16 (*select_queue_fallback_t)(s
@@ -852,11 +852,18 @@ typedef u16 (*select_queue_fallback_t)(s

enum net_device_path_type {
DEV_PATH_ETHERNET = 0,

@@ -9,7 +9,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -851,6 +851,7 @@ typedef u16 (*select_queue_fallback_t)(s
@@ -853,6 +853,7 @@ typedef u16 (*select_queue_fallback_t)(s
enum net_device_path_type {
DEV_PATH_ETHERNET = 0,
DEV_PATH_VLAN,
@@ -15,7 +15,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -862,10 +862,20 @@ struct net_device_path {
@@ -864,10 +864,20 @@ struct net_device_path {
u16 id;
__be16 proto;
} encap;

@@ -36,7 +36,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

struct net_device_path_stack {
int num_paths;
@@ -875,6 +885,12 @@ struct net_device_path_stack {
@@ -877,6 +887,12 @@ struct net_device_path_stack {
struct net_device_path_ctx {
const struct net_device *dev;
const u8 *daddr;

@@ -83,7 +83,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
static int pppoe_recvmsg(struct socket *sock, struct msghdr *m,
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -852,6 +852,7 @@ enum net_device_path_type {
@@ -854,6 +854,7 @@ enum net_device_path_type {
DEV_PATH_ETHERNET = 0,
DEV_PATH_VLAN,
DEV_PATH_BRIDGE,

@@ -91,7 +91,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
};

struct net_device_path {
@@ -861,6 +862,7 @@ struct net_device_path {
@@ -863,6 +864,7 @@ struct net_device_path {
struct {
u16 id;
__be16 proto;

@@ -10,7 +10,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -853,6 +853,7 @@ enum net_device_path_type {
@@ -855,6 +855,7 @@ enum net_device_path_type {
DEV_PATH_VLAN,
DEV_PATH_BRIDGE,
DEV_PATH_PPPOE,

@@ -18,7 +18,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
};

struct net_device_path {
@@ -873,6 +874,10 @@ struct net_device_path {
@@ -875,6 +876,10 @@ struct net_device_path {
u16 vlan_id;
__be16 vlan_proto;
} bridge;

@@ -12,7 +12,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -870,6 +870,7 @@ struct net_device_path {
@@ -872,6 +872,7 @@ struct net_device_path {
DEV_PATH_BR_VLAN_KEEP,
DEV_PATH_BR_VLAN_TAG,
DEV_PATH_BR_VLAN_UNTAG,

@@ -18,7 +18,7 @@ Link: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit

--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1129,10 +1129,8 @@ void fib_add_ifaddr(struct in_ifaddr *if
@@ -1132,10 +1132,8 @@ void fib_add_ifaddr(struct in_ifaddr *if
prefix, ifa->ifa_prefixlen, prim,
ifa->ifa_rt_priority);
@ -1,104 +1,360 @@
|
|||
From a8e6015d9534f39abc08e6804566af059e498a60 Mon Sep 17 00:00:00 2001
|
||||
From a4103262b01a1b8704b37c01c7c813df91b7b119 Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Wed, 4 Aug 2021 01:31:34 -0600
|
||||
Subject: [PATCH 01/10] mm: x86, arm64: add arch_has_hw_pte_young()
|
||||
Date: Sun, 18 Sep 2022 01:59:58 -0600
|
||||
Subject: [PATCH 01/29] mm: x86, arm64: add arch_has_hw_pte_young()
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Some architectures automatically set the accessed bit in PTEs, e.g.,
|
||||
x86 and arm64 v8.2. On architectures that do not have this capability,
|
||||
clearing the accessed bit in a PTE triggers a page fault following the
|
||||
TLB miss of this PTE.
|
||||
Patch series "Multi-Gen LRU Framework", v14.
|
||||
|
||||
Being aware of this capability can help make better decisions, i.e.,
|
||||
whether to limit the size of each batch of PTEs and the burst of
|
||||
batches when clearing the accessed bit.
|
||||
What's new
|
||||
==========
|
||||
1. OpenWrt, in addition to Android, Arch Linux Zen, Armbian, ChromeOS,
|
||||
Liquorix, post-factum and XanMod, is now shipping MGLRU on 5.15.
|
||||
2. Fixed long-tailed direct reclaim latency seen on high-memory (TBs)
|
||||
machines. The old direct reclaim backoff, which tries to enforce a
|
||||
minimum fairness among all eligible memcgs, over-swapped by about
|
||||
(total_mem>>DEF_PRIORITY)-nr_to_reclaim. The new backoff, which
|
||||
pulls the plug on swapping once the target is met, trades some
|
||||
fairness for curtailed latency:
|
||||
https://lore.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com/
|
||||
3. Fixed minior build warnings and conflicts. More comments and nits.
|
||||
|
||||
TLDR
|
||||
====
|
||||
The current page reclaim is too expensive in terms of CPU usage and it
|
||||
often makes poor choices about what to evict. This patchset offers an
|
||||
alternative solution that is performant, versatile and
|
||||
straightforward.
|
||||
|
||||
Patchset overview
|
||||
=================
|
||||
The design and implementation overview is in patch 14:
|
||||
https://lore.kernel.org/r/20220918080010.2920238-15-yuzhao@google.com/
|
||||
|
||||
01. mm: x86, arm64: add arch_has_hw_pte_young()
|
||||
02. mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
|
||||
Take advantage of hardware features when trying to clear the accessed
|
||||
bit in many PTEs.
|
||||
|
||||
03. mm/vmscan.c: refactor shrink_node()
|
||||
04. Revert "include/linux/mm_inline.h: fold __update_lru_size() into
|
||||
its sole caller"
|
||||
Minor refactors to improve readability for the following patches.
|
||||
|
||||
05. mm: multi-gen LRU: groundwork
|
||||
Adds the basic data structure and the functions that insert pages to
|
||||
and remove pages from the multi-gen LRU (MGLRU) lists.
|
||||
|
||||
06. mm: multi-gen LRU: minimal implementation
|
||||
A minimal implementation without optimizations.
|
||||
|
||||
07. mm: multi-gen LRU: exploit locality in rmap
|
||||
Exploits spatial locality to improve efficiency when using the rmap.
|
||||
|
||||
08. mm: multi-gen LRU: support page table walks
|
||||
Further exploits spatial locality by optionally scanning page tables.
|
||||
|
||||
09. mm: multi-gen LRU: optimize multiple memcgs
|
||||
Optimizes the overall performance for multiple memcgs running mixed
|
||||
types of workloads.
|
||||
|
||||
10. mm: multi-gen LRU: kill switch
|
||||
Adds a kill switch to enable or disable MGLRU at runtime.
|
||||
|
||||
11. mm: multi-gen LRU: thrashing prevention
|
||||
12. mm: multi-gen LRU: debugfs interface
|
||||
Provide userspace with features like thrashing prevention, working set
|
||||
estimation and proactive reclaim.
|
||||
|
||||
13. mm: multi-gen LRU: admin guide
|
||||
14. mm: multi-gen LRU: design doc
|
||||
Add an admin guide and a design doc.
|
||||
|
||||
Benchmark results
|
||||
=================
|
||||
Independent lab results
|
||||
-----------------------
|
||||
Based on the popularity of searches [01] and the memory usage in
|
||||
Google's public cloud, the most popular open-source memory-hungry
|
||||
applications, in alphabetical order, are:
|
||||
Apache Cassandra Memcached
|
||||
Apache Hadoop MongoDB
|
||||
Apache Spark PostgreSQL
|
||||
MariaDB (MySQL) Redis
|
||||
|
||||
An independent lab evaluated MGLRU with the most widely used benchmark
|
||||
suites for the above applications. They posted 960 data points along
|
||||
with kernel metrics and perf profiles collected over more than 500
|
||||
hours of total benchmark time. Their final reports show that, with 95%
|
||||
confidence intervals (CIs), the above applications all performed
|
||||
significantly better for at least part of their benchmark matrices.
|
||||
|
||||
On 5.14:
|
||||
1. Apache Spark [02] took 95% CIs [9.28, 11.19]% and [12.20, 14.93]%
|
||||
less wall time to sort three billion random integers, respectively,
|
||||
under the medium- and the high-concurrency conditions, when
|
||||
overcommitting memory. There were no statistically significant
|
||||
changes in wall time for the rest of the benchmark matrix.
|
||||
2. MariaDB [03] achieved 95% CIs [5.24, 10.71]% and [20.22, 25.97]%
|
||||
more transactions per minute (TPM), respectively, under the medium-
|
||||
and the high-concurrency conditions, when overcommitting memory.
|
||||
There were no statistically significant changes in TPM for the rest
|
||||
of the benchmark matrix.
|
||||
3. Memcached [04] achieved 95% CIs [23.54, 32.25]%, [20.76, 41.61]%
|
||||
and [21.59, 30.02]% more operations per second (OPS), respectively,
|
||||
for sequential access, random access and Gaussian (distribution)
|
||||
access, when THP=always; 95% CIs [13.85, 15.97]% and
|
||||
[23.94, 29.92]% more OPS, respectively, for random access and
|
||||
Gaussian access, when THP=never. There were no statistically
|
||||
significant changes in OPS for the rest of the benchmark matrix.
|
||||
4. MongoDB [05] achieved 95% CIs [2.23, 3.44]%, [6.97, 9.73]% and
|
||||
[2.16, 3.55]% more operations per second (OPS), respectively, for
|
||||
exponential (distribution) access, random access and Zipfian
|
||||
(distribution) access, when underutilizing memory; 95% CIs
|
||||
[8.83, 10.03]%, [21.12, 23.14]% and [5.53, 6.46]% more OPS,
|
||||
respectively, for exponential access, random access and Zipfian
|
||||
access, when overcommitting memory.
|
||||
|
||||
On 5.15:
|
||||
5. Apache Cassandra [06] achieved 95% CIs [1.06, 4.10]%, [1.94, 5.43]%
|
||||
and [4.11, 7.50]% more operations per second (OPS), respectively,
|
||||
for exponential (distribution) access, random access and Zipfian
|
||||
(distribution) access, when swap was off; 95% CIs [0.50, 2.60]%,
|
||||
[6.51, 8.77]% and [3.29, 6.75]% more OPS, respectively, for
|
||||
exponential access, random access and Zipfian access, when swap was
|
||||
on.
|
||||
6. Apache Hadoop [07] took 95% CIs [5.31, 9.69]% and [2.02, 7.86]%
|
||||
less average wall time to finish twelve parallel TeraSort jobs,
|
||||
respectively, under the medium- and the high-concurrency
|
||||
conditions, when swap was on. There were no statistically
|
||||
significant changes in average wall time for the rest of the
|
||||
benchmark matrix.
|
||||
7. PostgreSQL [08] achieved 95% CI [1.75, 6.42]% more transactions per
|
||||
minute (TPM) under the high-concurrency condition, when swap was
|
||||
off; 95% CIs [12.82, 18.69]% and [22.70, 46.86]% more TPM,
|
||||
respectively, under the medium- and the high-concurrency
|
||||
conditions, when swap was on. There were no statistically
|
||||
significant changes in TPM for the rest of the benchmark matrix.
|
||||
8. Redis [09] achieved 95% CIs [0.58, 5.94]%, [6.55, 14.58]% and
|
||||
[11.47, 19.36]% more total operations per second (OPS),
|
||||
respectively, for sequential access, random access and Gaussian
|
||||
(distribution) access, when THP=always; 95% CIs [1.27, 3.54]%,
|
||||
[10.11, 14.81]% and [8.75, 13.64]% more total OPS, respectively,
|
||||
for sequential access, random access and Gaussian access, when
|
||||
THP=never.
|
||||
|
||||
Our lab results
|
||||
---------------
|
||||
To supplement the above results, we ran the following benchmark suites
|
||||
on 5.16-rc7 and found no regressions [10].
|
||||
fs_fio_bench_hdd_mq pft
|
||||
fs_lmbench pgsql-hammerdb
|
||||
fs_parallelio redis
|
||||
fs_postmark stream
|
||||
hackbench sysbenchthread
|
||||
kernbench tpcc_spark
|
||||
memcached unixbench
|
||||
multichase vm-scalability
|
||||
mutilate will-it-scale
|
||||
nginx
|
||||
|
||||
[01] https://trends.google.com
|
||||
[02] https://lore.kernel.org/r/20211102002002.92051-1-bot@edi.works/
|
||||
[03] https://lore.kernel.org/r/20211009054315.47073-1-bot@edi.works/
|
||||
[04] https://lore.kernel.org/r/20211021194103.65648-1-bot@edi.works/
|
||||
[05] https://lore.kernel.org/r/20211109021346.50266-1-bot@edi.works/
|
||||
[06] https://lore.kernel.org/r/20211202062806.80365-1-bot@edi.works/
|
||||
[07] https://lore.kernel.org/r/20211209072416.33606-1-bot@edi.works/
|
||||
[08] https://lore.kernel.org/r/20211218071041.24077-1-bot@edi.works/
|
||||
[09] https://lore.kernel.org/r/20211122053248.57311-1-bot@edi.works/
|
||||
[10] https://lore.kernel.org/r/20220104202247.2903702-1-yuzhao@google.com/
|
||||
|
||||
Read-world applications
|
||||
=======================
|
||||
Third-party testimonials
|
||||
------------------------
|
||||
Konstantin reported [11]:
|
||||
I have Archlinux with 8G RAM + zswap + swap. While developing, I
|
||||
have lots of apps opened such as multiple LSP-servers for different
|
||||
langs, chats, two browsers, etc... Usually, my system gets quickly
|
||||
to a point of SWAP-storms, where I have to kill LSP-servers,
|
||||
restart browsers to free memory, etc, otherwise the system lags
|
||||
heavily and is barely usable.
|
||||
|
||||
1.5 day ago I migrated from 5.11.15 kernel to 5.12 + the LRU
|
||||
patchset, and I started up by opening lots of apps to create memory
|
||||
pressure, and worked for a day like this. Till now I had not a
|
||||
single SWAP-storm, and mind you I got 3.4G in SWAP. I was never
|
||||
getting to the point of 3G in SWAP before without a single
|
||||
SWAP-storm.
|
||||
|
||||
Vaibhav from IBM reported [12]:
|
||||
In a synthetic MongoDB Benchmark, seeing an average of ~19%
|
||||
throughput improvement on POWER10(Radix MMU + 64K Page Size) with
|
||||
MGLRU patches on top of 5.16 kernel for MongoDB + YCSB across
|
||||
three different request distributions, namely, Exponential, Uniform
|
||||
and Zipfan.
|
||||
|
||||
Shuang from U of Rochester reported [13]:
|
||||
With the MGLRU, fio achieved 95% CIs [38.95, 40.26]%, [4.12, 6.64]%
|
||||
and [9.26, 10.36]% higher throughput, respectively, for random
|
||||
access, Zipfian (distribution) access and Gaussian (distribution)
|
||||
access, when the average number of jobs per CPU is 1; 95% CIs
|
||||
[42.32, 49.15]%, [9.44, 9.89]% and [20.99, 22.86]% higher
|
||||
throughput, respectively, for random access, Zipfian access and
|
||||
Gaussian access, when the average number of jobs per CPU is 2.
|
||||
|
||||
Daniel from Michigan Tech reported [14]:
|
||||
With Memcached allocating ~100GB of byte-addressable Optante,
|
||||
performance improvement in terms of throughput (measured as queries
|
||||
per second) was about 10% for a series of workloads.
|
||||
|
||||
Large-scale deployments
|
||||
-----------------------
|
||||
We've rolled out MGLRU to tens of millions of ChromeOS users and
|
||||
about a million Android users. Google's fleetwide profiling [15] shows
|
||||
an overall 40% decrease in kswapd CPU usage, in addition to
|
||||
improvements in other UX metrics, e.g., an 85% decrease in the number
|
||||
of low-memory kills at the 75th percentile and an 18% decrease in
|
||||
app launch time at the 50th percentile.
|
||||
|
||||
The downstream kernels that have been using MGLRU include:
|
||||
1. Android [16]
|
||||
2. Arch Linux Zen [17]
|
||||
3. Armbian [18]
|
||||
4. ChromeOS [19]
|
||||
5. Liquorix [20]
|
||||
6. OpenWrt [21]
|
||||
7. post-factum [22]
|
||||
8. XanMod [23]
|
||||
|
||||
[11] https://lore.kernel.org/r/140226722f2032c86301fbd326d91baefe3d7d23.camel@yandex.ru/
|
||||
[12] https://lore.kernel.org/r/87czj3mux0.fsf@vajain21.in.ibm.com/
|
||||
[13] https://lore.kernel.org/r/20220105024423.26409-1-szhai2@cs.rochester.edu/
|
||||
[14] https://lore.kernel.org/r/CA+4-3vksGvKd18FgRinxhqHetBS1hQekJE2gwco8Ja-bJWKtFw@mail.gmail.com/
|
||||
[15] https://dl.acm.org/doi/10.1145/2749469.2750392
|
||||
[16] https://android.com
|
||||
[17] https://archlinux.org
|
||||
[18] https://armbian.com
|
||||
[19] https://chromium.org
|
||||
[20] https://liquorix.net
|
||||
[21] https://openwrt.org
|
||||
[22] https://codeberg.org/pf-kernel
|
||||
[23] https://xanmod.org
|
||||
|
||||
Summary
|
||||
=======
|
||||
The facts are:
|
||||
1. The independent lab results and the real-world applications
|
||||
indicate substantial improvements; there are no known regressions.
|
||||
2. Thrashing prevention, working set estimation and proactive reclaim
|
||||
work out of the box; there are no equivalent solutions.
|
||||
3. There is a lot of new code; no smaller changes have been
|
||||
demonstrated similar effects.
|
||||
|
||||
Our options, accordingly, are:
|
||||
1. Given the amount of evidence, the reported improvements will likely
|
||||
materialize for a wide range of workloads.
|
||||
2. Gauging the interest from the past discussions, the new features
|
||||
will likely be put to use for both personal computers and data
|
||||
centers.
|
||||
3. Based on Google's track record, the new code will likely be well
|
||||
maintained in the long term. It'd be more difficult if not
|
||||
impossible to achieve similar effects with other approaches.
|
||||
|
||||
This patch (of 14):
|
||||
|
||||
Some architectures automatically set the accessed bit in PTEs, e.g., x86
|
||||
and arm64 v8.2. On architectures that do not have this capability,
|
||||
clearing the accessed bit in a PTE usually triggers a page fault following
|
||||
the TLB miss of this PTE (to emulate the accessed bit).
|
||||
|
||||
Being aware of this capability can help make better decisions, e.g.,
|
||||
whether to spread the work out over a period of time to reduce bursty page
|
||||
faults when trying to clear the accessed bit in many PTEs.
|
||||
|
||||
Note that theoretically this capability can be unreliable, e.g.,
|
||||
hotplugged CPUs might be different from builtin ones. Therefore it should
|
||||
not be used in architecture-independent code that involves correctness,
|
||||
e.g., to determine whether TLB flushes are required (in combination with
|
||||
the accessed bit).
|
||||
|
||||
Link: https://lkml.kernel.org/r/20220918080010.2920238-1-yuzhao@google.com
Link: https://lkml.kernel.org/r/20220918080010.2920238-2-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Acked-by: Will Deacon <will@kernel.org>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/include/asm/cpufeature.h |  5 +++++
 arch/arm64/include/asm/pgtable.h    | 13 ++++++++-----
 arch/arm64/kernel/cpufeature.c      | 10 ++++++++++
 arch/arm64/tools/cpucaps            |  1 +
 arch/x86/include/asm/pgtable.h      |  6 +++---
 include/linux/pgtable.h             | 13 +++++++++++++
 mm/memory.c                         | 14 +-------------
 7 files changed, 41 insertions(+), 21 deletions(-)
 arch/arm64/include/asm/pgtable.h |  14 ++------------
 arch/x86/include/asm/pgtable.h   |   6 +++---
 include/linux/pgtable.h          |  13 +++++++++++++
 mm/memory.c                      |  14 +-------------
 4 files changed, 19 insertions(+), 28 deletions(-)

--- a/arch/arm64/include/asm/cpufeature.h
|
||||
+++ b/arch/arm64/include/asm/cpufeature.h
|
||||
@@ -808,6 +808,11 @@ static inline bool system_supports_tlb_r
|
||||
cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
|
||||
}
|
||||
|
||||
+static inline bool system_has_hw_af(void)
|
||||
+{
|
||||
+ return IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && cpus_have_const_cap(ARM64_HW_AF);
|
||||
+}
|
||||
+
|
||||
extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
|
||||
|
||||
static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
|
||||
--- a/arch/arm64/include/asm/pgtable.h
|
||||
+++ b/arch/arm64/include/asm/pgtable.h
|
||||
@@ -999,13 +999,16 @@ static inline void update_mmu_cache(stru
|
||||
@@ -999,23 +999,13 @@ static inline void update_mmu_cache(stru
|
||||
* page after fork() + CoW for pfn mappings. We don't always have a
|
||||
* hardware-managed access flag on arm64.
|
||||
*/
|
||||
-static inline bool arch_faults_on_old_pte(void)
|
||||
+static inline bool arch_has_hw_pte_young(bool local)
|
||||
{
|
||||
-{
|
||||
- WARN_ON(preemptible());
|
||||
+ if (local) {
|
||||
+ WARN_ON(preemptible());
|
||||
+ return cpu_has_hw_af();
|
||||
+ }
|
||||
|
||||
-
|
||||
- return !cpu_has_hw_af();
|
||||
+ return system_has_hw_af();
|
||||
}
|
||||
-}
|
||||
-#define arch_faults_on_old_pte arch_faults_on_old_pte
|
||||
+#define arch_has_hw_pte_young arch_has_hw_pte_young
|
||||
+#define arch_has_hw_pte_young cpu_has_hw_af
|
||||
|
||||
/*
|
||||
* Experimentally, it's cheap to set the access flag in hardware and we
|
||||
@@ -1013,7 +1016,7 @@ static inline bool arch_faults_on_old_pt
|
||||
* benefit from prefaulting mappings as 'old' to start with.
|
||||
*/
|
||||
static inline bool arch_wants_old_prefaulted_pte(void)
|
||||
{
|
||||
-static inline bool arch_wants_old_prefaulted_pte(void)
|
||||
-{
|
||||
- return !arch_faults_on_old_pte();
|
||||
+ return arch_has_hw_pte_young(true);
|
||||
}
|
||||
#define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte
|
||||
-}
|
||||
-#define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte
|
||||
+#define arch_wants_old_prefaulted_pte cpu_has_hw_af
|
||||
|
||||
#endif /* !__ASSEMBLY__ */
|
||||
|
||||
--- a/arch/arm64/kernel/cpufeature.c
|
||||
+++ b/arch/arm64/kernel/cpufeature.c
|
||||
@@ -2197,6 +2197,16 @@ static const struct arm64_cpu_capabiliti
|
||||
.matches = has_hw_dbm,
|
||||
.cpu_enable = cpu_enable_hw_dbm,
|
||||
},
|
||||
+ {
|
||||
+ .desc = "Hardware update of the Access flag",
|
||||
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
|
||||
+ .capability = ARM64_HW_AF,
|
||||
+ .sys_reg = SYS_ID_AA64MMFR1_EL1,
|
||||
+ .sign = FTR_UNSIGNED,
|
||||
+ .field_pos = ID_AA64MMFR1_HADBS_SHIFT,
|
||||
+ .min_field_value = 1,
|
||||
+ .matches = has_cpuid_feature,
|
||||
+ },
|
||||
#endif
|
||||
{
|
||||
.desc = "CRC32 instructions",
|
||||
--- a/arch/arm64/tools/cpucaps
|
||||
+++ b/arch/arm64/tools/cpucaps
|
||||
@@ -35,6 +35,7 @@ HAS_STAGE2_FWB
|
||||
HAS_SYSREG_GIC_CPUIF
|
||||
HAS_TLB_RANGE
|
||||
HAS_VIRT_HOST_EXTN
|
||||
+HW_AF
|
||||
HW_DBM
|
||||
KVM_PROTECTED_MODE
|
||||
MISMATCHED_CACHE_TYPE
|
||||
--- a/arch/x86/include/asm/pgtable.h
|
||||
+++ b/arch/x86/include/asm/pgtable.h
|
||||
@@ -1397,10 +1397,10 @@ static inline bool arch_has_pfn_modify_c
|
||||
|
@ -108,7 +364,7 @@ Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a
|
|||
-#define arch_faults_on_old_pte arch_faults_on_old_pte
|
||||
-static inline bool arch_faults_on_old_pte(void)
|
||||
+#define arch_has_hw_pte_young arch_has_hw_pte_young
|
||||
+static inline bool arch_has_hw_pte_young(bool local)
|
||||
+static inline bool arch_has_hw_pte_young(void)
|
||||
{
|
||||
- return false;
|
||||
+ return true;
|
||||
|
@ -123,12 +379,12 @@ Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a
|
|||
|
||||
+#ifndef arch_has_hw_pte_young
|
||||
+/*
|
||||
+ * Return whether the accessed bit is supported by the local CPU or all CPUs.
|
||||
+ * Return whether the accessed bit is supported on the local CPU.
|
||||
+ *
|
||||
+ * Those arches which have hw access flag feature need to implement their own
|
||||
+ * helper. By default, "false" means pagefault will be hit on old pte.
|
||||
+ * This stub assumes accessing through an old PTE triggers a page fault.
|
||||
+ * Architectures that automatically set the access bit should overwrite it.
|
||||
+ */
|
||||
+static inline bool arch_has_hw_pte_young(bool local)
|
||||
+static inline bool arch_has_hw_pte_young(void)
|
||||
+{
|
||||
+ return false;
|
||||
+}
|
||||
|
@ -163,7 +419,7 @@ Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a
|
|||
* take a double page fault, so mark it accessed here.
|
||||
*/
|
||||
- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
|
||||
+ if (!arch_has_hw_pte_young(true) && !pte_young(vmf->orig_pte)) {
|
||||
+ if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
|
||||
pte_t entry;
|
||||
|
||||
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
|
||||
|
|
|
@@ -1,47 +1,89 @@
From f8b663bbfa30af5515e222fd74df20ea4e8393a2 Mon Sep 17 00:00:00 2001
From 493de1c4b0f2cd909169401da8c445f6c8a7e29d Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sat, 26 Sep 2020 21:17:18 -0600
Subject: [PATCH 02/10] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
Date: Sun, 18 Sep 2022 01:59:59 -0600
Subject: [PATCH 02/29] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some architectures support the accessed bit on non-leaf PMD entries,
e.g., x86_64 sets the accessed bit on a non-leaf PMD entry when using
it as part of linear address translation [1]. As an optimization, page
table walkers who are interested in the accessed bit can skip the PTEs
under a non-leaf PMD entry if the accessed bit is cleared on this PMD
entry.
Some architectures support the accessed bit in non-leaf PMD entries, e.g.,
x86 sets the accessed bit in a non-leaf PMD entry when using it as part of
linear address translation [1]. Page table walkers that clear the
accessed bit may use this capability to reduce their search space.

Although an inline function may be preferable, this capability is
added as a configuration option to look consistent when used with the
existing macros.
Note that:
1. Although an inline function is preferable, this capability is added
   as a configuration option for consistency with the existing macros.
2. Due to the little interest in other varieties, this capability was
   only tested on Intel and AMD CPUs.

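For illustration only (not taken from this patch), a page table walker might
use the new option as sketched below; pmd_worth_scanning() is a hypothetical
helper, and the sketch assumes pmd_young() is available on architectures that
select the option:

	/*
	 * Hypothetical sketch: skip the PTEs under a non-leaf PMD entry
	 * whose accessed bit is clear, since none of them can have been
	 * accessed through that entry since the bit was last cleared.
	 */
	static bool pmd_worth_scanning(pmd_t *pmd)
	{
		if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && !pmd_young(*pmd))
			return false;

		return true;
	}
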
Thanks to the following developers for their efforts [2][3].
  Randy Dunlap <rdunlap@infradead.org>
  Stephen Rothwell <sfr@canb.auug.org.au>

[1]: Intel 64 and IA-32 Architectures Software Developer's Manual
     Volume 3 (June 2021), section 4.8
[2] https://lore.kernel.org/r/bfdcc7c8-922f-61a9-aa15-7e7250f04af7@infradead.org/
[3] https://lore.kernel.org/r/20220413151513.5a0d7a7e@canb.auug.org.au/

Link: https://lkml.kernel.org/r/20220918080010.2920238-3-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Reviewed-by: Barry Song <baohua@kernel.org>
|
||||
Acked-by: Brian Geffon <bgeffon@google.com>
|
||||
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
||||
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
Acked-by: Steven Barrett <steven@liquorix.net>
|
||||
Acked-by: Suleiman Souhlal <suleiman@google.com>
|
||||
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
|
||||
Tested-by: Donald Carr <d@chaos-reins.com>
|
||||
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
|
||||
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
||||
Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8
|
||||
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
|
||||
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
|
||||
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
|
||||
Cc: Andi Kleen <ak@linux.intel.com>
|
||||
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
|
||||
Cc: Catalin Marinas <catalin.marinas@arm.com>
|
||||
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Cc: Hillf Danton <hdanton@sina.com>
|
||||
Cc: Jens Axboe <axboe@kernel.dk>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
Cc: Matthew Wilcox <willy@infradead.org>
|
||||
Cc: Mel Gorman <mgorman@suse.de>
|
||||
Cc: Miaohe Lin <linmiaohe@huawei.com>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@linux.ibm.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
|
||||
Cc: Tejun Heo <tj@kernel.org>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: Will Deacon <will@kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
arch/Kconfig | 9 +++++++++
|
||||
arch/Kconfig | 8 ++++++++
|
||||
arch/x86/Kconfig | 1 +
|
||||
arch/x86/include/asm/pgtable.h | 3 ++-
|
||||
arch/x86/mm/pgtable.c | 5 ++++-
|
||||
include/linux/pgtable.h | 4 ++--
|
||||
5 files changed, 18 insertions(+), 4 deletions(-)
|
||||
5 files changed, 17 insertions(+), 4 deletions(-)
|
||||
|
||||
--- a/arch/Kconfig
|
||||
+++ b/arch/Kconfig
|
||||
@@ -1295,6 +1295,15 @@ config ARCH_HAS_ELFCORE_COMPAT
|
||||
@@ -1295,6 +1295,14 @@ config ARCH_HAS_ELFCORE_COMPAT
|
||||
config ARCH_HAS_PARANOID_L1D_FLUSH
|
||||
bool
|
||||
|
||||
+config ARCH_HAS_NONLEAF_PMD_YOUNG
|
||||
+ bool
|
||||
+ depends on PGTABLE_LEVELS > 2
|
||||
+ help
|
||||
+ Architectures that select this are able to set the accessed bit on
|
||||
+ non-leaf PMD entries in addition to leaf PTE entries where pages are
|
||||
+ mapped. For them, page table walkers that clear the accessed bit may
|
||||
+ stop at non-leaf PMD entries if they do not see the accessed bit.
|
||||
+ Architectures that select this option are capable of setting the
|
||||
+ accessed bit in non-leaf PMD entries when using them as part of linear
|
||||
+ address translations. Page table walkers that clear the accessed bit
|
||||
+ may use this capability to reduce their search space.
|
||||
+
|
||||
source "kernel/gcov/Kconfig"
|
||||
|
||||
|
@ -52,7 +94,7 @@ Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8
|
|||
select ARCH_HAS_PMEM_API if X86_64
|
||||
select ARCH_HAS_PTE_DEVMAP if X86_64
|
||||
select ARCH_HAS_PTE_SPECIAL
|
||||
+ select ARCH_HAS_NONLEAF_PMD_YOUNG if X86_64
|
||||
+ select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2
|
||||
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
|
||||
select ARCH_HAS_COPY_MC if X86_64
|
||||
select ARCH_HAS_SET_MEMORY
|
||||
|
|
|
@@ -1,21 +1,58 @@
From a810f8e2f1bdd0707eaf05c8b4ba84a3ff2801bd Mon Sep 17 00:00:00 2001
From 9e17efd11450d3d2069adaa3c58db9ac8ebd1c66 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 27 Sep 2020 20:49:08 -0600
Subject: [PATCH 03/10] mm/vmscan.c: refactor shrink_node()
Date: Sun, 18 Sep 2022 02:00:00 -0600
Subject: [PATCH 03/29] mm/vmscan.c: refactor shrink_node()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch refactors shrink_node(). This will make the upcoming
changes to mm/vmscan.c more readable.
This patch refactors shrink_node() to improve readability for the upcoming
changes to mm/vmscan.c.

Link: https://lkml.kernel.org/r/20220918080010.2920238-4-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Reviewed-by: Barry Song <baohua@kernel.org>
|
||||
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
|
||||
Acked-by: Brian Geffon <bgeffon@google.com>
|
||||
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
||||
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
Acked-by: Steven Barrett <steven@liquorix.net>
|
||||
Acked-by: Suleiman Souhlal <suleiman@google.com>
|
||||
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
|
||||
Tested-by: Donald Carr <d@chaos-reins.com>
|
||||
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
|
||||
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
||||
Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04
|
||||
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
|
||||
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
|
||||
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
|
||||
Cc: Andi Kleen <ak@linux.intel.com>
|
||||
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
|
||||
Cc: Catalin Marinas <catalin.marinas@arm.com>
|
||||
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Cc: Hillf Danton <hdanton@sina.com>
|
||||
Cc: Jens Axboe <axboe@kernel.dk>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
Cc: Matthew Wilcox <willy@infradead.org>
|
||||
Cc: Mel Gorman <mgorman@suse.de>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@linux.ibm.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
|
||||
Cc: Tejun Heo <tj@kernel.org>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: Will Deacon <will@kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/vmscan.c | 186 +++++++++++++++++++++++++++-------------------------
|
||||
1 file changed, 98 insertions(+), 88 deletions(-)
|
||||
mm/vmscan.c | 198 +++++++++++++++++++++++++++-------------------------
|
||||
1 file changed, 104 insertions(+), 94 deletions(-)
|
||||
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -2497,6 +2497,103 @@ enum scan_balance {
|
||||
@@ -2497,6 +2497,109 @@ enum scan_balance {
|
||||
SCAN_FILE,
|
||||
};
|
||||
|
||||
|
@ -27,6 +64,12 @@ Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04
|
|||
+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
|
||||
+
|
||||
+ /*
|
||||
+ * Flush the memory cgroup stats, so that we read accurate per-memcg
|
||||
+ * lruvec stats for heuristics.
|
||||
+ */
|
||||
+ mem_cgroup_flush_stats();
|
||||
+
|
||||
+ /*
|
||||
+ * Determine the scan balance between anon and file LRUs.
|
||||
+ */
|
||||
+ spin_lock_irq(&target_lruvec->lru_lock);
|
||||
|
@ -119,7 +162,7 @@ Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04
|
|||
/*
|
||||
* Determine how aggressively the anon and file LRU lists should be
|
||||
* scanned. The relative value of each set of LRU lists is determined
|
||||
@@ -2965,7 +3062,6 @@ static void shrink_node(pg_data_t *pgdat
|
||||
@@ -2965,109 +3068,16 @@ static void shrink_node(pg_data_t *pgdat
|
||||
unsigned long nr_reclaimed, nr_scanned;
|
||||
struct lruvec *target_lruvec;
|
||||
bool reclaimable = false;
|
||||
|
@ -127,7 +170,15 @@ Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04
|
|||
|
||||
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
|
||||
|
||||
@@ -2981,93 +3077,7 @@ again:
|
||||
again:
|
||||
- /*
|
||||
- * Flush the memory cgroup stats, so that we read accurate per-memcg
|
||||
- * lruvec stats for heuristics.
|
||||
- */
|
||||
- mem_cgroup_flush_stats();
|
||||
-
|
||||
memset(&sc->nr, 0, sizeof(sc->nr));
|
||||
|
||||
nr_reclaimed = sc->nr_reclaimed;
|
||||
nr_scanned = sc->nr_scanned;
|
||||
|
||||
|
|
|
@@ -0,0 +1,82 @@
From 03705be42114db7cc5bd6eb7bf7e8703c94d4880 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:01 -0600
Subject: [PATCH 04/29] Revert "include/linux/mm_inline.h: fold
 __update_lru_size() into its sole caller"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch undoes the following refactor: commit 289ccba18af4
("include/linux/mm_inline.h: fold __update_lru_size() into its sole
caller")

The upcoming changes to include/linux/mm_inline.h will reuse
__update_lru_size().

Link: https://lkml.kernel.org/r/20220918080010.2920238-5-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
|
||||
Acked-by: Brian Geffon <bgeffon@google.com>
|
||||
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
||||
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
Acked-by: Steven Barrett <steven@liquorix.net>
|
||||
Acked-by: Suleiman Souhlal <suleiman@google.com>
|
||||
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
|
||||
Tested-by: Donald Carr <d@chaos-reins.com>
|
||||
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
|
||||
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
||||
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
|
||||
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
|
||||
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
|
||||
Cc: Andi Kleen <ak@linux.intel.com>
|
||||
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
|
||||
Cc: Barry Song <baohua@kernel.org>
|
||||
Cc: Catalin Marinas <catalin.marinas@arm.com>
|
||||
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Cc: Hillf Danton <hdanton@sina.com>
|
||||
Cc: Jens Axboe <axboe@kernel.dk>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
Cc: Matthew Wilcox <willy@infradead.org>
|
||||
Cc: Mel Gorman <mgorman@suse.de>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@linux.ibm.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
|
||||
Cc: Tejun Heo <tj@kernel.org>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: Will Deacon <will@kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/mm_inline.h | 9 ++++++++-
|
||||
1 file changed, 8 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/include/linux/mm_inline.h
|
||||
+++ b/include/linux/mm_inline.h
|
||||
@@ -24,7 +24,7 @@ static inline int page_is_file_lru(struc
|
||||
return !PageSwapBacked(page);
|
||||
}
|
||||
|
||||
-static __always_inline void update_lru_size(struct lruvec *lruvec,
|
||||
+static __always_inline void __update_lru_size(struct lruvec *lruvec,
|
||||
enum lru_list lru, enum zone_type zid,
|
||||
int nr_pages)
|
||||
{
|
||||
@@ -33,6 +33,13 @@ static __always_inline void update_lru_s
|
||||
__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
|
||||
__mod_zone_page_state(&pgdat->node_zones[zid],
|
||||
NR_ZONE_LRU_BASE + lru, nr_pages);
|
||||
+}
|
||||
+
|
||||
+static __always_inline void update_lru_size(struct lruvec *lruvec,
|
||||
+ enum lru_list lru, enum zone_type zid,
|
||||
+ long nr_pages)
|
||||
+{
|
||||
+ __update_lru_size(lruvec, lru, zid, nr_pages);
|
||||
#ifdef CONFIG_MEMCG
|
||||
mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
|
||||
#endif
|
|
@@ -1,996 +0,0 @@
From 05f366c941ae2bb8ba21c79fafcb747a5a6b967b Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 25 Jan 2021 21:12:33 -0700
Subject: [PATCH 04/10] mm: multigenerational lru: groundwork

For each lruvec, evictable pages are divided into multiple
generations. The youngest generation number is stored in
lrugen->max_seq for both anon and file types as they are aged on an
equal footing. The oldest generation numbers are stored in
lrugen->min_seq[] separately for anon and file types as clean file
pages can be evicted regardless of swap constraints. These three
variables are monotonically increasing. Generation numbers are
truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into
page->flags. The sliding window technique is used to prevent truncated
generation numbers from overlapping. Each truncated generation number
is an index to
lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES].

The framework comprises two conceptually independent components: the
aging, which produces young generations, and the eviction, which
consumes old generations. Both can be invoked independently from user
space for the purpose of working set estimation and proactive reclaim.

The protection of hot pages and the selection of cold pages are based
on page access types and patterns. There are two access types: one via
page tables and the other via file descriptors. The protection of the
former type is by design stronger because:
1) The uncertainty in determining the access patterns of the former
   type is higher due to the coalesced nature of the accessed bit.
2) The cost of evicting the former type is higher due to the TLB
   flushes required and the likelihood of involving I/O.
3) The penalty of under-protecting the former type is higher because
   applications usually do not prepare themselves for major faults like
   they do for blocked I/O. For example, client applications commonly
   dedicate blocked I/O to separate threads to avoid UI janks that
   negatively affect user experience.

There are also two access patterns: one with temporal locality and the
other without. The latter pattern, e.g., random and sequential, needs
to be explicitly excluded to avoid weakening the protection of the
former pattern. Generally the former type follows the former pattern
unless MADV_SEQUENTIAL is specified and the latter type follows the
latter pattern unless outlying refaults have been observed.

Upon faulting, a page is added to the youngest generation, which
provides the strongest protection as the eviction will not consider
this page before the aging has scanned it at least twice. The first
scan clears the accessed bit set during the initial fault. And the
second scan makes sure this page has not been used since the first
scan. A page from any other generations is brought back to the
youngest generation whenever the aging finds the accessed bit set on
any of the PTEs mapping this page.

Unmapped pages are initially added to the oldest generation and then
conditionally protected by tiers. This is done later [PATCH 07/10].

Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
||||
Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7
|
||||
---
|
||||
fs/fuse/dev.c | 3 +-
|
||||
include/linux/cgroup.h | 15 +-
|
||||
include/linux/mm.h | 36 ++++
|
||||
include/linux/mm_inline.h | 182 ++++++++++++++++++++
|
||||
include/linux/mmzone.h | 70 ++++++++
|
||||
include/linux/page-flags-layout.h | 19 ++-
|
||||
include/linux/page-flags.h | 4 +-
|
||||
include/linux/sched.h | 3 +
|
||||
kernel/bounds.c | 3 +
|
||||
kernel/cgroup/cgroup-internal.h | 1 -
|
||||
mm/huge_memory.c | 3 +-
|
||||
mm/memcontrol.c | 1 +
|
||||
mm/memory.c | 7 +
|
||||
mm/mm_init.c | 6 +-
|
||||
mm/page_alloc.c | 1 +
|
||||
mm/swap.c | 9 +-
|
||||
mm/swapfile.c | 2 +
|
||||
mm/vmscan.c | 268 ++++++++++++++++++++++++++++++
|
||||
18 files changed, 618 insertions(+), 15 deletions(-)
|
||||
|
||||
--- a/fs/fuse/dev.c
|
||||
+++ b/fs/fuse/dev.c
|
||||
@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *
|
||||
1 << PG_active |
|
||||
1 << PG_workingset |
|
||||
1 << PG_reclaim |
|
||||
- 1 << PG_waiters))) {
|
||||
+ 1 << PG_waiters |
|
||||
+ LRU_GEN_MASK | LRU_REFS_MASK))) {
|
||||
dump_page(page, "fuse: trying to steal weird page");
|
||||
return 1;
|
||||
}
|
||||
--- a/include/linux/cgroup.h
|
||||
+++ b/include/linux/cgroup.h
|
||||
@@ -433,6 +433,18 @@ static inline void cgroup_put(struct cgr
|
||||
css_put(&cgrp->self);
|
||||
}
|
||||
|
||||
+extern struct mutex cgroup_mutex;
|
||||
+
|
||||
+static inline void cgroup_lock(void)
|
||||
+{
|
||||
+ mutex_lock(&cgroup_mutex);
|
||||
+}
|
||||
+
|
||||
+static inline void cgroup_unlock(void)
|
||||
+{
|
||||
+ mutex_unlock(&cgroup_mutex);
|
||||
+}
|
||||
+
|
||||
/**
|
||||
* task_css_set_check - obtain a task's css_set with extra access conditions
|
||||
* @task: the task to obtain css_set for
|
||||
@@ -447,7 +459,6 @@ static inline void cgroup_put(struct cgr
|
||||
* as locks used during the cgroup_subsys::attach() methods.
|
||||
*/
|
||||
#ifdef CONFIG_PROVE_RCU
|
||||
-extern struct mutex cgroup_mutex;
|
||||
extern spinlock_t css_set_lock;
|
||||
#define task_css_set_check(task, __c) \
|
||||
rcu_dereference_check((task)->cgroups, \
|
||||
@@ -708,6 +719,8 @@ struct cgroup;
|
||||
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
|
||||
static inline void css_get(struct cgroup_subsys_state *css) {}
|
||||
static inline void css_put(struct cgroup_subsys_state *css) {}
|
||||
+static inline void cgroup_lock(void) {}
|
||||
+static inline void cgroup_unlock(void) {}
|
||||
static inline int cgroup_attach_task_all(struct task_struct *from,
|
||||
struct task_struct *t) { return 0; }
|
||||
static inline int cgroupstats_build(struct cgroupstats *stats,
|
||||
--- a/include/linux/mm.h
|
||||
+++ b/include/linux/mm.h
|
||||
@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v
|
||||
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
|
||||
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
|
||||
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
|
||||
+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
|
||||
+#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
|
||||
|
||||
/*
|
||||
* Define the bit shifts to access each section. For non-existent
|
||||
@@ -1807,6 +1809,40 @@ static inline void unmap_mapping_range(s
|
||||
loff_t const holebegin, loff_t const holelen, int even_cows) { }
|
||||
#endif
|
||||
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+static inline void task_enter_nonseq_fault(void)
|
||||
+{
|
||||
+ WARN_ON(current->in_nonseq_fault);
|
||||
+
|
||||
+ current->in_nonseq_fault = 1;
|
||||
+}
|
||||
+
|
||||
+static inline void task_exit_nonseq_fault(void)
|
||||
+{
|
||||
+ WARN_ON(!current->in_nonseq_fault);
|
||||
+
|
||||
+ current->in_nonseq_fault = 0;
|
||||
+}
|
||||
+
|
||||
+static inline bool task_in_nonseq_fault(void)
|
||||
+{
|
||||
+ return current->in_nonseq_fault;
|
||||
+}
|
||||
+#else
|
||||
+static inline void task_enter_nonseq_fault(void)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+static inline void task_exit_nonseq_fault(void)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+static inline bool task_in_nonseq_fault(void)
|
||||
+{
|
||||
+ return false;
|
||||
+}
|
||||
+#endif /* CONFIG_LRU_GEN */
|
||||
+
|
||||
static inline void unmap_shared_mapping_range(struct address_space *mapping,
|
||||
loff_t const holebegin, loff_t const holelen)
|
||||
{
|
||||
--- a/include/linux/mm_inline.h
|
||||
+++ b/include/linux/mm_inline.h
|
||||
@@ -79,11 +79,187 @@ static __always_inline enum lru_list pag
|
||||
return lru;
|
||||
}
|
||||
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+
|
||||
+static inline bool lru_gen_enabled(void)
|
||||
+{
|
||||
+#ifdef CONFIG_LRU_GEN_ENABLED
|
||||
+ DECLARE_STATIC_KEY_TRUE(lru_gen_static_key);
|
||||
+
|
||||
+ return static_branch_likely(&lru_gen_static_key);
|
||||
+#else
|
||||
+ DECLARE_STATIC_KEY_FALSE(lru_gen_static_key);
|
||||
+
|
||||
+ return static_branch_unlikely(&lru_gen_static_key);
|
||||
+#endif
|
||||
+}
|
||||
+
|
||||
+/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */
|
||||
+static inline int lru_gen_from_seq(unsigned long seq)
|
||||
+{
|
||||
+ return seq % MAX_NR_GENS;
|
||||
+}
|
||||
+
|
||||
+/* The youngest and the second youngest generations are counted as active. */
|
||||
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
|
||||
+{
|
||||
+ unsigned long max_seq = lruvec->evictable.max_seq;
|
||||
+
|
||||
+ VM_BUG_ON(gen >= MAX_NR_GENS);
|
||||
+
|
||||
+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
|
||||
+}
|
||||
+
|
||||
+/* Update the sizes of the multigenerational lru lists. */
|
||||
+static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec,
|
||||
+ int old_gen, int new_gen)
|
||||
+{
|
||||
+ int type = page_is_file_lru(page);
|
||||
+ int zone = page_zonenum(page);
|
||||
+ int delta = thp_nr_pages(page);
|
||||
+ enum lru_list lru = type * LRU_FILE;
|
||||
+ struct lrugen *lrugen = &lruvec->evictable;
|
||||
+
|
||||
+ lockdep_assert_held(&lruvec->lru_lock);
|
||||
+ VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
|
||||
+ VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
|
||||
+ VM_BUG_ON(old_gen == -1 && new_gen == -1);
|
||||
+
|
||||
+ if (old_gen >= 0)
|
||||
+ WRITE_ONCE(lrugen->sizes[old_gen][type][zone],
|
||||
+ lrugen->sizes[old_gen][type][zone] - delta);
|
||||
+ if (new_gen >= 0)
|
||||
+ WRITE_ONCE(lrugen->sizes[new_gen][type][zone],
|
||||
+ lrugen->sizes[new_gen][type][zone] + delta);
|
||||
+
|
||||
+ if (old_gen < 0) {
|
||||
+ if (lru_gen_is_active(lruvec, new_gen))
|
||||
+ lru += LRU_ACTIVE;
|
||||
+ update_lru_size(lruvec, lru, zone, delta);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ if (new_gen < 0) {
|
||||
+ if (lru_gen_is_active(lruvec, old_gen))
|
||||
+ lru += LRU_ACTIVE;
|
||||
+ update_lru_size(lruvec, lru, zone, -delta);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
|
||||
+ update_lru_size(lruvec, lru, zone, -delta);
|
||||
+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
|
||||
+ }
|
||||
+
|
||||
+ VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
|
||||
+}
|
||||
+
|
||||
+/* Add a page to one of the multigenerational lru lists. Return true on success. */
|
||||
+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
|
||||
+{
|
||||
+ int gen;
|
||||
+ unsigned long old_flags, new_flags;
|
||||
+ int type = page_is_file_lru(page);
|
||||
+ int zone = page_zonenum(page);
|
||||
+ struct lrugen *lrugen = &lruvec->evictable;
|
||||
+
|
||||
+ if (PageUnevictable(page) || !lrugen->enabled[type])
|
||||
+ return false;
|
||||
+ /*
|
||||
+ * If a page shouldn't be considered for eviction, i.e., a page mapped
|
||||
+ * upon fault during which the accessed bit is set, add it to the
|
||||
+ * youngest generation.
|
||||
+ *
|
||||
+ * If a page can't be evicted immediately, i.e., an anon page not in
|
||||
+ * swap cache or a dirty page pending writeback, add it to the second
|
||||
+ * oldest generation.
|
||||
+ *
|
||||
+ * If a page could be evicted immediately, e.g., a clean page, add it to
|
||||
+ * the oldest generation.
|
||||
+ */
|
||||
+ if (PageActive(page))
|
||||
+ gen = lru_gen_from_seq(lrugen->max_seq);
|
||||
+ else if ((!type && !PageSwapCache(page)) ||
|
||||
+ (PageReclaim(page) && (PageDirty(page) || PageWriteback(page))))
|
||||
+ gen = lru_gen_from_seq(lrugen->min_seq[type] + 1);
|
||||
+ else
|
||||
+ gen = lru_gen_from_seq(lrugen->min_seq[type]);
|
||||
+
|
||||
+ do {
|
||||
+ new_flags = old_flags = READ_ONCE(page->flags);
|
||||
+ VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page);
|
||||
+
|
||||
+ new_flags &= ~(LRU_GEN_MASK | BIT(PG_active));
|
||||
+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
|
||||
+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
|
||||
+
|
||||
+ lru_gen_update_size(page, lruvec, -1, gen);
|
||||
+ /* for rotate_reclaimable_page() */
|
||||
+ if (reclaiming)
|
||||
+ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
|
||||
+ else
|
||||
+ list_add(&page->lru, &lrugen->lists[gen][type][zone]);
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+/* Delete a page from one of the multigenerational lru lists. Return true on success. */
|
||||
+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
|
||||
+{
|
||||
+ int gen;
|
||||
+ unsigned long old_flags, new_flags;
|
||||
+
|
||||
+ do {
|
||||
+ new_flags = old_flags = READ_ONCE(page->flags);
|
||||
+ if (!(new_flags & LRU_GEN_MASK))
|
||||
+ return false;
|
||||
+
|
||||
+ VM_BUG_ON_PAGE(PageActive(page), page);
|
||||
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
|
||||
+
|
||||
+ gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
||||
+
|
||||
+ new_flags &= ~LRU_GEN_MASK;
|
||||
+ /* for shrink_page_list() */
|
||||
+ if (reclaiming)
|
||||
+ new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
|
||||
+ else if (lru_gen_is_active(lruvec, gen))
|
||||
+ new_flags |= BIT(PG_active);
|
||||
+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
|
||||
+
|
||||
+ lru_gen_update_size(page, lruvec, gen, -1);
|
||||
+ list_del(&page->lru);
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+#else
|
||||
+
|
||||
+static inline bool lru_gen_enabled(void)
|
||||
+{
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
|
||||
+{
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
|
||||
+{
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+#endif /* CONFIG_LRU_GEN */
|
||||
+
|
||||
static __always_inline void add_page_to_lru_list(struct page *page,
|
||||
struct lruvec *lruvec)
|
||||
{
|
||||
enum lru_list lru = page_lru(page);
|
||||
|
||||
+ if (lru_gen_add_page(page, lruvec, false))
|
||||
+ return;
|
||||
+
|
||||
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
|
||||
list_add(&page->lru, &lruvec->lists[lru]);
|
||||
}
|
||||
@@ -93,6 +269,9 @@ static __always_inline void add_page_to_
|
||||
{
|
||||
enum lru_list lru = page_lru(page);
|
||||
|
||||
+ if (lru_gen_add_page(page, lruvec, true))
|
||||
+ return;
|
||||
+
|
||||
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
|
||||
list_add_tail(&page->lru, &lruvec->lists[lru]);
|
||||
}
|
||||
@@ -100,6 +279,9 @@ static __always_inline void add_page_to_
|
||||
static __always_inline void del_page_from_lru_list(struct page *page,
|
||||
struct lruvec *lruvec)
|
||||
{
|
||||
+ if (lru_gen_del_page(page, lruvec, false))
|
||||
+ return;
|
||||
+
|
||||
list_del(&page->lru);
|
||||
update_lru_size(lruvec, page_lru(page), page_zonenum(page),
|
||||
-thp_nr_pages(page));
|
||||
--- a/include/linux/mmzone.h
|
||||
+++ b/include/linux/mmzone.h
|
||||
@@ -294,6 +294,72 @@ enum lruvec_flags {
|
||||
*/
|
||||
};
|
||||
|
||||
+struct lruvec;
|
||||
+
|
||||
+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
|
||||
+#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
|
||||
+
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+
|
||||
+/*
|
||||
+ * For each lruvec, evictable pages are divided into multiple generations. The
|
||||
+ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are
|
||||
+ * monotonically increasing. The sliding window technique is used to track at
|
||||
+ * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the
|
||||
+ * window, AKA gen, indexes an array of per-type and per-zone lists for the
|
||||
+ * corresponding generation. The counter in page->flags stores gen+1 while a
|
||||
+ * page is on one of the multigenerational lru lists. Otherwise, it stores 0.
|
||||
+ *
|
||||
+ * After a page is faulted in, the aging must check the accessed bit at least
|
||||
+ * twice before the eviction would consider it. The first check clears the
|
||||
+ * accessed bit set during the initial fault. The second check makes sure this
|
||||
+ * page hasn't been used since then.
|
||||
+ */
|
||||
+#define MIN_NR_GENS 2
|
||||
+#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
|
||||
+
|
||||
+struct lrugen {
|
||||
+ /* the aging increments the max generation number */
|
||||
+ unsigned long max_seq;
|
||||
+ /* the eviction increments the min generation numbers */
|
||||
+ unsigned long min_seq[ANON_AND_FILE];
|
||||
+ /* the birth time of each generation in jiffies */
|
||||
+ unsigned long timestamps[MAX_NR_GENS];
|
||||
+ /* the multigenerational lru lists */
|
||||
+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
||||
+ /* the sizes of the multigenerational lru lists in pages */
|
||||
+ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
||||
+ /* whether the multigenerational lru is enabled */
|
||||
+ bool enabled[ANON_AND_FILE];
|
||||
+};
|
||||
+
|
||||
+#define MAX_BATCH_SIZE 8192
|
||||
+
|
||||
+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
|
||||
+void lru_gen_change_state(bool enable, bool main, bool swap);
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+void lru_gen_init_memcg(struct mem_cgroup *memcg);
|
||||
+#endif
|
||||
+
|
||||
+#else /* !CONFIG_LRU_GEN */
|
||||
+
|
||||
+static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+static inline void lru_gen_change_state(bool enable, bool main, bool swap)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+#endif /* CONFIG_LRU_GEN */
|
||||
+
|
||||
struct lruvec {
|
||||
struct list_head lists[NR_LRU_LISTS];
|
||||
/* per lruvec lru_lock for memcg */
|
||||
@@ -311,6 +377,10 @@ struct lruvec {
|
||||
unsigned long refaults[ANON_AND_FILE];
|
||||
/* Various lruvec state flags (enum lruvec_flags) */
|
||||
unsigned long flags;
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+ /* unevictable pages are on LRU_UNEVICTABLE */
|
||||
+ struct lrugen evictable;
|
||||
+#endif
|
||||
#ifdef CONFIG_MEMCG
|
||||
struct pglist_data *pgdat;
|
||||
#endif
|
||||
--- a/include/linux/page-flags-layout.h
|
||||
+++ b/include/linux/page-flags-layout.h
|
||||
@@ -26,6 +26,14 @@
|
||||
|
||||
#define ZONES_WIDTH ZONES_SHIFT
|
||||
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */
|
||||
+#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2)
|
||||
+#else
|
||||
+#define LRU_GEN_WIDTH 0
|
||||
+#define LRU_REFS_WIDTH 0
|
||||
+#endif /* CONFIG_LRU_GEN */
|
||||
+
|
||||
#ifdef CONFIG_SPARSEMEM
|
||||
#include <asm/sparsemem.h>
|
||||
#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
|
||||
@@ -55,7 +63,8 @@
|
||||
#define SECTIONS_WIDTH 0
|
||||
#endif
|
||||
|
||||
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
|
||||
+ <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||
#define NODES_WIDTH NODES_SHIFT
|
||||
#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
|
||||
#error "Vmemmap: No space for nodes field in page flags"
|
||||
@@ -89,8 +98,8 @@
|
||||
#define LAST_CPUPID_SHIFT 0
|
||||
#endif
|
||||
|
||||
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
|
||||
- <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
|
||||
+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
|
||||
#else
|
||||
#define LAST_CPUPID_WIDTH 0
|
||||
@@ -100,8 +109,8 @@
|
||||
#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
|
||||
#endif
|
||||
|
||||
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
|
||||
- > BITS_PER_LONG - NR_PAGEFLAGS
|
||||
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
|
||||
+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
|
||||
#error "Not enough bits in page flags"
|
||||
#endif
|
||||
|
||||
--- a/include/linux/page-flags.h
|
||||
+++ b/include/linux/page-flags.h
|
||||
@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall
|
||||
1UL << PG_private | 1UL << PG_private_2 | \
|
||||
1UL << PG_writeback | 1UL << PG_reserved | \
|
||||
1UL << PG_slab | 1UL << PG_active | \
|
||||
- 1UL << PG_unevictable | __PG_MLOCKED)
|
||||
+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
|
||||
|
||||
/*
|
||||
* Flags checked when a page is prepped for return by the page allocator.
|
||||
@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall
|
||||
* alloc-free cycle to prevent from reusing the page.
|
||||
*/
|
||||
#define PAGE_FLAGS_CHECK_AT_PREP \
|
||||
- (PAGEFLAGS_MASK & ~__PG_HWPOISON)
|
||||
+ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
|
||||
|
||||
#define PAGE_FLAGS_PRIVATE \
|
||||
(1UL << PG_private | 1UL << PG_private_2)
|
||||
--- a/include/linux/sched.h
|
||||
+++ b/include/linux/sched.h
|
||||
@@ -911,6 +911,9 @@ struct task_struct {
|
||||
#ifdef CONFIG_MEMCG
|
||||
unsigned in_user_fault:1;
|
||||
#endif
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+ unsigned in_nonseq_fault:1;
|
||||
+#endif
|
||||
#ifdef CONFIG_COMPAT_BRK
|
||||
unsigned brk_randomized:1;
|
||||
#endif
|
||||
--- a/kernel/bounds.c
|
||||
+++ b/kernel/bounds.c
|
||||
@@ -22,6 +22,9 @@ int main(void)
|
||||
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
|
||||
#endif
|
||||
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+ DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1));
|
||||
+#endif
|
||||
/* End of constants */
|
||||
|
||||
return 0;
|
||||
--- a/kernel/cgroup/cgroup-internal.h
|
||||
+++ b/kernel/cgroup/cgroup-internal.h
|
||||
@@ -165,7 +165,6 @@ struct cgroup_mgctx {
|
||||
#define DEFINE_CGROUP_MGCTX(name) \
|
||||
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
|
||||
|
||||
-extern struct mutex cgroup_mutex;
|
||||
extern spinlock_t css_set_lock;
|
||||
extern struct cgroup_subsys *cgroup_subsys[];
|
||||
extern struct list_head cgroup_roots;
|
||||
--- a/mm/huge_memory.c
|
||||
+++ b/mm/huge_memory.c
|
||||
@@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struc
|
||||
#ifdef CONFIG_64BIT
|
||||
(1L << PG_arch_2) |
|
||||
#endif
|
||||
- (1L << PG_dirty)));
|
||||
+ (1L << PG_dirty) |
|
||||
+ LRU_GEN_MASK | LRU_REFS_MASK));
|
||||
|
||||
/* ->mapping in first tail page is compound_mapcount */
|
||||
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
|
||||
--- a/mm/memcontrol.c
|
||||
+++ b/mm/memcontrol.c
|
||||
@@ -5241,6 +5241,7 @@ static struct mem_cgroup *mem_cgroup_all
|
||||
memcg->deferred_split_queue.split_queue_len = 0;
|
||||
#endif
|
||||
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
|
||||
+ lru_gen_init_memcg(memcg);
|
||||
return memcg;
|
||||
fail:
|
||||
mem_cgroup_id_remove(memcg);
|
||||
--- a/mm/memory.c
|
||||
+++ b/mm/memory.c
|
||||
@@ -4788,6 +4788,7 @@ vm_fault_t handle_mm_fault(struct vm_are
|
||||
unsigned int flags, struct pt_regs *regs)
|
||||
{
|
||||
vm_fault_t ret;
|
||||
+ bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ);
|
||||
|
||||
__set_current_state(TASK_RUNNING);
|
||||
|
||||
@@ -4809,11 +4810,17 @@ vm_fault_t handle_mm_fault(struct vm_are
|
||||
if (flags & FAULT_FLAG_USER)
|
||||
mem_cgroup_enter_user_fault();
|
||||
|
||||
+ if (nonseq_fault)
|
||||
+ task_enter_nonseq_fault();
|
||||
+
|
||||
if (unlikely(is_vm_hugetlb_page(vma)))
|
||||
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
|
||||
else
|
||||
ret = __handle_mm_fault(vma, address, flags);
|
||||
|
||||
+ if (nonseq_fault)
|
||||
+ task_exit_nonseq_fault();
|
||||
+
|
||||
if (flags & FAULT_FLAG_USER) {
|
||||
mem_cgroup_exit_user_fault();
|
||||
/*
|
||||
--- a/mm/mm_init.c
|
||||
+++ b/mm/mm_init.c
|
||||
@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
|
||||
|
||||
shift = 8 * sizeof(unsigned long);
|
||||
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
|
||||
- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
|
||||
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
|
||||
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
|
||||
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
|
||||
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
|
||||
SECTIONS_WIDTH,
|
||||
NODES_WIDTH,
|
||||
ZONES_WIDTH,
|
||||
LAST_CPUPID_WIDTH,
|
||||
KASAN_TAG_WIDTH,
|
||||
+ LRU_GEN_WIDTH,
|
||||
+ LRU_REFS_WIDTH,
|
||||
NR_PAGEFLAGS);
|
||||
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
|
||||
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
|
||||
--- a/mm/page_alloc.c
|
||||
+++ b/mm/page_alloc.c
|
||||
@@ -7459,6 +7459,7 @@ static void __meminit pgdat_init_interna
|
||||
|
||||
pgdat_page_ext_init(pgdat);
|
||||
lruvec_init(&pgdat->__lruvec);
|
||||
+ lru_gen_init_state(NULL, &pgdat->__lruvec);
|
||||
}
|
||||
|
||||
static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
|
||||
--- a/mm/swap.c
|
||||
+++ b/mm/swap.c
|
||||
@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page)
|
||||
VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
|
||||
VM_BUG_ON_PAGE(PageLRU(page), page);
|
||||
|
||||
+ /* see the comment in lru_gen_add_page() */
|
||||
+ if (lru_gen_enabled() && !PageUnevictable(page) &&
|
||||
+ task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC))
|
||||
+ SetPageActive(page);
|
||||
+
|
||||
get_page(page);
|
||||
local_lock(&lru_pvecs.lock);
|
||||
pvec = this_cpu_ptr(&lru_pvecs.lru_add);
|
||||
@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc
|
||||
|
||||
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
|
||||
{
|
||||
- if (PageActive(page) && !PageUnevictable(page)) {
|
||||
+ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
|
||||
int nr_pages = thp_nr_pages(page);
|
||||
|
||||
del_page_from_lru_list(page, lruvec);
|
||||
@@ -661,7 +666,7 @@ void deactivate_file_page(struct page *p
|
||||
*/
|
||||
void deactivate_page(struct page *page)
|
||||
{
|
||||
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
|
||||
+ if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
|
||||
struct pagevec *pvec;
|
||||
|
||||
local_lock(&lru_pvecs.lock);
|
||||
--- a/mm/swapfile.c
|
||||
+++ b/mm/swapfile.c
|
||||
@@ -2689,6 +2689,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
|
||||
err = 0;
|
||||
atomic_inc(&proc_poll_event);
|
||||
wake_up_interruptible(&proc_poll_wait);
|
||||
+ lru_gen_change_state(false, false, true);
|
||||
|
||||
out_dput:
|
||||
filp_close(victim, NULL);
|
||||
@@ -3350,6 +3351,7 @@ SYSCALL_DEFINE2(swapon, const char __use
|
||||
mutex_unlock(&swapon_mutex);
|
||||
atomic_inc(&proc_poll_event);
|
||||
wake_up_interruptible(&proc_poll_wait);
|
||||
+ lru_gen_change_state(true, false, true);
|
||||
|
||||
error = 0;
|
||||
goto out;
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -50,6 +50,7 @@
|
||||
#include <linux/printk.h>
|
||||
#include <linux/dax.h>
|
||||
#include <linux/psi.h>
|
||||
+#include <linux/memory.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/div64.h>
|
||||
@@ -2815,6 +2816,273 @@ static bool can_age_anon_pages(struct pg
|
||||
return can_demote(pgdat->node_id, sc);
|
||||
}
|
||||
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+
|
||||
+/******************************************************************************
|
||||
+ * shorthand helpers
|
||||
+ ******************************************************************************/
|
||||
+
|
||||
+#define for_each_gen_type_zone(gen, type, zone) \
|
||||
+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
|
||||
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
|
||||
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
|
||||
+
|
||||
+static int page_lru_gen(struct page *page)
|
||||
+{
|
||||
+ unsigned long flags = READ_ONCE(page->flags);
|
||||
+
|
||||
+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
||||
+}
|
||||
+
|
||||
+static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg)
|
||||
+{
|
||||
+ struct pglist_data *pgdat = NODE_DATA(nid);
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+ if (memcg) {
|
||||
+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
|
||||
+
|
||||
+ if (lruvec->pgdat != pgdat)
|
||||
+ lruvec->pgdat = pgdat;
|
||||
+
|
||||
+ return lruvec;
|
||||
+ }
|
||||
+#endif
|
||||
+ return pgdat ? &pgdat->__lruvec : NULL;
|
||||
+}
|
||||
+
|
||||
+static int get_nr_gens(struct lruvec *lruvec, int type)
|
||||
+{
|
||||
+ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1;
|
||||
+}
|
||||
+
|
||||
+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
|
||||
+{
|
||||
+ return get_nr_gens(lruvec, 1) >= MIN_NR_GENS &&
|
||||
+ get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) &&
|
||||
+ get_nr_gens(lruvec, 0) <= MAX_NR_GENS;
|
||||
+}
|
||||
+
|
||||
+/******************************************************************************
|
||||
+ * state change
|
||||
+ ******************************************************************************/
|
||||
+
|
||||
+#ifdef CONFIG_LRU_GEN_ENABLED
|
||||
+DEFINE_STATIC_KEY_TRUE(lru_gen_static_key);
|
||||
+#else
|
||||
+DEFINE_STATIC_KEY_FALSE(lru_gen_static_key);
|
||||
+#endif
|
||||
+
|
||||
+static int lru_gen_nr_swapfiles;
|
||||
+
|
||||
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
|
||||
+{
|
||||
+ int gen, type, zone;
|
||||
+ enum lru_list lru;
|
||||
+ struct lrugen *lrugen = &lruvec->evictable;
|
||||
+
|
||||
+ for_each_evictable_lru(lru) {
|
||||
+ type = is_file_lru(lru);
|
||||
+
|
||||
+ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru]))
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ for_each_gen_type_zone(gen, type, zone) {
|
||||
+ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone]))
|
||||
+ return false;
|
||||
+
|
||||
+ /* unlikely but not a bug when reset_batch_size() is pending */
|
||||
+ VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]);
|
||||
+ }
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static bool fill_lists(struct lruvec *lruvec)
|
||||
+{
|
||||
+ enum lru_list lru;
|
||||
+ int remaining = MAX_BATCH_SIZE;
|
||||
+
|
||||
+ for_each_evictable_lru(lru) {
|
||||
+ int type = is_file_lru(lru);
|
||||
+ bool active = is_active_lru(lru);
|
||||
+ struct list_head *head = &lruvec->lists[lru];
|
||||
+
|
||||
+ if (!lruvec->evictable.enabled[type])
|
||||
+ continue;
|
||||
+
|
||||
+ while (!list_empty(head)) {
|
||||
+ bool success;
|
||||
+ struct page *page = lru_to_page(head);
|
||||
+
|
||||
+ VM_BUG_ON_PAGE(PageTail(page), page);
|
||||
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
|
||||
+ VM_BUG_ON_PAGE(PageActive(page) != active, page);
|
||||
+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
|
||||
+ VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page);
|
||||
+
|
||||
+ prefetchw_prev_lru_page(page, head, flags);
|
||||
+
|
||||
+ del_page_from_lru_list(page, lruvec);
|
||||
+ success = lru_gen_add_page(page, lruvec, false);
|
||||
+ VM_BUG_ON(!success);
|
||||
+
|
||||
+ if (!--remaining)
|
||||
+ return false;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static bool drain_lists(struct lruvec *lruvec)
|
||||
+{
|
||||
+ int gen, type, zone;
|
||||
+ int remaining = MAX_BATCH_SIZE;
|
||||
+
|
||||
+ for_each_gen_type_zone(gen, type, zone) {
|
||||
+ struct list_head *head = &lruvec->evictable.lists[gen][type][zone];
|
||||
+
|
||||
+ if (lruvec->evictable.enabled[type])
|
||||
+ continue;
|
||||
+
|
||||
+ while (!list_empty(head)) {
|
||||
+ bool success;
|
||||
+ struct page *page = lru_to_page(head);
|
||||
+
|
||||
+ VM_BUG_ON_PAGE(PageTail(page), page);
|
||||
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
|
||||
+ VM_BUG_ON_PAGE(PageActive(page), page);
|
||||
+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
|
||||
+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
|
||||
+
|
||||
+ prefetchw_prev_lru_page(page, head, flags);
|
||||
+
|
||||
+ success = lru_gen_del_page(page, lruvec, false);
|
||||
+ VM_BUG_ON(!success);
|
||||
+ add_page_to_lru_list(page, lruvec);
|
||||
+
|
||||
+ if (!--remaining)
|
||||
+ return false;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * For file page tracking, we enable/disable it according to the main switch.
|
||||
+ * For anon page tracking, we only enabled it when the main switch is on and
|
||||
+ * there is at least one swapfile; we disable it when there are no swapfiles
|
||||
+ * regardless of the value of the main switch. Otherwise, we will eventually
|
||||
+ * reach the max size of the sliding window and have to call inc_min_seq().
|
||||
+ */
|
||||
+void lru_gen_change_state(bool enable, bool main, bool swap)
|
||||
+{
|
||||
+ static DEFINE_MUTEX(state_mutex);
|
||||
+
|
||||
+ struct mem_cgroup *memcg;
|
||||
+
|
||||
+ mem_hotplug_begin();
|
||||
+ cgroup_lock();
|
||||
+ mutex_lock(&state_mutex);
|
||||
+
|
||||
+ if (swap) {
|
||||
+ if (enable)
|
||||
+ swap = !lru_gen_nr_swapfiles++;
|
||||
+ else
|
||||
+ swap = !--lru_gen_nr_swapfiles;
|
||||
+ }
|
||||
+
|
||||
+ if (main && enable != lru_gen_enabled()) {
|
||||
+ if (enable)
|
||||
+ static_branch_enable(&lru_gen_static_key);
|
||||
+ else
|
||||
+ static_branch_disable(&lru_gen_static_key);
|
||||
+ } else if (!swap || !lru_gen_enabled())
|
||||
+ goto unlock;
|
||||
+
|
||||
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
|
||||
+ do {
|
||||
+ int nid;
|
||||
+
|
||||
+ for_each_node(nid) {
|
||||
+ struct lruvec *lruvec = get_lruvec(nid, memcg);
|
||||
+
|
||||
+ if (!lruvec)
|
||||
+ continue;
|
||||
+
|
||||
+ spin_lock_irq(&lruvec->lru_lock);
|
||||
+
|
||||
+ VM_BUG_ON(!seq_is_valid(lruvec));
|
||||
+ VM_BUG_ON(!state_is_valid(lruvec));
|
||||
+
|
||||
+ lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
|
||||
+ lruvec->evictable.enabled[1] = lru_gen_enabled();
|
||||
+
|
||||
+ while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) {
|
||||
+ spin_unlock_irq(&lruvec->lru_lock);
|
||||
+ cond_resched();
|
||||
+ spin_lock_irq(&lruvec->lru_lock);
|
||||
+ }
|
||||
+
|
||||
+ spin_unlock_irq(&lruvec->lru_lock);
|
||||
+ }
|
||||
+
|
||||
+ cond_resched();
|
||||
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
|
||||
+unlock:
|
||||
+ mutex_unlock(&state_mutex);
|
||||
+ cgroup_unlock();
|
||||
+ mem_hotplug_done();
|
||||
+}
|
||||
+
|
||||
+/******************************************************************************
|
||||
+ * initialization
|
||||
+ ******************************************************************************/
|
||||
+
|
||||
+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
|
||||
+{
|
||||
+ int i;
|
||||
+ int gen, type, zone;
|
||||
+ struct lrugen *lrugen = &lruvec->evictable;
|
||||
+
|
||||
+ lrugen->max_seq = MIN_NR_GENS + 1;
|
||||
+ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
|
||||
+ lrugen->enabled[1] = lru_gen_enabled();
|
||||
+
|
||||
+ for (i = 0; i <= MIN_NR_GENS + 1; i++)
|
||||
+ lrugen->timestamps[i] = jiffies;
|
||||
+
|
||||
+ for_each_gen_type_zone(gen, type, zone)
|
||||
+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+void lru_gen_init_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+ int nid;
|
||||
+
|
||||
+ for_each_node(nid) {
|
||||
+ struct lruvec *lruvec = get_lruvec(nid, memcg);
|
||||
+
|
||||
+ lru_gen_init_state(memcg, lruvec);
|
||||
+ }
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+static int __init init_lru_gen(void)
|
||||
+{
|
||||
+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
|
||||
+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
|
||||
+
|
||||
+ return 0;
|
||||
+};
|
||||
+late_initcall(init_lru_gen);
|
||||
+
|
||||
+#endif /* CONFIG_LRU_GEN */
|
||||
+
|
||||
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
||||
{
|
||||
unsigned long nr[NR_LRU_LISTS];
|
|
@@ -0,0 +1,807 @@
|
|||
From a9b328add8422921a0dbbef162730800e16e8cfd Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Sun, 18 Sep 2022 02:00:02 -0600
|
||||
Subject: [PATCH 05/29] mm: multi-gen LRU: groundwork
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Evictable pages are divided into multiple generations for each lruvec.
|
||||
The youngest generation number is stored in lrugen->max_seq for both
|
||||
anon and file types as they are aged on an equal footing. The oldest
|
||||
generation numbers are stored in lrugen->min_seq[] separately for anon
|
||||
and file types as clean file pages can be evicted regardless of swap
|
||||
constraints. These three variables are monotonically increasing.
|
||||
|
||||
Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits
|
||||
in order to fit into the gen counter in page->flags. Each truncated
|
||||
generation number is an index to lrugen->lists[]. The sliding window
|
||||
technique is used to track at least MIN_NR_GENS and at most
|
||||
MAX_NR_GENS generations. The gen counter stores a value within [1,
|
||||
MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it
|
||||
stores 0.
|
||||
|
||||
There are two conceptually independent procedures: "the aging", which
|
||||
produces young generations, and "the eviction", which consumes old
|
||||
generations. They form a closed-loop system, i.e., "the page reclaim".
|
||||
Both procedures can be invoked from userspace for the purposes of working
|
||||
set estimation and proactive reclaim. These techniques are commonly used
|
||||
to optimize job scheduling (bin packing) in data centers [1][2].
|
||||
|
||||
To avoid confusion, the terms "hot" and "cold" will be applied to the
|
||||
multi-gen LRU, as a new convention; the terms "active" and "inactive" will
|
||||
be applied to the active/inactive LRU, as usual.
|
||||
|
||||
The protection of hot pages and the selection of cold pages are based
|
||||
on page access channels and patterns. There are two access channels:
|
||||
one through page tables and the other through file descriptors. The
|
||||
protection of the former channel is by design stronger because:
|
||||
1. The uncertainty in determining the access patterns of the former
|
||||
channel is higher due to the approximation of the accessed bit.
|
||||
2. The cost of evicting the former channel is higher due to the TLB
|
||||
flushes required and the likelihood of encountering the dirty bit.
|
||||
3. The penalty of underprotecting the former channel is higher because
|
||||
applications usually do not prepare themselves for major page
|
||||
faults like they do for blocked I/O. E.g., GUI applications
|
||||
commonly use dedicated I/O threads to avoid blocking rendering
|
||||
threads.
|
||||
|
||||
There are also two access patterns: one with temporal locality and the
|
||||
other without. For the reasons listed above, the former channel is
|
||||
assumed to follow the former pattern unless VM_SEQ_READ or VM_RAND_READ is
|
||||
present; the latter channel is assumed to follow the latter pattern unless
|
||||
outlying refaults have been observed [3][4].
|
||||
|
||||
The next patch will address the "outlying refaults". Three macros, i.e.,
|
||||
LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are added in
|
||||
this patch to make the entire patchset less diffy.
|
||||
|
||||
A page is added to the youngest generation on faulting. The aging needs
|
||||
to check the accessed bit at least twice before handing this page over to
|
||||
the eviction. The first check takes care of the accessed bit set on the
|
||||
initial fault; the second check makes sure this page has not been used
|
||||
since then. This protocol, AKA second chance, requires a minimum of two
|
||||
generations, hence MIN_NR_GENS.
|
||||
|
||||
[1] https://dl.acm.org/doi/10.1145/3297858.3304053
|
||||
[2] https://dl.acm.org/doi/10.1145/3503222.3507731
|
||||
[3] https://lwn.net/Articles/495543/
|
||||
[4] https://lwn.net/Articles/815342/
|
||||
|
||||
Link: https://lkml.kernel.org/r/20220918080010.2920238-6-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Acked-by: Brian Geffon <bgeffon@google.com>
|
||||
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
||||
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
Acked-by: Steven Barrett <steven@liquorix.net>
|
||||
Acked-by: Suleiman Souhlal <suleiman@google.com>
|
||||
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
|
||||
Tested-by: Donald Carr <d@chaos-reins.com>
|
||||
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
|
||||
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
||||
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
|
||||
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
|
||||
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
|
||||
Cc: Andi Kleen <ak@linux.intel.com>
|
||||
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
|
||||
Cc: Barry Song <baohua@kernel.org>
|
||||
Cc: Catalin Marinas <catalin.marinas@arm.com>
|
||||
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Cc: Hillf Danton <hdanton@sina.com>
|
||||
Cc: Jens Axboe <axboe@kernel.dk>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
Cc: Matthew Wilcox <willy@infradead.org>
|
||||
Cc: Mel Gorman <mgorman@suse.de>
|
||||
Cc: Miaohe Lin <linmiaohe@huawei.com>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@linux.ibm.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
|
||||
Cc: Tejun Heo <tj@kernel.org>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: Will Deacon <will@kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
fs/fuse/dev.c | 3 +-
|
||||
include/linux/mm.h | 2 +
|
||||
include/linux/mm_inline.h | 177 +++++++++++++++++++++++++++++-
|
||||
include/linux/mmzone.h | 100 +++++++++++++++++
|
||||
include/linux/page-flags-layout.h | 13 ++-
|
||||
include/linux/page-flags.h | 4 +-
|
||||
include/linux/sched.h | 4 +
|
||||
kernel/bounds.c | 5 +
|
||||
mm/Kconfig | 8 ++
|
||||
mm/huge_memory.c | 3 +-
|
||||
mm/memcontrol.c | 2 +
|
||||
mm/memory.c | 25 +++++
|
||||
mm/mm_init.c | 6 +-
|
||||
mm/mmzone.c | 2 +
|
||||
mm/swap.c | 10 +-
|
||||
mm/vmscan.c | 75 +++++++++++++
|
||||
16 files changed, 425 insertions(+), 14 deletions(-)
|
||||
|
||||
--- a/fs/fuse/dev.c
|
||||
+++ b/fs/fuse/dev.c
|
||||
@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *
|
||||
1 << PG_active |
|
||||
1 << PG_workingset |
|
||||
1 << PG_reclaim |
|
||||
- 1 << PG_waiters))) {
|
||||
+ 1 << PG_waiters |
|
||||
+ LRU_GEN_MASK | LRU_REFS_MASK))) {
|
||||
dump_page(page, "fuse: trying to steal weird page");
|
||||
return 1;
|
||||
}
|
||||
--- a/include/linux/mm.h
|
||||
+++ b/include/linux/mm.h
|
||||
@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v
|
||||
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
|
||||
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
|
||||
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
|
||||
+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
|
||||
+#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
|
||||
|
||||
/*
|
||||
* Define the bit shifts to access each section. For non-existent
|
||||
--- a/include/linux/mm_inline.h
|
||||
+++ b/include/linux/mm_inline.h
|
||||
@@ -26,10 +26,13 @@ static inline int page_is_file_lru(struc
|
||||
|
||||
static __always_inline void __update_lru_size(struct lruvec *lruvec,
|
||||
enum lru_list lru, enum zone_type zid,
|
||||
- int nr_pages)
|
||||
+ long nr_pages)
|
||||
{
|
||||
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
|
||||
|
||||
+ lockdep_assert_held(&lruvec->lru_lock);
|
||||
+ WARN_ON_ONCE(nr_pages != (int)nr_pages);
|
||||
+
|
||||
__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
|
||||
__mod_zone_page_state(&pgdat->node_zones[zid],
|
||||
NR_ZONE_LRU_BASE + lru, nr_pages);
|
||||
@@ -86,11 +89,177 @@ static __always_inline enum lru_list pag
|
||||
return lru;
|
||||
}
|
||||
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+
|
||||
+static inline bool lru_gen_enabled(void)
|
||||
+{
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static inline bool lru_gen_in_fault(void)
|
||||
+{
|
||||
+ return current->in_lru_fault;
|
||||
+}
|
||||
+
|
||||
+static inline int lru_gen_from_seq(unsigned long seq)
|
||||
+{
|
||||
+ return seq % MAX_NR_GENS;
|
||||
+}
|
||||
+
|
||||
+static inline int page_lru_gen(struct page *page)
|
||||
+{
|
||||
+ unsigned long flags = READ_ONCE(page->flags);
|
||||
+
|
||||
+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
||||
+}
|
||||
+
|
||||
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
|
||||
+{
|
||||
+ unsigned long max_seq = lruvec->lrugen.max_seq;
|
||||
+
|
||||
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
|
||||
+
|
||||
+ /* see the comment on MIN_NR_GENS */
|
||||
+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
|
||||
+}
|
||||
+
|
||||
+static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page,
|
||||
+ int old_gen, int new_gen)
|
||||
+{
|
||||
+ int type = page_is_file_lru(page);
|
||||
+ int zone = page_zonenum(page);
|
||||
+ int delta = thp_nr_pages(page);
|
||||
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
|
||||
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+
|
||||
+ VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
|
||||
+ VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
|
||||
+ VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);
|
||||
+
|
||||
+ if (old_gen >= 0)
|
||||
+ WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
|
||||
+ lrugen->nr_pages[old_gen][type][zone] - delta);
|
||||
+ if (new_gen >= 0)
|
||||
+ WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
|
||||
+ lrugen->nr_pages[new_gen][type][zone] + delta);
|
||||
+
|
||||
+ /* addition */
|
||||
+ if (old_gen < 0) {
|
||||
+ if (lru_gen_is_active(lruvec, new_gen))
|
||||
+ lru += LRU_ACTIVE;
|
||||
+ __update_lru_size(lruvec, lru, zone, delta);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ /* deletion */
|
||||
+ if (new_gen < 0) {
|
||||
+ if (lru_gen_is_active(lruvec, old_gen))
|
||||
+ lru += LRU_ACTIVE;
|
||||
+ __update_lru_size(lruvec, lru, zone, -delta);
|
||||
+ return;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
||||
+{
|
||||
+ unsigned long seq;
|
||||
+ unsigned long flags;
|
||||
+ int gen = page_lru_gen(page);
|
||||
+ int type = page_is_file_lru(page);
|
||||
+ int zone = page_zonenum(page);
|
||||
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+
|
||||
+ VM_WARN_ON_ONCE_PAGE(gen != -1, page);
|
||||
+
|
||||
+ if (PageUnevictable(page))
|
||||
+ return false;
|
||||
+ /*
|
||||
+ * There are three common cases for this page:
|
||||
+ * 1. If it's hot, e.g., freshly faulted in or previously hot and
|
||||
+ * migrated, add it to the youngest generation.
|
||||
+ * 2. If it's cold but can't be evicted immediately, i.e., an anon page
|
||||
+ * not in swapcache or a dirty page pending writeback, add it to the
|
||||
+ * second oldest generation.
|
||||
+ * 3. Everything else (clean, cold) is added to the oldest generation.
|
||||
+ */
|
||||
+ if (PageActive(page))
|
||||
+ seq = lrugen->max_seq;
|
||||
+ else if ((type == LRU_GEN_ANON && !PageSwapCache(page)) ||
|
||||
+ (PageReclaim(page) &&
|
||||
+ (PageDirty(page) || PageWriteback(page))))
|
||||
+ seq = lrugen->min_seq[type] + 1;
|
||||
+ else
|
||||
+ seq = lrugen->min_seq[type];
|
||||
+
|
||||
+ gen = lru_gen_from_seq(seq);
|
||||
+ flags = (gen + 1UL) << LRU_GEN_PGOFF;
|
||||
+ /* see the comment on MIN_NR_GENS about PG_active */
|
||||
+ set_mask_bits(&page->flags, LRU_GEN_MASK | BIT(PG_active), flags);
|
||||
+
|
||||
+ lru_gen_update_size(lruvec, page, -1, gen);
|
||||
+ /* for rotate_reclaimable_page() */
|
||||
+ if (reclaiming)
|
||||
+ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
|
||||
+ else
|
||||
+ list_add(&page->lru, &lrugen->lists[gen][type][zone]);
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
||||
+{
|
||||
+ unsigned long flags;
|
||||
+ int gen = page_lru_gen(page);
|
||||
+
|
||||
+ if (gen < 0)
|
||||
+ return false;
|
||||
+
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
|
||||
+
|
||||
+ /* for migrate_page_states() */
|
||||
+ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
|
||||
+ flags = set_mask_bits(&page->flags, LRU_GEN_MASK, flags);
|
||||
+ gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
||||
+
|
||||
+ lru_gen_update_size(lruvec, page, gen, -1);
|
||||
+ list_del(&page->lru);
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+#else /* !CONFIG_LRU_GEN */
|
||||
+
|
||||
+static inline bool lru_gen_enabled(void)
|
||||
+{
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+static inline bool lru_gen_in_fault(void)
|
||||
+{
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
||||
+{
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
||||
+{
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+#endif /* CONFIG_LRU_GEN */
|
||||
+
|
||||
static __always_inline void add_page_to_lru_list(struct page *page,
|
||||
struct lruvec *lruvec)
|
||||
{
|
||||
enum lru_list lru = page_lru(page);
|
||||
|
||||
+ if (lru_gen_add_page(lruvec, page, false))
|
||||
+ return;
|
||||
+
|
||||
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
|
||||
list_add(&page->lru, &lruvec->lists[lru]);
|
||||
}
|
||||
@@ -100,6 +269,9 @@ static __always_inline void add_page_to_
|
||||
{
|
||||
enum lru_list lru = page_lru(page);
|
||||
|
||||
+ if (lru_gen_add_page(lruvec, page, true))
|
||||
+ return;
|
||||
+
|
||||
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
|
||||
list_add_tail(&page->lru, &lruvec->lists[lru]);
|
||||
}
|
||||
@@ -107,6 +279,9 @@ static __always_inline void add_page_to_
|
||||
static __always_inline void del_page_from_lru_list(struct page *page,
|
||||
struct lruvec *lruvec)
|
||||
{
|
||||
+ if (lru_gen_del_page(lruvec, page, false))
|
||||
+ return;
|
||||
+
|
||||
list_del(&page->lru);
|
||||
update_lru_size(lruvec, page_lru(page), page_zonenum(page),
|
||||
-thp_nr_pages(page));
|
||||
--- a/include/linux/mmzone.h
|
||||
+++ b/include/linux/mmzone.h
|
||||
@@ -294,6 +294,102 @@ enum lruvec_flags {
|
||||
*/
|
||||
};
|
||||
|
||||
+#endif /* !__GENERATING_BOUNDS_H */
|
||||
+
|
||||
+/*
|
||||
+ * Evictable pages are divided into multiple generations. The youngest and the
|
||||
+ * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
|
||||
+ * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
|
||||
+ * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
|
||||
+ * corresponding generation. The gen counter in page->flags stores gen+1 while
|
||||
+ * a page is on one of lrugen->lists[]. Otherwise it stores 0.
|
||||
+ *
|
||||
+ * A page is added to the youngest generation on faulting. The aging needs to
|
||||
+ * check the accessed bit at least twice before handing this page over to the
|
||||
+ * eviction. The first check takes care of the accessed bit set on the initial
|
||||
+ * fault; the second check makes sure this page hasn't been used since then.
|
||||
+ * This process, AKA second chance, requires a minimum of two generations,
|
||||
+ * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
|
||||
+ * LRU, e.g., /proc/vmstat, these two generations are considered active; the
|
||||
+ * rest of generations, if they exist, are considered inactive. See
|
||||
+ * lru_gen_is_active().
|
||||
+ *
|
||||
+ * PG_active is always cleared while a page is on one of lrugen->lists[] so that
|
||||
+ * the aging needs not to worry about it. And it's set again when a page
|
||||
+ * considered active is isolated for non-reclaiming purposes, e.g., migration.
|
||||
+ * See lru_gen_add_page() and lru_gen_del_page().
|
||||
+ *
|
||||
+ * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
|
||||
+ * number of categories of the active/inactive LRU when keeping track of
|
||||
+ * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
|
||||
+ * in page->flags.
|
||||
+ */
|
||||
+#define MIN_NR_GENS 2U
|
||||
+#define MAX_NR_GENS 4U
|
||||
+
|
||||
+#ifndef __GENERATING_BOUNDS_H
|
||||
+
|
||||
+struct lruvec;
|
||||
+
|
||||
+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
|
||||
+#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
|
||||
+
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+
|
||||
+enum {
|
||||
+ LRU_GEN_ANON,
|
||||
+ LRU_GEN_FILE,
|
||||
+};
|
||||
+
|
||||
+/*
|
||||
+ * The youngest generation number is stored in max_seq for both anon and file
|
||||
+ * types as they are aged on an equal footing. The oldest generation numbers are
|
||||
+ * stored in min_seq[] separately for anon and file types as clean file pages
|
||||
+ * can be evicted regardless of swap constraints.
|
||||
+ *
|
||||
+ * Normally anon and file min_seq are in sync. But if swapping is constrained,
|
||||
+ * e.g., out of swap space, file min_seq is allowed to advance and leave anon
|
||||
+ * min_seq behind.
|
||||
+ *
|
||||
+ * The number of pages in each generation is eventually consistent and therefore
|
||||
+ * can be transiently negative.
|
||||
+ */
|
||||
+struct lru_gen_struct {
|
||||
+ /* the aging increments the youngest generation number */
|
||||
+ unsigned long max_seq;
|
||||
+ /* the eviction increments the oldest generation numbers */
|
||||
+ unsigned long min_seq[ANON_AND_FILE];
|
||||
+ /* the multi-gen LRU lists, lazily sorted on eviction */
|
||||
+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
||||
+ /* the multi-gen LRU sizes, eventually consistent */
|
||||
+ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
||||
+};
|
||||
+
|
||||
+void lru_gen_init_lruvec(struct lruvec *lruvec);
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+void lru_gen_init_memcg(struct mem_cgroup *memcg);
|
||||
+void lru_gen_exit_memcg(struct mem_cgroup *memcg);
|
||||
+#endif
|
||||
+
|
||||
+#else /* !CONFIG_LRU_GEN */
|
||||
+
|
||||
+static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+#endif /* CONFIG_LRU_GEN */
|
||||
+
|
||||
struct lruvec {
|
||||
struct list_head lists[NR_LRU_LISTS];
|
||||
/* per lruvec lru_lock for memcg */
|
||||
@@ -311,6 +407,10 @@ struct lruvec {
|
||||
unsigned long refaults[ANON_AND_FILE];
|
||||
/* Various lruvec state flags (enum lruvec_flags) */
|
||||
unsigned long flags;
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+ /* evictable pages divided into generations */
|
||||
+ struct lru_gen_struct lrugen;
|
||||
+#endif
|
||||
#ifdef CONFIG_MEMCG
|
||||
struct pglist_data *pgdat;
|
||||
#endif
|
||||
--- a/include/linux/page-flags-layout.h
|
||||
+++ b/include/linux/page-flags-layout.h
|
||||
@@ -55,7 +55,8 @@
|
||||
#define SECTIONS_WIDTH 0
|
||||
#endif
|
||||
|
||||
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
|
||||
+ <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||
#define NODES_WIDTH NODES_SHIFT
|
||||
#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
|
||||
#error "Vmemmap: No space for nodes field in page flags"
|
||||
@@ -89,8 +90,8 @@
|
||||
#define LAST_CPUPID_SHIFT 0
|
||||
#endif
|
||||
|
||||
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
|
||||
- <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
|
||||
+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
|
||||
#else
|
||||
#define LAST_CPUPID_WIDTH 0
|
||||
@@ -100,10 +101,12 @@
|
||||
#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
|
||||
#endif
|
||||
|
||||
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
|
||||
- > BITS_PER_LONG - NR_PAGEFLAGS
|
||||
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
|
||||
+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
|
||||
#error "Not enough bits in page flags"
|
||||
#endif
|
||||
|
||||
+#define LRU_REFS_WIDTH 0
|
||||
+
|
||||
#endif
|
||||
#endif /* _LINUX_PAGE_FLAGS_LAYOUT */
|
||||
--- a/include/linux/page-flags.h
|
||||
+++ b/include/linux/page-flags.h
|
||||
@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall
|
||||
1UL << PG_private | 1UL << PG_private_2 | \
|
||||
1UL << PG_writeback | 1UL << PG_reserved | \
|
||||
1UL << PG_slab | 1UL << PG_active | \
|
||||
- 1UL << PG_unevictable | __PG_MLOCKED)
|
||||
+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
|
||||
|
||||
/*
|
||||
* Flags checked when a page is prepped for return by the page allocator.
|
||||
@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall
|
||||
* alloc-free cycle to prevent from reusing the page.
|
||||
*/
|
||||
#define PAGE_FLAGS_CHECK_AT_PREP \
|
||||
- (PAGEFLAGS_MASK & ~__PG_HWPOISON)
|
||||
+ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
|
||||
|
||||
#define PAGE_FLAGS_PRIVATE \
|
||||
(1UL << PG_private | 1UL << PG_private_2)
|
||||
--- a/include/linux/sched.h
|
||||
+++ b/include/linux/sched.h
|
||||
@@ -911,6 +911,10 @@ struct task_struct {
|
||||
#ifdef CONFIG_MEMCG
|
||||
unsigned in_user_fault:1;
|
||||
#endif
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+ /* whether the LRU algorithm may apply to this access */
|
||||
+ unsigned in_lru_fault:1;
|
||||
+#endif
|
||||
#ifdef CONFIG_COMPAT_BRK
|
||||
unsigned brk_randomized:1;
|
||||
#endif
|
||||
--- a/kernel/bounds.c
|
||||
+++ b/kernel/bounds.c
|
||||
@@ -22,6 +22,11 @@ int main(void)
|
||||
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
|
||||
#endif
|
||||
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+ DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
|
||||
+#else
|
||||
+ DEFINE(LRU_GEN_WIDTH, 0);
|
||||
+#endif
|
||||
/* End of constants */
|
||||
|
||||
return 0;
|
||||
--- a/mm/Kconfig
|
||||
+++ b/mm/Kconfig
|
||||
@@ -897,6 +897,14 @@ config IO_MAPPING
|
||||
config SECRETMEM
|
||||
def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
|
||||
|
||||
+config LRU_GEN
|
||||
+ bool "Multi-Gen LRU"
|
||||
+ depends on MMU
|
||||
+ # make sure page->flags has enough spare bits
|
||||
+ depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
|
||||
+ help
|
||||
+ A high performance LRU implementation to overcommit memory.
|
||||
+
|
||||
source "mm/damon/Kconfig"
|
||||
|
||||
endmenu
|
||||
--- a/mm/huge_memory.c
|
||||
+++ b/mm/huge_memory.c
|
||||
@@ -2366,7 +2366,8 @@ static void __split_huge_page_tail(struc
|
||||
#ifdef CONFIG_64BIT
|
||||
(1L << PG_arch_2) |
|
||||
#endif
|
||||
- (1L << PG_dirty)));
|
||||
+ (1L << PG_dirty) |
|
||||
+ LRU_GEN_MASK | LRU_REFS_MASK));
|
||||
|
||||
/* ->mapping in first tail page is compound_mapcount */
|
||||
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
|
||||
--- a/mm/memcontrol.c
|
||||
+++ b/mm/memcontrol.c
|
||||
@@ -5178,6 +5178,7 @@ static void __mem_cgroup_free(struct mem
|
||||
|
||||
static void mem_cgroup_free(struct mem_cgroup *memcg)
|
||||
{
|
||||
+ lru_gen_exit_memcg(memcg);
|
||||
memcg_wb_domain_exit(memcg);
|
||||
__mem_cgroup_free(memcg);
|
||||
}
|
||||
@@ -5241,6 +5242,7 @@ static struct mem_cgroup *mem_cgroup_all
|
||||
memcg->deferred_split_queue.split_queue_len = 0;
|
||||
#endif
|
||||
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
|
||||
+ lru_gen_init_memcg(memcg);
|
||||
return memcg;
|
||||
fail:
|
||||
mem_cgroup_id_remove(memcg);
|
||||
--- a/mm/memory.c
|
||||
+++ b/mm/memory.c
|
||||
@@ -4778,6 +4778,27 @@ static inline void mm_account_fault(stru
|
||||
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
|
||||
}
|
||||
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
|
||||
+{
|
||||
+ /* the LRU algorithm doesn't apply to sequential or random reads */
|
||||
+ current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
|
||||
+}
|
||||
+
|
||||
+static void lru_gen_exit_fault(void)
|
||||
+{
|
||||
+ current->in_lru_fault = false;
|
||||
+}
|
||||
+#else
|
||||
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+static void lru_gen_exit_fault(void)
|
||||
+{
|
||||
+}
|
||||
+#endif /* CONFIG_LRU_GEN */
|
||||
+
|
||||
/*
|
||||
* By the time we get here, we already hold the mm semaphore
|
||||
*
|
||||
@@ -4809,11 +4830,15 @@ vm_fault_t handle_mm_fault(struct vm_are
|
||||
if (flags & FAULT_FLAG_USER)
|
||||
mem_cgroup_enter_user_fault();
|
||||
|
||||
+ lru_gen_enter_fault(vma);
|
||||
+
|
||||
if (unlikely(is_vm_hugetlb_page(vma)))
|
||||
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
|
||||
else
|
||||
ret = __handle_mm_fault(vma, address, flags);
|
||||
|
||||
+ lru_gen_exit_fault();
|
||||
+
|
||||
if (flags & FAULT_FLAG_USER) {
|
||||
mem_cgroup_exit_user_fault();
|
||||
/*
|
||||
--- a/mm/mm_init.c
|
||||
+++ b/mm/mm_init.c
|
||||
@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
|
||||
|
||||
shift = 8 * sizeof(unsigned long);
|
||||
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
|
||||
- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
|
||||
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
|
||||
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
|
||||
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
|
||||
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
|
||||
SECTIONS_WIDTH,
|
||||
NODES_WIDTH,
|
||||
ZONES_WIDTH,
|
||||
LAST_CPUPID_WIDTH,
|
||||
KASAN_TAG_WIDTH,
|
||||
+ LRU_GEN_WIDTH,
|
||||
+ LRU_REFS_WIDTH,
|
||||
NR_PAGEFLAGS);
|
||||
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
|
||||
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
|
||||
--- a/mm/mmzone.c
|
||||
+++ b/mm/mmzone.c
|
||||
@@ -81,6 +81,8 @@ void lruvec_init(struct lruvec *lruvec)
|
||||
|
||||
for_each_lru(lru)
|
||||
INIT_LIST_HEAD(&lruvec->lists[lru]);
|
||||
+
|
||||
+ lru_gen_init_lruvec(lruvec);
|
||||
}
|
||||
|
||||
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
|
||||
--- a/mm/swap.c
|
||||
+++ b/mm/swap.c
|
||||
@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page)
|
||||
VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
|
||||
VM_BUG_ON_PAGE(PageLRU(page), page);
|
||||
|
||||
+ /* see the comment in lru_gen_add_page() */
|
||||
+ if (lru_gen_enabled() && !PageUnevictable(page) &&
|
||||
+ lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
|
||||
+ SetPageActive(page);
|
||||
+
|
||||
get_page(page);
|
||||
local_lock(&lru_pvecs.lock);
|
||||
pvec = this_cpu_ptr(&lru_pvecs.lru_add);
|
||||
@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc
|
||||
|
||||
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
|
||||
{
|
||||
- if (PageActive(page) && !PageUnevictable(page)) {
|
||||
+ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
|
||||
int nr_pages = thp_nr_pages(page);
|
||||
|
||||
del_page_from_lru_list(page, lruvec);
|
||||
@@ -661,7 +666,8 @@ void deactivate_file_page(struct page *p
|
||||
*/
|
||||
void deactivate_page(struct page *page)
|
||||
{
|
||||
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
|
||||
+ if (PageLRU(page) && !PageUnevictable(page) &&
|
||||
+ (PageActive(page) || lru_gen_enabled())) {
|
||||
struct pagevec *pvec;
|
||||
|
||||
local_lock(&lru_pvecs.lock);
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -2821,6 +2821,81 @@ static bool can_age_anon_pages(struct pg
|
||||
return can_demote(pgdat->node_id, sc);
|
||||
}
|
||||
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+
|
||||
+/******************************************************************************
|
||||
+ * shorthand helpers
|
||||
+ ******************************************************************************/
|
||||
+
|
||||
+#define for_each_gen_type_zone(gen, type, zone) \
|
||||
+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
|
||||
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
|
||||
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
|
||||
+
|
||||
+static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
|
||||
+{
|
||||
+ struct pglist_data *pgdat = NODE_DATA(nid);
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+ if (memcg) {
|
||||
+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
|
||||
+
|
||||
+ /* for hotadd_new_pgdat() */
|
||||
+ if (!lruvec->pgdat)
|
||||
+ lruvec->pgdat = pgdat;
|
||||
+
|
||||
+ return lruvec;
|
||||
+ }
|
||||
+#endif
|
||||
+ VM_WARN_ON_ONCE(!mem_cgroup_disabled());
|
||||
+
|
||||
+ return pgdat ? &pgdat->__lruvec : NULL;
|
||||
+}
|
||||
+
|
||||
+/******************************************************************************
|
||||
+ * initialization
|
||||
+ ******************************************************************************/
|
||||
+
|
||||
+void lru_gen_init_lruvec(struct lruvec *lruvec)
|
||||
+{
|
||||
+ int gen, type, zone;
|
||||
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+
|
||||
+ lrugen->max_seq = MIN_NR_GENS + 1;
|
||||
+
|
||||
+ for_each_gen_type_zone(gen, type, zone)
|
||||
+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+void lru_gen_init_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+void lru_gen_exit_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+ int nid;
|
||||
+
|
||||
+ for_each_node(nid) {
|
||||
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
|
||||
+
|
||||
+ VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
|
||||
+ sizeof(lruvec->lrugen.nr_pages)));
|
||||
+ }
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+static int __init init_lru_gen(void)
|
||||
+{
|
||||
+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
|
||||
+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
|
||||
+
|
||||
+ return 0;
|
||||
+};
|
||||
+late_initcall(init_lru_gen);
|
||||
+
|
||||
+#endif /* CONFIG_LRU_GEN */
|
||||
+
|
||||
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
||||
{
|
||||
unsigned long nr[NR_LRU_LISTS];
|
|
@@ -1,760 +0,0 @@
|
|||
From 534bcc4a0bb5b24600891ce793f0295a142e9dae Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Mon, 5 Apr 2021 04:17:41 -0600
|
||||
Subject: [PATCH 05/10] mm: multigenerational lru: mm_struct list
|
||||
|
||||
To scan PTEs for accessed pages, a mm_struct list is maintained for
|
||||
each memcg. When multiple threads traverse the same memcg->mm_list,
|
||||
each of them gets a unique mm_struct and therefore they can run
|
||||
walk_page_range() concurrently to reach page tables of all processes
|
||||
of this memcg.
|
||||
|
||||
This infrastructure also provides the following optimizations:
|
||||
1) it allows walkers to skip processes that have been sleeping since
|
||||
the last walk by tracking the usage of mm_struct between context
|
||||
switches.
|
||||
2) it allows walkers to add interesting items they find during a
|
||||
walk to a Bloom filter so that they can skip uninteresting items
|
||||
during the next walk by testing whether an item is in this Bloom
|
||||
filter.
|
||||
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
||||
Change-Id: I25d9eda8c6bdc7c3653b9f210a159d6c247c81e8
|
||||
---
|
||||
fs/exec.c | 2 +
|
||||
include/linux/memcontrol.h | 4 +
|
||||
include/linux/mm_inline.h | 6 +
|
||||
include/linux/mm_types.h | 75 +++++++++
|
||||
include/linux/mmzone.h | 63 +++++++
|
||||
kernel/exit.c | 1 +
|
||||
kernel/fork.c | 9 +
|
||||
kernel/sched/core.c | 1 +
|
||||
mm/memcontrol.c | 25 +++
|
||||
mm/vmscan.c | 331 +++++++++++++++++++++++++++++++++++++
|
||||
10 files changed, 517 insertions(+)
|
||||
|
||||
--- a/fs/exec.c
|
||||
+++ b/fs/exec.c
|
||||
@@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m
|
||||
active_mm = tsk->active_mm;
|
||||
tsk->active_mm = mm;
|
||||
tsk->mm = mm;
|
||||
+ lru_gen_add_mm(mm);
|
||||
/*
|
||||
* This prevents preemption while active_mm is being loaded and
|
||||
* it and mm are being updated, which could cause problems for
|
||||
@@ -1023,6 +1024,7 @@ static int exec_mmap(struct mm_struct *m
|
||||
if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
|
||||
local_irq_enable();
|
||||
activate_mm(active_mm, mm);
|
||||
+ lru_gen_activate_mm(mm);
|
||||
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
|
||||
local_irq_enable();
|
||||
tsk->mm->vmacache_seqnum = 0;
|
||||
--- a/include/linux/memcontrol.h
|
||||
+++ b/include/linux/memcontrol.h
|
||||
@@ -348,6 +348,10 @@ struct mem_cgroup {
|
||||
struct deferred_split deferred_split_queue;
|
||||
#endif
|
||||
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+ struct lru_gen_mm_list mm_list;
|
||||
+#endif
|
||||
+
|
||||
struct mem_cgroup_per_node *nodeinfo[];
|
||||
};
|
||||
|
||||
--- a/include/linux/mm_inline.h
|
||||
+++ b/include/linux/mm_inline.h
|
||||
@@ -100,6 +100,12 @@ static inline int lru_gen_from_seq(unsig
|
||||
return seq % MAX_NR_GENS;
|
||||
}
|
||||
|
||||
+/* Return a proper index regardless whether we keep stats for historical generations. */
|
||||
+static inline int lru_hist_from_seq(unsigned long seq)
|
||||
+{
|
||||
+ return seq % NR_HIST_GENS;
|
||||
+}
|
||||
+
|
||||
/* The youngest and the second youngest generations are counted as active. */
|
||||
static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
|
||||
{
|
||||
--- a/include/linux/mm_types.h
|
||||
+++ b/include/linux/mm_types.h
|
||||
@@ -3,6 +3,7 @@
|
||||
#define _LINUX_MM_TYPES_H
|
||||
|
||||
#include <linux/mm_types_task.h>
|
||||
+#include <linux/sched.h>
|
||||
|
||||
#include <linux/auxvec.h>
|
||||
#include <linux/list.h>
|
||||
@@ -15,6 +16,8 @@
|
||||
#include <linux/page-flags-layout.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/seqlock.h>
|
||||
+#include <linux/nodemask.h>
|
||||
+#include <linux/mmdebug.h>
|
||||
|
||||
#include <asm/mmu.h>
|
||||
|
||||
@@ -580,6 +583,18 @@ struct mm_struct {
|
||||
#ifdef CONFIG_IOMMU_SUPPORT
|
||||
u32 pasid;
|
||||
#endif
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+ struct {
|
||||
+ /* the node of a global or per-memcg mm_struct list */
|
||||
+ struct list_head list;
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+ /* points to the memcg of the owner task above */
|
||||
+ struct mem_cgroup *memcg;
|
||||
+#endif
|
||||
+ /* whether this mm_struct has been used since the last walk */
|
||||
+ nodemask_t nodes;
|
||||
+ } lrugen;
|
||||
+#endif /* CONFIG_LRU_GEN */
|
||||
} __randomize_layout;
|
||||
|
||||
/*
|
||||
@@ -606,6 +621,66 @@ static inline cpumask_t *mm_cpumask(stru
|
||||
return (struct cpumask *)&mm->cpu_bitmap;
|
||||
}
|
||||
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+
|
||||
+struct lru_gen_mm_list {
|
||||
+ /* a global or per-memcg mm_struct list */
|
||||
+ struct list_head fifo;
|
||||
+ /* protects the list above */
|
||||
+ spinlock_t lock;
|
||||
+};
|
||||
+
|
||||
+void lru_gen_add_mm(struct mm_struct *mm);
|
||||
+void lru_gen_del_mm(struct mm_struct *mm);
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+void lru_gen_migrate_mm(struct mm_struct *mm);
|
||||
+#endif
|
||||
+
|
||||
+static inline void lru_gen_init_mm(struct mm_struct *mm)
|
||||
+{
|
||||
+ INIT_LIST_HEAD(&mm->lrugen.list);
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+ mm->lrugen.memcg = NULL;
|
||||
+#endif
|
||||
+ nodes_clear(mm->lrugen.nodes);
|
||||
+}
|
||||
+
|
||||
+/* Track the usage of each mm_struct so that we can skip inactive ones. */
|
||||
+static inline void lru_gen_activate_mm(struct mm_struct *mm)
|
||||
+{
|
||||
+ /* unlikely but not a bug when racing with lru_gen_migrate_mm() */
|
||||
+ VM_WARN_ON(list_empty(&mm->lrugen.list));
|
||||
+
|
||||
+ if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lrugen.nodes))
|
||||
+ nodes_setall(mm->lrugen.nodes);
|
||||
+}
|
||||
+
|
||||
+#else /* !CONFIG_LRU_GEN */
|
||||
+
|
||||
+static inline void lru_gen_add_mm(struct mm_struct *mm)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+static inline void lru_gen_del_mm(struct mm_struct *mm)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+static inline void lru_gen_migrate_mm(struct mm_struct *mm)
|
||||
+{
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+static inline void lru_gen_init_mm(struct mm_struct *mm)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+static inline void lru_gen_activate_mm(struct mm_struct *mm)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+#endif /* CONFIG_LRU_GEN */
|
||||
+
|
||||
struct mmu_gather;
|
||||
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
|
||||
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
|
||||
--- a/include/linux/mmzone.h
|
||||
+++ b/include/linux/mmzone.h
|
||||
@@ -318,6 +318,13 @@ struct lruvec;
|
||||
#define MIN_NR_GENS 2
|
||||
#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
|
||||
|
||||
+/* Whether to keep stats for historical generations. */
|
||||
+#ifdef CONFIG_LRU_GEN_STATS
|
||||
+#define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
|
||||
+#else
|
||||
+#define NR_HIST_GENS 1U
|
||||
+#endif
|
||||
+
|
||||
struct lrugen {
|
||||
/* the aging increments the max generation number */
|
||||
unsigned long max_seq;
|
||||
@@ -333,13 +340,63 @@ struct lrugen {
|
||||
bool enabled[ANON_AND_FILE];
|
||||
};
|
||||
|
||||
+enum {
|
||||
+ MM_LEAF_TOTAL, /* total leaf entries */
|
||||
+ MM_LEAF_OLD, /* old leaf entries */
|
||||
+ MM_LEAF_YOUNG, /* young leaf entries */
|
||||
+ MM_NONLEAF_TOTAL, /* total non-leaf entries */
|
||||
+ MM_NONLEAF_PREV, /* previously worthy non-leaf entries */
|
||||
+ MM_NONLEAF_CUR, /* currently worthy non-leaf entries */
|
||||
+ NR_MM_STATS
|
||||
+};
|
||||
+
|
||||
+/* mnemonic codes for the stats above */
|
||||
+#define MM_STAT_CODES "toydpc"
|
||||
+
|
||||
+/* double buffering bloom filters */
|
||||
+#define NR_BLOOM_FILTERS 2
|
||||
+
|
||||
+struct lru_gen_mm_walk {
|
||||
+ /* set to max_seq after each round of walk */
|
||||
+ unsigned long seq;
|
||||
+ /* the next mm_struct on the list to walk */
|
||||
+ struct list_head *head;
|
||||
+ /* the first mm_struct never walked before */
|
||||
+ struct list_head *tail;
|
||||
+ /* to wait for the last walker to finish */
|
||||
+ struct wait_queue_head wait;
|
||||
+ /* bloom filters flip after each round of walk */
|
||||
+ unsigned long *filters[NR_BLOOM_FILTERS];
|
||||
+ /* page table stats for debugging */
|
||||
+ unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
|
||||
+ /* the number of concurrent walkers */
|
||||
+ int nr_walkers;
|
||||
+};
|
||||
+
|
||||
+#define MIN_BATCH_SIZE 64
|
||||
#define MAX_BATCH_SIZE 8192
|
||||
|
||||
+struct mm_walk_args {
|
||||
+ struct mem_cgroup *memcg;
|
||||
+ unsigned long max_seq;
|
||||
+ unsigned long start_pfn;
|
||||
+ unsigned long end_pfn;
|
||||
+ unsigned long next_addr;
|
||||
+ unsigned long bitmap[BITS_TO_LONGS(MIN_BATCH_SIZE)];
|
||||
+ int node_id;
|
||||
+ int swappiness;
|
||||
+ int batch_size;
|
||||
+ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
||||
+ int mm_stats[NR_MM_STATS];
|
||||
+ bool use_filter;
|
||||
+};
|
||||
+
|
||||
void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
|
||||
void lru_gen_change_state(bool enable, bool main, bool swap);
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
void lru_gen_init_memcg(struct mem_cgroup *memcg);
|
||||
+void lru_gen_free_memcg(struct mem_cgroup *memcg);
|
||||
#endif
|
||||
|
||||
#else /* !CONFIG_LRU_GEN */
|
||||
@@ -356,6 +413,10 @@ static inline void lru_gen_change_state(
|
||||
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
|
||||
{
|
||||
}
|
||||
+
|
||||
+static inline void lru_gen_free_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+}
|
||||
#endif
|
||||
|
||||
#endif /* CONFIG_LRU_GEN */
|
||||
@@ -380,6 +441,8 @@ struct lruvec {
|
||||
#ifdef CONFIG_LRU_GEN
|
||||
/* unevictable pages are on LRU_UNEVICTABLE */
|
||||
struct lrugen evictable;
|
||||
+ /* state for mm list and page table walks */
|
||||
+ struct lru_gen_mm_walk mm_walk;
|
||||
#endif
|
||||
#ifdef CONFIG_MEMCG
|
||||
struct pglist_data *pgdat;
|
||||
--- a/kernel/exit.c
|
||||
+++ b/kernel/exit.c
|
||||
@@ -469,6 +469,7 @@ assign_new_owner:
|
||||
goto retry;
|
||||
}
|
||||
WRITE_ONCE(mm->owner, c);
|
||||
+ lru_gen_migrate_mm(mm);
|
||||
task_unlock(c);
|
||||
put_task_struct(c);
|
||||
}
|
||||
--- a/kernel/fork.c
|
||||
+++ b/kernel/fork.c
|
||||
@@ -1083,6 +1083,7 @@ static struct mm_struct *mm_init(struct
|
||||
goto fail_nocontext;
|
||||
|
||||
mm->user_ns = get_user_ns(user_ns);
|
||||
+ lru_gen_init_mm(mm);
|
||||
return mm;
|
||||
|
||||
fail_nocontext:
|
||||
@@ -1125,6 +1126,7 @@ static inline void __mmput(struct mm_str
|
||||
}
|
||||
if (mm->binfmt)
|
||||
module_put(mm->binfmt->module);
|
||||
+ lru_gen_del_mm(mm);
|
||||
mmdrop(mm);
|
||||
}
|
||||
|
||||
@@ -2622,6 +2624,13 @@ pid_t kernel_clone(struct kernel_clone_a
|
||||
get_task_struct(p);
|
||||
}
|
||||
|
||||
+ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
|
||||
+ /* lock the task to synchronize with memcg migration */
|
||||
+ task_lock(p);
|
||||
+ lru_gen_add_mm(p->mm);
|
||||
+ task_unlock(p);
|
||||
+ }
|
||||
+
|
||||
wake_up_new_task(p);
|
||||
|
||||
/* forking complete and child started to run, tell ptracer */
|
||||
--- a/kernel/sched/core.c
|
||||
+++ b/kernel/sched/core.c
|
||||
@@ -5007,6 +5007,7 @@ context_switch(struct rq *rq, struct tas
|
||||
* finish_task_switch()'s mmdrop().
|
||||
*/
|
||||
switch_mm_irqs_off(prev->active_mm, next->mm, next);
|
||||
+ lru_gen_activate_mm(next->mm);
|
||||
|
||||
if (!prev->mm) { // from kernel
|
||||
/* will mmdrop() in finish_task_switch(). */
|
||||
--- a/mm/memcontrol.c
|
||||
+++ b/mm/memcontrol.c
|
||||
@@ -5178,6 +5178,7 @@ static void __mem_cgroup_free(struct mem
|
||||
|
||||
static void mem_cgroup_free(struct mem_cgroup *memcg)
|
||||
{
|
||||
+ lru_gen_free_memcg(memcg);
|
||||
memcg_wb_domain_exit(memcg);
|
||||
__mem_cgroup_free(memcg);
|
||||
}
|
||||
@@ -6210,6 +6211,29 @@ static void mem_cgroup_move_task(void)
|
||||
}
|
||||
#endif
|
||||
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
|
||||
+{
|
||||
+ struct cgroup_subsys_state *css;
|
||||
+ struct task_struct *task = NULL;
|
||||
+
|
||||
+ cgroup_taskset_for_each_leader(task, css, tset)
|
||||
+ break;
|
||||
+
|
||||
+ if (!task)
|
||||
+ return;
|
||||
+
|
||||
+ task_lock(task);
|
||||
+ if (task->mm && task->mm->owner == task)
|
||||
+ lru_gen_migrate_mm(task->mm);
|
||||
+ task_unlock(task);
|
||||
+}
|
||||
+#else
|
||||
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
|
||||
+{
|
||||
+}
|
||||
+#endif /* CONFIG_LRU_GEN */
|
||||
+
|
||||
static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
|
||||
{
|
||||
if (value == PAGE_COUNTER_MAX)
|
||||
@@ -6553,6 +6577,7 @@ struct cgroup_subsys memory_cgrp_subsys
|
||||
.css_reset = mem_cgroup_css_reset,
|
||||
.css_rstat_flush = mem_cgroup_css_rstat_flush,
|
||||
.can_attach = mem_cgroup_can_attach,
|
||||
+ .attach = mem_cgroup_attach,
|
||||
.cancel_attach = mem_cgroup_cancel_attach,
|
||||
.post_attach = mem_cgroup_move_task,
|
||||
.dfl_cftypes = memory_files,
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -2864,6 +2864,306 @@ static bool __maybe_unused seq_is_valid(
|
||||
}
|
||||
|
||||
/******************************************************************************
|
||||
+ * mm_struct list
|
||||
+ ******************************************************************************/
|
||||
+
|
||||
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+ static struct lru_gen_mm_list mm_list = {
|
||||
+ .fifo = LIST_HEAD_INIT(mm_list.fifo),
|
||||
+ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
|
||||
+ };
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+ if (memcg)
|
||||
+ return &memcg->mm_list;
|
||||
+#endif
|
||||
+ return &mm_list;
|
||||
+}
|
||||
+
|
||||
+void lru_gen_add_mm(struct mm_struct *mm)
|
||||
+{
|
||||
+ int nid;
|
||||
+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
|
||||
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
|
||||
+
|
||||
+ VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm);
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+ VM_BUG_ON_MM(mm->lrugen.memcg, mm);
|
||||
+ mm->lrugen.memcg = memcg;
|
||||
+#endif
|
||||
+ spin_lock(&mm_list->lock);
|
||||
+
|
||||
+ list_add_tail(&mm->lrugen.list, &mm_list->fifo);
|
||||
+
|
||||
+ for_each_node(nid) {
|
||||
+ struct lruvec *lruvec = get_lruvec(nid, memcg);
|
||||
+
|
||||
+ if (!lruvec)
|
||||
+ continue;
|
||||
+
|
||||
+ if (lruvec->mm_walk.tail == &mm_list->fifo)
|
||||
+ lruvec->mm_walk.tail = lruvec->mm_walk.tail->prev;
|
||||
+ }
|
||||
+
|
||||
+ spin_unlock(&mm_list->lock);
|
||||
+}
|
||||
+
|
||||
+void lru_gen_del_mm(struct mm_struct *mm)
|
||||
+{
|
||||
+ int nid;
|
||||
+ struct lru_gen_mm_list *mm_list;
|
||||
+ struct mem_cgroup *memcg = NULL;
|
||||
+
|
||||
+ if (list_empty(&mm->lrugen.list))
|
||||
+ return;
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+ memcg = mm->lrugen.memcg;
|
||||
+#endif
|
||||
+ mm_list = get_mm_list(memcg);
|
||||
+
|
||||
+ spin_lock(&mm_list->lock);
|
||||
+
|
||||
+ for_each_node(nid) {
|
||||
+ struct lruvec *lruvec = get_lruvec(nid, memcg);
|
||||
+
|
||||
+ if (!lruvec)
|
||||
+ continue;
|
||||
+
|
||||
+ if (lruvec->mm_walk.tail == &mm->lrugen.list)
|
||||
+ lruvec->mm_walk.tail = lruvec->mm_walk.tail->next;
|
||||
+
|
||||
+ if (lruvec->mm_walk.head != &mm->lrugen.list)
|
||||
+ continue;
|
||||
+
|
||||
+ lruvec->mm_walk.head = lruvec->mm_walk.head->next;
|
||||
+ if (lruvec->mm_walk.head == &mm_list->fifo)
|
||||
+ WRITE_ONCE(lruvec->mm_walk.seq, lruvec->mm_walk.seq + 1);
|
||||
+ }
|
||||
+
|
||||
+ list_del_init(&mm->lrugen.list);
|
||||
+
|
||||
+ spin_unlock(&mm_list->lock);
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+ mem_cgroup_put(mm->lrugen.memcg);
|
||||
+ mm->lrugen.memcg = NULL;
|
||||
+#endif
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+void lru_gen_migrate_mm(struct mm_struct *mm)
|
||||
+{
|
||||
+ struct mem_cgroup *memcg;
|
||||
+
|
||||
+ lockdep_assert_held(&mm->owner->alloc_lock);
|
||||
+
|
||||
+ if (mem_cgroup_disabled())
|
||||
+ return;
|
||||
+
|
||||
+ rcu_read_lock();
|
||||
+ memcg = mem_cgroup_from_task(mm->owner);
|
||||
+ rcu_read_unlock();
|
||||
+ if (memcg == mm->lrugen.memcg)
|
||||
+ return;
|
||||
+
|
||||
+ VM_BUG_ON_MM(!mm->lrugen.memcg, mm);
|
||||
+ VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm);
|
||||
+
|
||||
+ lru_gen_del_mm(mm);
|
||||
+ lru_gen_add_mm(mm);
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+#define BLOOM_FILTER_SHIFT 15
|
||||
+
|
||||
+static inline int filter_gen_from_seq(unsigned long seq)
|
||||
+{
|
||||
+ return seq % NR_BLOOM_FILTERS;
|
||||
+}
|
||||
+
|
||||
+static void get_item_key(void *item, int *key)
|
||||
+{
|
||||
+ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
|
||||
+
|
||||
+ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
|
||||
+
|
||||
+ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
|
||||
+ key[1] = hash >> BLOOM_FILTER_SHIFT;
|
||||
+}
|
||||
+
|
||||
+static void clear_bloom_filter(struct lruvec *lruvec, unsigned long seq)
|
||||
+{
|
||||
+ unsigned long *filter;
|
||||
+ int gen = filter_gen_from_seq(seq);
|
||||
+
|
||||
+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
|
||||
+
|
||||
+ filter = lruvec->mm_walk.filters[gen];
|
||||
+ if (filter) {
|
||||
+ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), GFP_ATOMIC);
|
||||
+ WRITE_ONCE(lruvec->mm_walk.filters[gen], filter);
|
||||
+}
|
||||
+
|
||||
+static void set_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
|
||||
+{
|
||||
+ int key[2];
|
||||
+ unsigned long *filter;
|
||||
+ int gen = filter_gen_from_seq(seq);
|
||||
+
|
||||
+ filter = READ_ONCE(lruvec->mm_walk.filters[gen]);
|
||||
+ if (!filter)
|
||||
+ return;
|
||||
+
|
||||
+ get_item_key(item, key);
|
||||
+
|
||||
+ if (!test_bit(key[0], filter))
|
||||
+ set_bit(key[0], filter);
|
||||
+ if (!test_bit(key[1], filter))
|
||||
+ set_bit(key[1], filter);
|
||||
+}
|
||||
+
|
||||
+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
|
||||
+{
|
||||
+ int key[2];
|
||||
+ unsigned long *filter;
|
||||
+ int gen = filter_gen_from_seq(seq);
|
||||
+
|
||||
+ filter = READ_ONCE(lruvec->mm_walk.filters[gen]);
|
||||
+ if (!filter)
|
||||
+ return false;
|
||||
+
|
||||
+ get_item_key(item, key);
|
||||
+
|
||||
+ return test_bit(key[0], filter) && test_bit(key[1], filter);
|
||||
+}
|
||||
+
|
||||
+static void reset_mm_stats(struct lruvec *lruvec, bool last, struct mm_walk_args *args)
|
||||
+{
|
||||
+ int i;
|
||||
+ int hist = lru_hist_from_seq(args->max_seq);
|
||||
+
|
||||
+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
|
||||
+
|
||||
+ for (i = 0; i < NR_MM_STATS; i++) {
|
||||
+ WRITE_ONCE(lruvec->mm_walk.stats[hist][i],
|
||||
+ lruvec->mm_walk.stats[hist][i] + args->mm_stats[i]);
|
||||
+ args->mm_stats[i] = 0;
|
||||
+ }
|
||||
+
|
||||
+ if (!last || NR_HIST_GENS == 1)
|
||||
+ return;
|
||||
+
|
||||
+ hist = lru_hist_from_seq(args->max_seq + 1);
|
||||
+ for (i = 0; i < NR_MM_STATS; i++)
|
||||
+ WRITE_ONCE(lruvec->mm_walk.stats[hist][i], 0);
|
||||
+}
|
||||
+
|
||||
+static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args)
|
||||
+{
|
||||
+ int type;
|
||||
+ unsigned long size = 0;
|
||||
+
|
||||
+ if (cpumask_empty(mm_cpumask(mm)) && !node_isset(args->node_id, mm->lrugen.nodes))
|
||||
+ return true;
|
||||
+
|
||||
+ if (mm_is_oom_victim(mm))
|
||||
+ return true;
|
||||
+
|
||||
+ for (type = !args->swappiness; type < ANON_AND_FILE; type++) {
|
||||
+ size += type ? get_mm_counter(mm, MM_FILEPAGES) :
|
||||
+ get_mm_counter(mm, MM_ANONPAGES) +
|
||||
+ get_mm_counter(mm, MM_SHMEMPAGES);
|
||||
+ }
|
||||
+
|
||||
+ if (size < MIN_BATCH_SIZE)
|
||||
+ return true;
|
||||
+
|
||||
+ if (!mmget_not_zero(mm))
|
||||
+ return true;
|
||||
+
|
||||
+ node_clear(args->node_id, mm->lrugen.nodes);
|
||||
+
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+/* To support multiple walkers that concurrently walk an mm_struct list. */
|
||||
+static bool get_next_mm(struct lruvec *lruvec, struct mm_walk_args *args,
|
||||
+ struct mm_struct **iter)
|
||||
+{
|
||||
+ bool first = false;
|
||||
+ bool last = true;
|
||||
+ struct mm_struct *mm = NULL;
|
||||
+ struct lru_gen_mm_walk *mm_walk = &lruvec->mm_walk;
|
||||
+ struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg);
|
||||
+
|
||||
+ if (*iter)
|
||||
+ mmput_async(*iter);
|
||||
+ else if (args->max_seq <= READ_ONCE(mm_walk->seq))
|
||||
+ return false;
|
||||
+
|
||||
+ spin_lock(&mm_list->lock);
|
||||
+
|
||||
+ VM_BUG_ON(args->max_seq > mm_walk->seq + 1);
|
||||
+ VM_BUG_ON(*iter && args->max_seq < mm_walk->seq);
|
||||
+ VM_BUG_ON(*iter && !mm_walk->nr_walkers);
|
||||
+
|
||||
+ if (args->max_seq <= mm_walk->seq) {
|
||||
+ if (!*iter)
|
||||
+ last = false;
|
||||
+ goto done;
|
||||
+ }
|
||||
+
|
||||
+ if (mm_walk->head == &mm_list->fifo) {
|
||||
+ VM_BUG_ON(mm_walk->nr_walkers);
|
||||
+ mm_walk->head = mm_walk->head->next;
|
||||
+ first = true;
|
||||
+ }
|
||||
+
|
||||
+ while (!mm && mm_walk->head != &mm_list->fifo) {
|
||||
+ mm = list_entry(mm_walk->head, struct mm_struct, lrugen.list);
|
||||
+
|
||||
+ mm_walk->head = mm_walk->head->next;
|
||||
+
|
||||
+ if (mm_walk->tail == &mm->lrugen.list) {
|
||||
+ mm_walk->tail = mm_walk->tail->next;
|
||||
+ args->use_filter = false;
|
||||
+ }
|
||||
+
|
||||
+ if (should_skip_mm(mm, args))
|
||||
+ mm = NULL;
|
||||
+ }
|
||||
+
|
||||
+ if (mm_walk->head == &mm_list->fifo)
|
||||
+ WRITE_ONCE(mm_walk->seq, mm_walk->seq + 1);
|
||||
+done:
|
||||
+ if (*iter && !mm)
|
||||
+ mm_walk->nr_walkers--;
|
||||
+ if (!*iter && mm)
|
||||
+ mm_walk->nr_walkers++;
|
||||
+
|
||||
+ if (mm_walk->nr_walkers)
|
||||
+ last = false;
|
||||
+
|
||||
+ if (mm && first)
|
||||
+ clear_bloom_filter(lruvec, args->max_seq + 1);
|
||||
+
|
||||
+ if (*iter || last)
|
||||
+ reset_mm_stats(lruvec, last, args);
|
||||
+
|
||||
+ spin_unlock(&mm_list->lock);
|
||||
+
|
||||
+ *iter = mm;
|
||||
+
|
||||
+ return last;
|
||||
+}
|
||||
+
|
||||
+/******************************************************************************
|
||||
* state change
|
||||
******************************************************************************/
|
||||
|
||||
@@ -3047,6 +3347,7 @@ void lru_gen_init_state(struct mem_cgrou
|
||||
int i;
|
||||
int gen, type, zone;
|
||||
struct lrugen *lrugen = &lruvec->evictable;
|
||||
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
|
||||
|
||||
lrugen->max_seq = MIN_NR_GENS + 1;
|
||||
lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
|
||||
@@ -3057,6 +3358,17 @@ void lru_gen_init_state(struct mem_cgrou
|
||||
|
||||
for_each_gen_type_zone(gen, type, zone)
|
||||
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
|
||||
+
|
||||
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg)
|
||||
+ spin_lock(&mm_list->lock);
|
||||
+
|
||||
+ lruvec->mm_walk.seq = MIN_NR_GENS;
|
||||
+ lruvec->mm_walk.head = &mm_list->fifo;
|
||||
+ lruvec->mm_walk.tail = &mm_list->fifo;
|
||||
+ init_waitqueue_head(&lruvec->mm_walk.wait);
|
||||
+
|
||||
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg)
|
||||
+ spin_unlock(&mm_list->lock);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
@@ -3064,18 +3376,37 @@ void lru_gen_init_memcg(struct mem_cgrou
|
||||
{
|
||||
int nid;
|
||||
|
||||
+ INIT_LIST_HEAD(&memcg->mm_list.fifo);
|
||||
+ spin_lock_init(&memcg->mm_list.lock);
|
||||
+
|
||||
for_each_node(nid) {
|
||||
struct lruvec *lruvec = get_lruvec(nid, memcg);
|
||||
|
||||
lru_gen_init_state(memcg, lruvec);
|
||||
}
|
||||
}
|
||||
+
|
||||
+void lru_gen_free_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+ int nid;
|
||||
+
|
||||
+ for_each_node(nid) {
|
||||
+ int i;
|
||||
+ struct lruvec *lruvec = get_lruvec(nid, memcg);
|
||||
+
|
||||
+ for (i = 0; i < NR_BLOOM_FILTERS; i++) {
|
||||
+ bitmap_free(lruvec->mm_walk.filters[i]);
|
||||
+ lruvec->mm_walk.filters[i] = NULL;
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
#endif
|
||||
|
||||
static int __init init_lru_gen(void)
|
||||
{
|
||||
BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
|
||||
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
|
||||
+ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
|
||||
|
||||
return 0;
|
||||
};
|
File diff suppressed because it is too large
File diff suppressed because it is too large
|
@@ -0,0 +1,491 @@
|
|||
From e4277535f6d6708bb19b88c4bad155832671d69b Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:04 -0600
Subject: [PATCH 07/29] mm: multi-gen LRU: exploit locality in rmap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Searching the rmap for PTEs mapping each page on an LRU list (to test and
clear the accessed bit) can be expensive because pages from different VMAs
(PA space) are not cache friendly to the rmap (VA space). For workloads
mostly using mapped pages, searching the rmap can incur the highest CPU
cost in the reclaim path.

This patch exploits spatial locality to reduce the trips into the rmap.
When shrink_page_list() walks the rmap and finds a young PTE, a new
function lru_gen_look_around() scans at most BITS_PER_LONG-1 adjacent
PTEs. On finding another young PTE, it clears the accessed bit and
updates the gen counter of the page mapped by this PTE to
(max_seq%MAX_NR_GENS)+1.
|
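As an illustration of the arithmetic above (an editorial sketch, not part of the original patch text): MAX_NR_GENS = 4 and the +1 offset that reserves 0 for "not on a multi-gen LRU list" are assumptions taken from the rest of the series.

  /* hypothetical userspace sketch of the generation tagging arithmetic */
  #include <stdio.h>

  #define MAX_NR_GENS 4                     /* assumed default from the series */

  /* the sequence counter grows monotonically; the gen index wraps around */
  static int lru_gen_from_seq(unsigned long seq)
  {
          return seq % MAX_NR_GENS;
  }

  int main(void)
  {
          unsigned long max_seq = 9;                 /* made-up value */
          int gen = lru_gen_from_seq(max_seq);       /* 9 % 4 = 1 */

          /* page flags store gen + 1 so that 0 can mean "not tracked" */
          printf("young page tagged with gen %d, stored as %d\n", gen, gen + 1);
          return 0;
  }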
||||
|
||||
Server benchmark results:
|
||||
Single workload:
|
||||
fio (buffered I/O): no change
|
||||
|
||||
Single workload:
|
||||
memcached (anon): +[3, 5]%
|
||||
Ops/sec KB/sec
|
||||
patch1-6: 1106168.46 43025.04
|
||||
patch1-7: 1147696.57 44640.29
|
||||
|
||||
Configurations:
|
||||
no change
|
||||
|
||||
Client benchmark results:
|
||||
kswapd profiles:
|
||||
patch1-6
|
||||
39.03% lzo1x_1_do_compress (real work)
|
||||
18.47% page_vma_mapped_walk (overhead)
|
||||
6.74% _raw_spin_unlock_irq
|
||||
3.97% do_raw_spin_lock
|
||||
2.49% ptep_clear_flush
|
||||
2.48% anon_vma_interval_tree_iter_first
|
||||
1.92% page_referenced_one
|
||||
1.88% __zram_bvec_write
|
||||
1.48% memmove
|
||||
1.31% vma_interval_tree_iter_next
|
||||
|
||||
patch1-7
|
||||
48.16% lzo1x_1_do_compress (real work)
|
||||
8.20% page_vma_mapped_walk (overhead)
|
||||
7.06% _raw_spin_unlock_irq
|
||||
2.92% ptep_clear_flush
|
||||
2.53% __zram_bvec_write
|
||||
2.11% do_raw_spin_lock
|
||||
2.02% memmove
|
||||
1.93% lru_gen_look_around
|
||||
1.56% free_unref_page_list
|
||||
1.40% memset
|
||||
|
||||
Configurations:
|
||||
no change
|
||||
|
||||
Link: https://lkml.kernel.org/r/20220918080010.2920238-8-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Acked-by: Barry Song <baohua@kernel.org>
|
||||
Acked-by: Brian Geffon <bgeffon@google.com>
|
||||
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
||||
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
Acked-by: Steven Barrett <steven@liquorix.net>
|
||||
Acked-by: Suleiman Souhlal <suleiman@google.com>
|
||||
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
|
||||
Tested-by: Donald Carr <d@chaos-reins.com>
|
||||
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
|
||||
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
||||
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
|
||||
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
|
||||
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
|
||||
Cc: Andi Kleen <ak@linux.intel.com>
|
||||
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
|
||||
Cc: Catalin Marinas <catalin.marinas@arm.com>
|
||||
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Cc: Hillf Danton <hdanton@sina.com>
|
||||
Cc: Jens Axboe <axboe@kernel.dk>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
Cc: Matthew Wilcox <willy@infradead.org>
|
||||
Cc: Mel Gorman <mgorman@suse.de>
|
||||
Cc: Miaohe Lin <linmiaohe@huawei.com>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@linux.ibm.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
|
||||
Cc: Tejun Heo <tj@kernel.org>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: Will Deacon <will@kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/memcontrol.h | 31 +++++++
|
||||
include/linux/mmzone.h | 6 ++
|
||||
mm/internal.h | 1 +
|
||||
mm/memcontrol.c | 1 +
|
||||
mm/rmap.c | 7 ++
|
||||
mm/swap.c | 4 +-
|
||||
mm/vmscan.c | 184 +++++++++++++++++++++++++++++++++++++
|
||||
7 files changed, 232 insertions(+), 2 deletions(-)
|
||||
|
||||
--- a/include/linux/memcontrol.h
|
||||
+++ b/include/linux/memcontrol.h
|
||||
@@ -442,6 +442,7 @@ static inline struct obj_cgroup *__page_
|
||||
* - LRU isolation
|
||||
* - lock_page_memcg()
|
||||
* - exclusive reference
|
||||
+ * - mem_cgroup_trylock_pages()
|
||||
*
|
||||
* For a kmem page a caller should hold an rcu read lock to protect memcg
|
||||
* associated with a kmem page from being released.
|
||||
@@ -497,6 +498,7 @@ static inline struct mem_cgroup *page_me
|
||||
* - LRU isolation
|
||||
* - lock_page_memcg()
|
||||
* - exclusive reference
|
||||
+ * - mem_cgroup_trylock_pages()
|
||||
*
|
||||
* For a kmem page a caller should hold an rcu read lock to protect memcg
|
||||
* associated with a kmem page from being released.
|
||||
@@ -953,6 +955,23 @@ void unlock_page_memcg(struct page *page
|
||||
|
||||
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
|
||||
|
||||
+/* try to stablize page_memcg() for all the pages in a memcg */
|
||||
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+ rcu_read_lock();
|
||||
+
|
||||
+ if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
|
||||
+ return true;
|
||||
+
|
||||
+ rcu_read_unlock();
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+static inline void mem_cgroup_unlock_pages(void)
|
||||
+{
|
||||
+ rcu_read_unlock();
|
||||
+}
|
||||
+
|
||||
/* idx can be of type enum memcg_stat_item or node_stat_item */
|
||||
static inline void mod_memcg_state(struct mem_cgroup *memcg,
|
||||
int idx, int val)
|
||||
@@ -1369,6 +1388,18 @@ static inline void unlock_page_memcg(str
|
||||
{
|
||||
}
|
||||
|
||||
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+ /* to match page_memcg_rcu() */
|
||||
+ rcu_read_lock();
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static inline void mem_cgroup_unlock_pages(void)
|
||||
+{
|
||||
+ rcu_read_unlock();
|
||||
+}
|
||||
+
|
||||
static inline void mem_cgroup_handle_over_high(void)
|
||||
{
|
||||
}
|
||||
--- a/include/linux/mmzone.h
|
||||
+++ b/include/linux/mmzone.h
|
||||
@@ -352,6 +352,7 @@ enum lruvec_flags {
|
||||
#ifndef __GENERATING_BOUNDS_H
|
||||
|
||||
struct lruvec;
|
||||
+struct page_vma_mapped_walk;
|
||||
|
||||
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
|
||||
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
|
||||
@@ -407,6 +408,7 @@ struct lru_gen_struct {
|
||||
};
|
||||
|
||||
void lru_gen_init_lruvec(struct lruvec *lruvec);
|
||||
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
void lru_gen_init_memcg(struct mem_cgroup *memcg);
|
||||
@@ -419,6 +421,10 @@ static inline void lru_gen_init_lruvec(s
|
||||
{
|
||||
}
|
||||
|
||||
+static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
#ifdef CONFIG_MEMCG
|
||||
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
|
||||
{
|
||||
--- a/mm/internal.h
|
||||
+++ b/mm/internal.h
|
||||
@@ -35,6 +35,7 @@
|
||||
void page_writeback_init(void);
|
||||
|
||||
vm_fault_t do_swap_page(struct vm_fault *vmf);
|
||||
+void activate_page(struct page *page);
|
||||
|
||||
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
|
||||
unsigned long floor, unsigned long ceiling);
|
||||
--- a/mm/memcontrol.c
|
||||
+++ b/mm/memcontrol.c
|
||||
@@ -2798,6 +2798,7 @@ static void commit_charge(struct page *p
|
||||
* - LRU isolation
|
||||
* - lock_page_memcg()
|
||||
* - exclusive reference
|
||||
+ * - mem_cgroup_trylock_pages()
|
||||
*/
|
||||
page->memcg_data = (unsigned long)memcg;
|
||||
}
|
||||
--- a/mm/rmap.c
|
||||
+++ b/mm/rmap.c
|
||||
@@ -73,6 +73,7 @@
|
||||
#include <linux/page_idle.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/userfaultfd_k.h>
|
||||
+#include <linux/mm_inline.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
@@ -793,6 +794,12 @@ static bool page_referenced_one(struct p
|
||||
}
|
||||
|
||||
if (pvmw.pte) {
|
||||
+ if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
|
||||
+ !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
|
||||
+ lru_gen_look_around(&pvmw);
|
||||
+ referenced++;
|
||||
+ }
|
||||
+
|
||||
if (ptep_clear_flush_young_notify(vma, address,
|
||||
pvmw.pte)) {
|
||||
/*
|
||||
--- a/mm/swap.c
|
||||
+++ b/mm/swap.c
|
||||
@@ -325,7 +325,7 @@ static bool need_activate_page_drain(int
|
||||
return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
|
||||
}
|
||||
|
||||
-static void activate_page(struct page *page)
|
||||
+void activate_page(struct page *page)
|
||||
{
|
||||
page = compound_head(page);
|
||||
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
|
||||
@@ -345,7 +345,7 @@ static inline void activate_page_drain(i
|
||||
{
|
||||
}
|
||||
|
||||
-static void activate_page(struct page *page)
|
||||
+void activate_page(struct page *page)
|
||||
{
|
||||
struct lruvec *lruvec;
|
||||
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -1409,6 +1409,11 @@ retry:
|
||||
if (!sc->may_unmap && page_mapped(page))
|
||||
goto keep_locked;
|
||||
|
||||
+ /* page_update_gen() tried to promote this page? */
|
||||
+ if (lru_gen_enabled() && !ignore_references &&
|
||||
+ page_mapped(page) && PageReferenced(page))
|
||||
+ goto keep_locked;
|
||||
+
|
||||
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
|
||||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
|
||||
|
||||
@@ -2990,6 +2995,29 @@ static bool positive_ctrl_err(struct ctr
|
||||
* the aging
|
||||
******************************************************************************/
|
||||
|
||||
+/* promote pages accessed through page tables */
|
||||
+static int page_update_gen(struct page *page, int gen)
|
||||
+{
|
||||
+ unsigned long new_flags, old_flags = READ_ONCE(page->flags);
|
||||
+
|
||||
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
|
||||
+ VM_WARN_ON_ONCE(!rcu_read_lock_held());
|
||||
+
|
||||
+ do {
|
||||
+ /* lru_gen_del_page() has isolated this page? */
|
||||
+ if (!(old_flags & LRU_GEN_MASK)) {
|
||||
+ /* for shrink_page_list() */
|
||||
+ new_flags = old_flags | BIT(PG_referenced);
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
|
||||
+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
|
||||
+ } while (!try_cmpxchg(&page->flags, &old_flags, new_flags));
|
||||
+
|
||||
+ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
||||
+}
|
||||
+
|
||||
/* protect pages accessed multiple times through file descriptors */
|
||||
static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
||||
{
|
||||
@@ -3001,6 +3029,11 @@ static int page_inc_gen(struct lruvec *l
|
||||
VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page);
|
||||
|
||||
do {
|
||||
+ new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
||||
+ /* page_update_gen() has promoted this page? */
|
||||
+ if (new_gen >= 0 && new_gen != old_gen)
|
||||
+ return new_gen;
|
||||
+
|
||||
new_gen = (old_gen + 1) % MAX_NR_GENS;
|
||||
|
||||
new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
|
||||
@@ -3015,6 +3048,43 @@ static int page_inc_gen(struct lruvec *l
|
||||
return new_gen;
|
||||
}
|
||||
|
||||
+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
|
||||
+{
|
||||
+ unsigned long pfn = pte_pfn(pte);
|
||||
+
|
||||
+ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
|
||||
+
|
||||
+ if (!pte_present(pte) || is_zero_pfn(pfn))
|
||||
+ return -1;
|
||||
+
|
||||
+ if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
|
||||
+ return -1;
|
||||
+
|
||||
+ if (WARN_ON_ONCE(!pfn_valid(pfn)))
|
||||
+ return -1;
|
||||
+
|
||||
+ return pfn;
|
||||
+}
|
||||
+
|
||||
+static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg,
|
||||
+ struct pglist_data *pgdat)
|
||||
+{
|
||||
+ struct page *page;
|
||||
+
|
||||
+ /* try to avoid unnecessary memory loads */
|
||||
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
|
||||
+ return NULL;
|
||||
+
|
||||
+ page = compound_head(pfn_to_page(pfn));
|
||||
+ if (page_to_nid(page) != pgdat->node_id)
|
||||
+ return NULL;
|
||||
+
|
||||
+ if (page_memcg_rcu(page) != memcg)
|
||||
+ return NULL;
|
||||
+
|
||||
+ return page;
|
||||
+}
|
||||
+
|
||||
static void inc_min_seq(struct lruvec *lruvec, int type)
|
||||
{
|
||||
struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
@@ -3214,6 +3284,114 @@ static void lru_gen_age_node(struct pgli
|
||||
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * This function exploits spatial locality when shrink_page_list() walks the
|
||||
+ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
|
||||
+ */
|
||||
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
|
||||
+{
|
||||
+ int i;
|
||||
+ pte_t *pte;
|
||||
+ unsigned long start;
|
||||
+ unsigned long end;
|
||||
+ unsigned long addr;
|
||||
+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
|
||||
+ struct page *page = pvmw->page;
|
||||
+ struct mem_cgroup *memcg = page_memcg(page);
|
||||
+ struct pglist_data *pgdat = page_pgdat(page);
|
||||
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
|
||||
+ DEFINE_MAX_SEQ(lruvec);
|
||||
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
|
||||
+
|
||||
+ lockdep_assert_held(pvmw->ptl);
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageLRU(page), page);
|
||||
+
|
||||
+ if (spin_is_contended(pvmw->ptl))
|
||||
+ return;
|
||||
+
|
||||
+ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
|
||||
+ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
|
||||
+
|
||||
+ if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
|
||||
+ if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
|
||||
+ end = start + MIN_LRU_BATCH * PAGE_SIZE;
|
||||
+ else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
|
||||
+ start = end - MIN_LRU_BATCH * PAGE_SIZE;
|
||||
+ else {
|
||||
+ start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
|
||||
+ end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
|
||||
+
|
||||
+ rcu_read_lock();
|
||||
+ arch_enter_lazy_mmu_mode();
|
||||
+
|
||||
+ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
|
||||
+ unsigned long pfn;
|
||||
+
|
||||
+ pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
|
||||
+ if (pfn == -1)
|
||||
+ continue;
|
||||
+
|
||||
+ if (!pte_young(pte[i]))
|
||||
+ continue;
|
||||
+
|
||||
+ page = get_pfn_page(pfn, memcg, pgdat);
|
||||
+ if (!page)
|
||||
+ continue;
|
||||
+
|
||||
+ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
|
||||
+ VM_WARN_ON_ONCE(true);
|
||||
+
|
||||
+ if (pte_dirty(pte[i]) && !PageDirty(page) &&
|
||||
+ !(PageAnon(page) && PageSwapBacked(page) &&
|
||||
+ !PageSwapCache(page)))
|
||||
+ set_page_dirty(page);
|
||||
+
|
||||
+ old_gen = page_lru_gen(page);
|
||||
+ if (old_gen < 0)
|
||||
+ SetPageReferenced(page);
|
||||
+ else if (old_gen != new_gen)
|
||||
+ __set_bit(i, bitmap);
|
||||
+ }
|
||||
+
|
||||
+ arch_leave_lazy_mmu_mode();
|
||||
+ rcu_read_unlock();
|
||||
+
|
||||
+ if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
|
||||
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
|
||||
+ page = pte_page(pte[i]);
|
||||
+ activate_page(page);
|
||||
+ }
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ /* page_update_gen() requires stable page_memcg() */
|
||||
+ if (!mem_cgroup_trylock_pages(memcg))
|
||||
+ return;
|
||||
+
|
||||
+ spin_lock_irq(&lruvec->lru_lock);
|
||||
+ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
|
||||
+
|
||||
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
|
||||
+ page = compound_head(pte_page(pte[i]));
|
||||
+ if (page_memcg_rcu(page) != memcg)
|
||||
+ continue;
|
||||
+
|
||||
+ old_gen = page_update_gen(page, new_gen);
|
||||
+ if (old_gen < 0 || old_gen == new_gen)
|
||||
+ continue;
|
||||
+
|
||||
+ lru_gen_update_size(lruvec, page, old_gen, new_gen);
|
||||
+ }
|
||||
+
|
||||
+ spin_unlock_irq(&lruvec->lru_lock);
|
||||
+
|
||||
+ mem_cgroup_unlock_pages();
|
||||
+}
|
||||
+
|
||||
/******************************************************************************
|
||||
* the eviction
|
||||
******************************************************************************/
|
||||
@@ -3250,6 +3428,12 @@ static bool sort_page(struct lruvec *lru
|
||||
return true;
|
||||
}
|
||||
|
||||
+ /* promoted */
|
||||
+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
|
||||
+ list_move(&page->lru, &lrugen->lists[gen][type][zone]);
|
||||
+ return true;
|
||||
+ }
|
||||
+
|
||||
/* protected */
|
||||
if (tier > tier_idx) {
|
||||
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
|
File diff suppressed because it is too large
File diff suppressed because it is too large
|
@@ -1,496 +0,0 @@
|
|||
From 5cc7fdec54e87e32b4fb0f07d84b21769d5f8d92 Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Mon, 25 Jan 2021 21:38:02 -0700
|
||||
Subject: [PATCH 08/10] mm: multigenerational lru: user interface
|
||||
|
||||
Add /sys/kernel/mm/lru_gen/enabled to enable and disable the
|
||||
multigenerational lru at runtime.
|
||||
|
||||
Add /sys/kernel/mm/lru_gen/min_ttl_ms to protect the working set of a
|
||||
given number of milliseconds. The OOM killer is invoked if this
|
||||
working set cannot be kept in memory.
|
||||
|
||||
Add /sys/kernel/debug/lru_gen to monitor the multigenerational lru and
|
||||
invoke the aging and the eviction. This file has the following output:
|
||||
memcg memcg_id memcg_path
|
||||
node node_id
|
||||
min_gen birth_time anon_size file_size
|
||||
...
|
||||
max_gen birth_time anon_size file_size
|
||||
|
||||
min_gen is the oldest generation number and max_gen is the youngest
|
||||
generation number. birth_time is in milliseconds. anon_size and
|
||||
file_size are in pages.
|
||||
|
||||
This file takes the following input:
|
||||
+ memcg_id node_id max_gen [swappiness] [use_bloom_filter]
|
||||
- memcg_id node_id min_gen [swappiness] [nr_to_reclaim]
|
||||
|
||||
The first command line invokes the aging, which scans PTEs for
|
||||
accessed pages and then creates the next generation max_gen+1. A swap
|
||||
file and a non-zero swappiness, which overrides vm.swappiness, are
|
||||
required to scan PTEs mapping anon pages. The second command line
|
||||
invokes the eviction, which evicts generations less than or equal to
|
||||
min_gen. min_gen should be less than max_gen-1 as max_gen and
|
||||
max_gen-1 are not fully aged and therefore cannot be evicted.
|
||||
Setting nr_to_reclaim to N limits the number of pages to evict.
|
||||
Setting use_bloom_filter to 0 overrides the default behavior which
|
||||
only scans PTE tables found populated. Multiple command lines are
|
||||
supported, as is concatenation with delimiters "," and ";".
|
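A minimal usage sketch of the command format quoted above (illustrative only: this old version of the interface is removed by this commit, and the memcg id, node id, generation numbers, swappiness and page count are made-up values):

  /* drive the debugfs file described above; must run as root */
  #include <stdio.h>
  #include <stdlib.h>

  int main(void)
  {
          FILE *f = fopen("/sys/kernel/debug/lru_gen", "w");

          if (!f) {
                  perror("fopen");
                  return EXIT_FAILURE;
          }

          /* age memcg 1 on node 0 up to generation 8, swappiness 100 */
          fprintf(f, "+ 1 0 8 100\n");
          /* then evict generations <= 6, reclaiming at most 4096 pages */
          fprintf(f, "- 1 0 6 100 4096\n");

          fclose(f);
          return 0;
  }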
||||
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
||||
Change-Id: I4448e60029badbe347aa3b624f429b280cc3a3d3
|
||||
---
|
||||
include/linux/nodemask.h | 1 +
|
||||
mm/vmscan.c | 415 +++++++++++++++++++++++++++++++++++++++
|
||||
2 files changed, 416 insertions(+)
|
||||
|
||||
--- a/include/linux/nodemask.h
|
||||
+++ b/include/linux/nodemask.h
|
||||
@@ -485,6 +485,7 @@ static inline int num_node_state(enum no
|
||||
#define first_online_node 0
|
||||
#define first_memory_node 0
|
||||
#define next_online_node(nid) (MAX_NUMNODES)
|
||||
+#define next_memory_node(nid) (MAX_NUMNODES)
|
||||
#define nr_node_ids 1U
|
||||
#define nr_online_nodes 1U
|
||||
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -53,6 +53,8 @@
|
||||
#include <linux/memory.h>
|
||||
#include <linux/pagewalk.h>
|
||||
#include <linux/shmem_fs.h>
|
||||
+#include <linux/ctype.h>
|
||||
+#include <linux/debugfs.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/div64.h>
|
||||
@@ -4817,6 +4819,413 @@ unlock:
|
||||
}
|
||||
|
||||
/******************************************************************************
|
||||
+ * sysfs interface
|
||||
+ ******************************************************************************/
|
||||
+
|
||||
+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
+{
|
||||
+ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
|
||||
+}
|
||||
+
|
||||
+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
+ const char *buf, size_t len)
|
||||
+{
|
||||
+ unsigned int msecs;
|
||||
+
|
||||
+ if (kstrtouint(buf, 10, &msecs))
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
|
||||
+
|
||||
+ return len;
|
||||
+}
|
||||
+
|
||||
+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
|
||||
+ min_ttl_ms, 0644, show_min_ttl, store_min_ttl
|
||||
+);
|
||||
+
|
||||
+static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
+{
|
||||
+ return snprintf(buf, PAGE_SIZE, "%d\n", lru_gen_enabled());
|
||||
+}
|
||||
+
|
||||
+static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
+ const char *buf, size_t len)
|
||||
+{
|
||||
+ bool enable;
|
||||
+
|
||||
+ if (kstrtobool(buf, &enable))
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ lru_gen_change_state(enable, true, false);
|
||||
+
|
||||
+ return len;
|
||||
+}
|
||||
+
|
||||
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
|
||||
+ enabled, 0644, show_enable, store_enable
|
||||
+);
|
||||
+
|
||||
+static struct attribute *lru_gen_attrs[] = {
|
||||
+ &lru_gen_min_ttl_attr.attr,
|
||||
+ &lru_gen_enabled_attr.attr,
|
||||
+ NULL
|
||||
+};
|
||||
+
|
||||
+static struct attribute_group lru_gen_attr_group = {
|
||||
+ .name = "lru_gen",
|
||||
+ .attrs = lru_gen_attrs,
|
||||
+};
|
||||
+
|
||||
+/******************************************************************************
|
||||
+ * debugfs interface
|
||||
+ ******************************************************************************/
|
||||
+
|
||||
+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
|
||||
+{
|
||||
+ struct mem_cgroup *memcg;
|
||||
+ loff_t nr_to_skip = *pos;
|
||||
+
|
||||
+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
|
||||
+ if (!m->private)
|
||||
+ return ERR_PTR(-ENOMEM);
|
||||
+
|
||||
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
|
||||
+ do {
|
||||
+ int nid;
|
||||
+
|
||||
+ for_each_node_state(nid, N_MEMORY) {
|
||||
+ if (!nr_to_skip--)
|
||||
+ return get_lruvec(nid, memcg);
|
||||
+ }
|
||||
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
|
||||
+
|
||||
+ return NULL;
|
||||
+}
|
||||
+
|
||||
+static void lru_gen_seq_stop(struct seq_file *m, void *v)
|
||||
+{
|
||||
+ if (!IS_ERR_OR_NULL(v))
|
||||
+ mem_cgroup_iter_break(NULL, lruvec_memcg(v));
|
||||
+
|
||||
+ kvfree(m->private);
|
||||
+ m->private = NULL;
|
||||
+}
|
||||
+
|
||||
+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
|
||||
+{
|
||||
+ int nid = lruvec_pgdat(v)->node_id;
|
||||
+ struct mem_cgroup *memcg = lruvec_memcg(v);
|
||||
+
|
||||
+ ++*pos;
|
||||
+
|
||||
+ nid = next_memory_node(nid);
|
||||
+ if (nid == MAX_NUMNODES) {
|
||||
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
|
||||
+ if (!memcg)
|
||||
+ return NULL;
|
||||
+
|
||||
+ nid = first_memory_node;
|
||||
+ }
|
||||
+
|
||||
+ return get_lruvec(nid, memcg);
|
||||
+}
|
||||
+
|
||||
+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
|
||||
+ unsigned long max_seq, unsigned long *min_seq,
|
||||
+ unsigned long seq)
|
||||
+{
|
||||
+ int i;
|
||||
+ int type, tier;
|
||||
+ int hist = lru_hist_from_seq(seq);
|
||||
+ struct lrugen *lrugen = &lruvec->evictable;
|
||||
+
|
||||
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
|
||||
+ seq_printf(m, " %10d", tier);
|
||||
+ for (type = 0; type < ANON_AND_FILE; type++) {
|
||||
+ unsigned long n[3] = {};
|
||||
+
|
||||
+ if (seq == max_seq) {
|
||||
+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
|
||||
+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
|
||||
+
|
||||
+ seq_printf(m, " %10luR %10luT %10lu ", n[0], n[1], n[2]);
|
||||
+ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
|
||||
+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
|
||||
+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
|
||||
+ if (tier)
|
||||
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
|
||||
+
|
||||
+ seq_printf(m, " %10lur %10lue %10lup", n[0], n[1], n[2]);
|
||||
+ } else
|
||||
+ seq_puts(m, " 0 0 0 ");
|
||||
+ }
|
||||
+ seq_putc(m, '\n');
|
||||
+ }
|
||||
+
|
||||
+ seq_puts(m, " ");
|
||||
+ for (i = 0; i < NR_MM_STATS; i++) {
|
||||
+ if (seq == max_seq && NR_HIST_GENS == 1)
|
||||
+ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]),
|
||||
+ toupper(MM_STAT_CODES[i]));
|
||||
+ else if (seq != max_seq && NR_HIST_GENS > 1)
|
||||
+ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]),
|
||||
+ MM_STAT_CODES[i]);
|
||||
+ else
|
||||
+ seq_puts(m, " 0 ");
|
||||
+ }
|
||||
+ seq_putc(m, '\n');
|
||||
+}
|
||||
+
|
||||
+static int lru_gen_seq_show(struct seq_file *m, void *v)
|
||||
+{
|
||||
+ unsigned long seq;
|
||||
+ bool full = !debugfs_real_fops(m->file)->write;
|
||||
+ struct lruvec *lruvec = v;
|
||||
+ struct lrugen *lrugen = &lruvec->evictable;
|
||||
+ int nid = lruvec_pgdat(lruvec)->node_id;
|
||||
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
+ DEFINE_MAX_SEQ(lruvec);
|
||||
+ DEFINE_MIN_SEQ(lruvec);
|
||||
+
|
||||
+ if (nid == first_memory_node) {
|
||||
+ const char *path = memcg ? m->private : "";
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+ if (memcg)
|
||||
+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
|
||||
+#endif
|
||||
+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
|
||||
+ }
|
||||
+
|
||||
+ seq_printf(m, " node %5d\n", nid);
|
||||
+
|
||||
+ if (!full)
|
||||
+ seq = min_seq[0];
|
||||
+ else if (max_seq >= MAX_NR_GENS)
|
||||
+ seq = max_seq - MAX_NR_GENS + 1;
|
||||
+ else
|
||||
+ seq = 0;
|
||||
+
|
||||
+ for (; seq <= max_seq; seq++) {
|
||||
+ int gen, type, zone;
|
||||
+ unsigned int msecs;
|
||||
+
|
||||
+ gen = lru_gen_from_seq(seq);
|
||||
+ msecs = jiffies_to_msecs(jiffies - READ_ONCE(lrugen->timestamps[gen]));
|
||||
+
|
||||
+ seq_printf(m, " %10lu %10u", seq, msecs);
|
||||
+
|
||||
+ for (type = 0; type < ANON_AND_FILE; type++) {
|
||||
+ long size = 0;
|
||||
+
|
||||
+ if (seq < min_seq[type]) {
|
||||
+ seq_puts(m, " -0 ");
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
|
||||
+ size += READ_ONCE(lrugen->sizes[gen][type][zone]);
|
||||
+
|
||||
+ seq_printf(m, " %10lu ", max(size, 0L));
|
||||
+ }
|
||||
+
|
||||
+ seq_putc(m, '\n');
|
||||
+
|
||||
+ if (full)
|
||||
+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static const struct seq_operations lru_gen_seq_ops = {
|
||||
+ .start = lru_gen_seq_start,
|
||||
+ .stop = lru_gen_seq_stop,
|
||||
+ .next = lru_gen_seq_next,
|
||||
+ .show = lru_gen_seq_show,
|
||||
+};
|
||||
+
|
||||
+static int run_aging(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
|
||||
+ unsigned long seq, bool use_filter)
|
||||
+{
|
||||
+ DEFINE_MAX_SEQ(lruvec);
|
||||
+
|
||||
+ if (seq == max_seq)
|
||||
+ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, use_filter);
|
||||
+
|
||||
+ return seq > max_seq ? -EINVAL : 0;
|
||||
+}
|
||||
+
|
||||
+static int run_eviction(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
|
||||
+ unsigned long seq, unsigned long nr_to_reclaim)
|
||||
+{
|
||||
+ struct blk_plug plug;
|
||||
+ int err = -EINTR;
|
||||
+ DEFINE_MAX_SEQ(lruvec);
|
||||
+
|
||||
+ if (seq >= max_seq - 1)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ sc->nr_reclaimed = 0;
|
||||
+
|
||||
+ blk_start_plug(&plug);
|
||||
+
|
||||
+ while (!signal_pending(current)) {
|
||||
+ DEFINE_MIN_SEQ(lruvec);
|
||||
+
|
||||
+ if (seq < min_seq[!swappiness] || sc->nr_reclaimed >= nr_to_reclaim ||
|
||||
+ !evict_pages(lruvec, sc, swappiness)) {
|
||||
+ err = 0;
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ cond_resched();
|
||||
+ }
|
||||
+
|
||||
+ blk_finish_plug(&plug);
|
||||
+
|
||||
+ return err;
|
||||
+}
|
||||
+
|
||||
+static int run_cmd(char cmd, int memcg_id, int nid, struct scan_control *sc,
|
||||
+ int swappiness, unsigned long seq, unsigned long opt)
|
||||
+{
|
||||
+ struct lruvec *lruvec;
|
||||
+ int err = -EINVAL;
|
||||
+ struct mem_cgroup *memcg = NULL;
|
||||
+
|
||||
+ if (!mem_cgroup_disabled()) {
|
||||
+ rcu_read_lock();
|
||||
+ memcg = mem_cgroup_from_id(memcg_id);
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+ if (memcg && !css_tryget(&memcg->css))
|
||||
+ memcg = NULL;
|
||||
+#endif
|
||||
+ rcu_read_unlock();
|
||||
+
|
||||
+ if (!memcg)
|
||||
+ goto done;
|
||||
+ }
|
||||
+ if (memcg_id != mem_cgroup_id(memcg))
|
||||
+ goto done;
|
||||
+
|
||||
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
|
||||
+ goto done;
|
||||
+
|
||||
+ lruvec = get_lruvec(nid, memcg);
|
||||
+
|
||||
+ if (swappiness < 0)
|
||||
+ swappiness = get_swappiness(memcg);
|
||||
+ else if (swappiness > 200)
|
||||
+ goto done;
|
||||
+
|
||||
+ switch (cmd) {
|
||||
+ case '+':
|
||||
+ err = run_aging(lruvec, sc, swappiness, seq, opt);
|
||||
+ break;
|
||||
+ case '-':
|
||||
+ err = run_eviction(lruvec, sc, swappiness, seq, opt);
|
||||
+ break;
|
||||
+ }
|
||||
+done:
|
||||
+ mem_cgroup_put(memcg);
|
||||
+
|
||||
+ return err;
|
||||
+}
|
||||
+
|
||||
+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
|
||||
+ size_t len, loff_t *pos)
|
||||
+{
|
||||
+ void *buf;
|
||||
+ char *cur, *next;
|
||||
+ unsigned int flags;
|
||||
+ int err = 0;
|
||||
+ struct scan_control sc = {
|
||||
+ .may_writepage = 1,
|
||||
+ .may_unmap = 1,
|
||||
+ .may_swap = 1,
|
||||
+ .reclaim_idx = MAX_NR_ZONES - 1,
|
||||
+ .gfp_mask = GFP_KERNEL,
|
||||
+ };
|
||||
+
|
||||
+ buf = kvmalloc(len + 1, GFP_KERNEL);
|
||||
+ if (!buf)
|
||||
+ return -ENOMEM;
|
||||
+
|
||||
+ if (copy_from_user(buf, src, len)) {
|
||||
+ kvfree(buf);
|
||||
+ return -EFAULT;
|
||||
+ }
|
||||
+
|
||||
+ next = buf;
|
||||
+ next[len] = '\0';
|
||||
+
|
||||
+ sc.reclaim_state.mm_walk_args = alloc_mm_walk_args();
|
||||
+ if (!sc.reclaim_state.mm_walk_args) {
|
||||
+ kvfree(buf);
|
||||
+ return -ENOMEM;
|
||||
+ }
|
||||
+
|
||||
+ flags = memalloc_noreclaim_save();
|
||||
+ set_task_reclaim_state(current, &sc.reclaim_state);
|
||||
+
|
||||
+ while ((cur = strsep(&next, ",;\n"))) {
|
||||
+ int n;
|
||||
+ int end;
|
||||
+ char cmd;
|
||||
+ unsigned int memcg_id;
|
||||
+ unsigned int nid;
|
||||
+ unsigned long seq;
|
||||
+ unsigned int swappiness = -1;
|
||||
+ unsigned long opt = -1;
|
||||
+
|
||||
+ cur = skip_spaces(cur);
|
||||
+ if (!*cur)
|
||||
+ continue;
|
||||
+
|
||||
+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
|
||||
+ &seq, &end, &swappiness, &end, &opt, &end);
|
||||
+ if (n < 4 || cur[end]) {
|
||||
+ err = -EINVAL;
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ err = run_cmd(cmd, memcg_id, nid, &sc, swappiness, seq, opt);
|
||||
+ if (err)
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ set_task_reclaim_state(current, NULL);
|
||||
+ memalloc_noreclaim_restore(flags);
|
||||
+
|
||||
+ free_mm_walk_args(sc.reclaim_state.mm_walk_args);
|
||||
+ kvfree(buf);
|
||||
+
|
||||
+ return err ? : len;
|
||||
+}
|
||||
+
|
||||
+static int lru_gen_seq_open(struct inode *inode, struct file *file)
|
||||
+{
|
||||
+ return seq_open(file, &lru_gen_seq_ops);
|
||||
+}
|
||||
+
|
||||
+static const struct file_operations lru_gen_rw_fops = {
|
||||
+ .open = lru_gen_seq_open,
|
||||
+ .read = seq_read,
|
||||
+ .write = lru_gen_seq_write,
|
||||
+ .llseek = seq_lseek,
|
||||
+ .release = seq_release,
|
||||
+};
|
||||
+
|
||||
+static const struct file_operations lru_gen_ro_fops = {
|
||||
+ .open = lru_gen_seq_open,
|
||||
+ .read = seq_read,
|
||||
+ .llseek = seq_lseek,
|
||||
+ .release = seq_release,
|
||||
+};
|
||||
+
|
||||
+/******************************************************************************
|
||||
* initialization
|
||||
******************************************************************************/
|
||||
|
||||
@@ -4886,6 +5295,12 @@ static int __init init_lru_gen(void)
|
||||
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
|
||||
BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
|
||||
|
||||
+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
|
||||
+ pr_err("lru_gen: failed to create sysfs group\n");
|
||||
+
|
||||
+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
|
||||
+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
|
||||
+
|
||||
return 0;
|
||||
};
|
||||
late_initcall(init_lru_gen);
|
|
@@ -0,0 +1,315 @@
|
|||
From 36a18a68ea458e8f4db2ca86b00091daf32c6c74 Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Sun, 18 Sep 2022 02:00:06 -0600
|
||||
Subject: [PATCH 09/29] mm: multi-gen LRU: optimize multiple memcgs
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
When multiple memcgs are available, it is possible to use generations as a
|
||||
frame of reference to make better choices and improve overall performance
|
||||
under global memory pressure. This patch adds a basic optimization to
|
||||
select memcgs that can drop single-use unmapped clean pages first. Doing
|
||||
so reduces the chance of going into the aging path or swapping, which can
|
||||
be costly.
|
||||
|
||||
A typical example that benefits from this optimization is a server running
|
||||
mixed types of workloads, e.g., heavy anon workload in one memcg and heavy
|
||||
buffered I/O workload in the other.
|
||||
|
||||
Though this optimization can be applied to both kswapd and direct reclaim,
|
||||
it is only added to kswapd to keep the patchset manageable. Later
|
||||
improvements may cover the direct reclaim path.
|
||||
|
||||
While ensuring certain fairness to all eligible memcgs, proportional scans
|
||||
of individual memcgs also require proper backoff to avoid overshooting
|
||||
their aggregate reclaim target by too much. Otherwise it can cause high
|
||||
direct reclaim latency. The conditions for backoff are:
|
||||
|
||||
1. At low priorities, for direct reclaim, if aging fairness or direct
|
||||
reclaim latency is at risk, i.e., aging one memcg multiple times or
|
||||
swapping after the target is met.
|
||||
2. At high priorities, for global reclaim, if per-zone free pages are
|
||||
above respective watermarks.
|
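Condition 2 boils down to a per-zone free-page comparison; a schematic stand-in is sketched below (the zone numbers are fabricated, and the authoritative version is the should_abort_scan() hunk added later in this patch):

  #include <stdbool.h>
  #include <stdio.h>

  int main(void)
  {
          /* fake per-zone free pages and watermarks, for illustration only */
          const unsigned long free_pages[] = { 5120, 81920, 40960 };
          const unsigned long wmark[]      = { 4096, 65536, 32768 };
          bool above = true;

          for (int i = 0; i < 3; i++) {
                  if (free_pages[i] < wmark[i])
                          above = false;  /* some zone is still short of memory */
          }

          printf("backoff %s\n", above ? "allowed" : "not yet");
          return 0;
  }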
||||
|
||||
Server benchmark results:
|
||||
Mixed workloads:
|
||||
fio (buffered I/O): +[19, 21]%
|
||||
IOPS BW
|
||||
patch1-8: 1880k 7343MiB/s
|
||||
patch1-9: 2252k 8796MiB/s
|
||||
|
||||
memcached (anon): +[119, 123]%
|
||||
Ops/sec KB/sec
|
||||
patch1-8: 862768.65 33514.68
|
||||
patch1-9: 1911022.12 74234.54
|
||||
|
||||
Mixed workloads:
|
||||
fio (buffered I/O): +[75, 77]%
|
||||
IOPS BW
|
||||
5.19-rc1: 1279k 4996MiB/s
|
||||
patch1-9: 2252k 8796MiB/s
|
||||
|
||||
memcached (anon): +[13, 15]%
|
||||
Ops/sec KB/sec
|
||||
5.19-rc1: 1673524.04 65008.87
|
||||
patch1-9: 1911022.12 74234.54
|
||||
|
||||
Configurations:
|
||||
(changes since patch 6)
|
||||
|
||||
cat mixed.sh
|
||||
modprobe brd rd_nr=2 rd_size=56623104
|
||||
|
||||
swapoff -a
|
||||
mkswap /dev/ram0
|
||||
swapon /dev/ram0
|
||||
|
||||
mkfs.ext4 /dev/ram1
|
||||
mount -t ext4 /dev/ram1 /mnt
|
||||
|
||||
memtier_benchmark -S /var/run/memcached/memcached.sock \
|
||||
-P memcache_binary -n allkeys --key-minimum=1 \
|
||||
--key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \
|
||||
--ratio 1:0 --pipeline 8 -d 2000
|
||||
|
||||
fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
|
||||
--buffered=1 --ioengine=io_uring --iodepth=128 \
|
||||
--iodepth_batch_submit=32 --iodepth_batch_complete=32 \
|
||||
--rw=randread --random_distribution=random --norandommap \
|
||||
--time_based --ramp_time=10m --runtime=90m --group_reporting &
|
||||
pid=$!
|
||||
|
||||
sleep 200
|
||||
|
||||
memtier_benchmark -S /var/run/memcached/memcached.sock \
|
||||
-P memcache_binary -n allkeys --key-minimum=1 \
|
||||
--key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \
|
||||
--ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
|
||||
|
||||
kill -INT $pid
|
||||
wait
|
||||
|
||||
Client benchmark results:
|
||||
no change (CONFIG_MEMCG=n)
|
||||
|
||||
Link: https://lkml.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Acked-by: Brian Geffon <bgeffon@google.com>
|
||||
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
||||
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
Acked-by: Steven Barrett <steven@liquorix.net>
|
||||
Acked-by: Suleiman Souhlal <suleiman@google.com>
|
||||
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
|
||||
Tested-by: Donald Carr <d@chaos-reins.com>
|
||||
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
|
||||
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
||||
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
|
||||
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
|
||||
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
|
||||
Cc: Andi Kleen <ak@linux.intel.com>
|
||||
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
|
||||
Cc: Barry Song <baohua@kernel.org>
|
||||
Cc: Catalin Marinas <catalin.marinas@arm.com>
|
||||
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Cc: Hillf Danton <hdanton@sina.com>
|
||||
Cc: Jens Axboe <axboe@kernel.dk>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
Cc: Matthew Wilcox <willy@infradead.org>
|
||||
Cc: Mel Gorman <mgorman@suse.de>
|
||||
Cc: Miaohe Lin <linmiaohe@huawei.com>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@linux.ibm.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
|
||||
Cc: Tejun Heo <tj@kernel.org>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: Will Deacon <will@kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/vmscan.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++-----
|
||||
1 file changed, 96 insertions(+), 9 deletions(-)
|
||||
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -127,6 +127,12 @@ struct scan_control {
|
||||
/* Always discard instead of demoting to lower tier memory */
|
||||
unsigned int no_demotion:1;
|
||||
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+ /* help kswapd make better choices among multiple memcgs */
|
||||
+ unsigned int memcgs_need_aging:1;
|
||||
+ unsigned long last_reclaimed;
|
||||
+#endif
|
||||
+
|
||||
/* Allocation order */
|
||||
s8 order;
|
||||
|
||||
@@ -4202,6 +4208,19 @@ static void lru_gen_age_node(struct pgli
|
||||
|
||||
VM_WARN_ON_ONCE(!current_is_kswapd());
|
||||
|
||||
+ sc->last_reclaimed = sc->nr_reclaimed;
|
||||
+
|
||||
+ /*
|
||||
+ * To reduce the chance of going into the aging path, which can be
|
||||
+ * costly, optimistically skip it if the flag below was cleared in the
|
||||
+ * eviction path. This improves the overall performance when multiple
|
||||
+ * memcgs are available.
|
||||
+ */
|
||||
+ if (!sc->memcgs_need_aging) {
|
||||
+ sc->memcgs_need_aging = true;
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
set_mm_walk(pgdat);
|
||||
|
||||
memcg = mem_cgroup_iter(NULL, NULL, NULL);
|
||||
@@ -4613,7 +4632,8 @@ static int isolate_pages(struct lruvec *
|
||||
return scanned;
|
||||
}
|
||||
|
||||
-static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
|
||||
+static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
|
||||
+ bool *need_swapping)
|
||||
{
|
||||
int type;
|
||||
int scanned;
|
||||
@@ -4676,6 +4696,9 @@ static int evict_pages(struct lruvec *lr
|
||||
|
||||
sc->nr_reclaimed += reclaimed;
|
||||
|
||||
+ if (need_swapping && type == LRU_GEN_ANON)
|
||||
+ *need_swapping = true;
|
||||
+
|
||||
return scanned;
|
||||
}
|
||||
|
||||
@@ -4685,9 +4708,8 @@ static int evict_pages(struct lruvec *lr
|
||||
* reclaim.
|
||||
*/
|
||||
static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
|
||||
- bool can_swap)
|
||||
+ bool can_swap, bool *need_aging)
|
||||
{
|
||||
- bool need_aging;
|
||||
unsigned long nr_to_scan;
|
||||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
DEFINE_MAX_SEQ(lruvec);
|
||||
@@ -4697,8 +4719,8 @@ static unsigned long get_nr_to_scan(stru
|
||||
(mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
|
||||
return 0;
|
||||
|
||||
- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
|
||||
- if (!need_aging)
|
||||
+ *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
|
||||
+ if (!*need_aging)
|
||||
return nr_to_scan;
|
||||
|
||||
/* skip the aging path at the default priority */
|
||||
@@ -4715,10 +4737,68 @@ done:
|
||||
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
|
||||
}
|
||||
|
||||
+static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
|
||||
+ struct scan_control *sc, bool need_swapping)
|
||||
+{
|
||||
+ int i;
|
||||
+ DEFINE_MAX_SEQ(lruvec);
|
||||
+
|
||||
+ if (!current_is_kswapd()) {
|
||||
+ /* age each memcg once to ensure fairness */
|
||||
+ if (max_seq - seq > 1)
|
||||
+ return true;
|
||||
+
|
||||
+ /* over-swapping can increase allocation latency */
|
||||
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
|
||||
+ return true;
|
||||
+
|
||||
+ /* give this thread a chance to exit and free its memory */
|
||||
+ if (fatal_signal_pending(current)) {
|
||||
+ sc->nr_reclaimed += MIN_LRU_BATCH;
|
||||
+ return true;
|
||||
+ }
|
||||
+
|
||||
+ if (cgroup_reclaim(sc))
|
||||
+ return false;
|
||||
+ } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
|
||||
+ return false;
|
||||
+
|
||||
+ /* keep scanning at low priorities to ensure fairness */
|
||||
+ if (sc->priority > DEF_PRIORITY - 2)
|
||||
+ return false;
|
||||
+
|
||||
+ /*
|
||||
+ * A minimum amount of work was done under global memory pressure. For
|
||||
+ * kswapd, it may be overshooting. For direct reclaim, the target isn't
|
||||
+ * met, and yet the allocation may still succeed, since kswapd may have
|
||||
+ * caught up. In either case, it's better to stop now, and restart if
|
||||
+ * necessary.
|
||||
+ */
|
||||
+ for (i = 0; i <= sc->reclaim_idx; i++) {
|
||||
+ unsigned long wmark;
|
||||
+ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
|
||||
+
|
||||
+ if (!managed_zone(zone))
|
||||
+ continue;
|
||||
+
|
||||
+ wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
|
||||
+ if (wmark > zone_page_state(zone, NR_FREE_PAGES))
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ sc->nr_reclaimed += MIN_LRU_BATCH;
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
||||
{
|
||||
struct blk_plug plug;
|
||||
+ bool need_aging = false;
|
||||
+ bool need_swapping = false;
|
||||
unsigned long scanned = 0;
|
||||
+ unsigned long reclaimed = sc->nr_reclaimed;
|
||||
+ DEFINE_MAX_SEQ(lruvec);
|
||||
|
||||
lru_add_drain();
|
||||
|
||||
@@ -4738,21 +4818,28 @@ static void lru_gen_shrink_lruvec(struct
|
||||
else
|
||||
swappiness = 0;
|
||||
|
||||
- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
|
||||
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
|
||||
if (!nr_to_scan)
|
||||
- break;
|
||||
+ goto done;
|
||||
|
||||
- delta = evict_pages(lruvec, sc, swappiness);
|
||||
+ delta = evict_pages(lruvec, sc, swappiness, &need_swapping);
|
||||
if (!delta)
|
||||
- break;
|
||||
+ goto done;
|
||||
|
||||
scanned += delta;
|
||||
if (scanned >= nr_to_scan)
|
||||
break;
|
||||
|
||||
+ if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
|
||||
+ break;
|
||||
+
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
+ /* see the comment in lru_gen_age_node() */
|
||||
+ if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
|
||||
+ sc->memcgs_need_aging = false;
|
||||
+done:
|
||||
clear_mm_walk();
|
||||
|
||||
blk_finish_plug(&plug);
|
|
@@ -1,80 +0,0 @@
|
|||
From 3008095eb835d207dd7e5b60899aad17f32aa9f7 Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Mon, 25 Jan 2021 21:47:24 -0700
|
||||
Subject: [PATCH 09/10] mm: multigenerational lru: Kconfig
|
||||
|
||||
Add configuration options for the multigenerational lru.
|
||||
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
||||
Change-Id: Ic74ea07f8fb5f56e6904a1b80c3c286bc2911635
|
||||
---
|
||||
mm/Kconfig | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 59 insertions(+)
|
||||
|
||||
--- a/mm/Kconfig
|
||||
+++ b/mm/Kconfig
|
||||
@@ -899,4 +899,63 @@ config SECRETMEM
|
||||
|
||||
source "mm/damon/Kconfig"
|
||||
|
||||
+# the multigenerational lru {
|
||||
+config LRU_GEN
|
||||
+ bool "Multigenerational LRU"
|
||||
+ depends on MMU
|
||||
+ # the following options may leave not enough spare bits in page->flags
|
||||
+ depends on !MAXSMP && (64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP)
|
||||
+ help
|
||||
+ A high performance LRU implementation to heavily overcommit workloads
|
||||
+ that are not IO bound. See Documentation/vm/multigen_lru.rst for
|
||||
+ details.
|
||||
+
|
||||
+ Warning: do not enable this option unless you plan to use it because
|
||||
+ it introduces a small per-process and per-memcg and per-node memory
|
||||
+ overhead.
|
||||
+
|
||||
+config LRU_GEN_ENABLED
|
||||
+ bool "Turn on by default"
|
||||
+ depends on LRU_GEN
|
||||
+ help
|
||||
+ The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option
|
||||
+ changes it to 1.
|
||||
+
|
||||
+ Warning: the default value is the fast path. See
|
||||
+ Documentation/static-keys.txt for details.
|
||||
+
|
||||
+config LRU_GEN_STATS
|
||||
+ bool "Full stats for debugging"
|
||||
+ depends on LRU_GEN
|
||||
+ help
|
||||
+ This option keeps full stats for each generation, which can be read
|
||||
+ from /sys/kernel/debug/lru_gen_full.
|
||||
+
|
||||
+ Warning: do not enable this option unless you plan to use it because
|
||||
+ it introduces an additional small per-process and per-memcg and
|
||||
+ per-node memory overhead.
|
||||
+
|
||||
+config NR_LRU_GENS
|
||||
+ int "Max number of generations"
|
||||
+ depends on LRU_GEN
|
||||
+ range 4 31
|
||||
+ default 7
|
||||
+ help
|
||||
+ This will use order_base_2(N+1) spare bits from page flags.
|
||||
+
|
||||
+ Warning: do not use numbers larger than necessary because each
|
||||
+ generation introduces a small per-node and per-memcg memory overhead.
|
||||
+
|
||||
+config TIERS_PER_GEN
|
||||
+ int "Number of tiers per generation"
|
||||
+ depends on LRU_GEN
|
||||
+ range 2 5
|
||||
+ default 4
|
||||
+ help
|
||||
+ This will use N-2 spare bits from page flags.
|
||||
+
|
||||
+ Larger values generally offer better protection to active pages under
|
||||
+ heavy buffered I/O workloads.
|
||||
+# }
|
||||
+
|
||||
endmenu
|
|
@@ -0,0 +1,498 @@
|
|||
From 640db3a029dca909af47157ca18f52b29d34a1b9 Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Sun, 18 Sep 2022 02:00:07 -0600
|
||||
Subject: [PATCH 10/29] mm: multi-gen LRU: kill switch
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that
|
||||
can be disabled include:
|
||||
0x0001: the multi-gen LRU core
|
||||
0x0002: walking page table, when arch_has_hw_pte_young() returns
|
||||
true
|
||||
0x0004: clearing the accessed bit in non-leaf PMD entries, when
|
||||
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y
|
||||
[yYnN]: apply to all the components above
|
||||
E.g.,
|
||||
echo y >/sys/kernel/mm/lru_gen/enabled
|
||||
cat /sys/kernel/mm/lru_gen/enabled
|
||||
0x0007
|
||||
echo 5 >/sys/kernel/mm/lru_gen/enabled
|
||||
cat /sys/kernel/mm/lru_gen/enabled
|
||||
0x0005
|
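To make the encoding concrete: each component is one bit, so 0x0005 above keeps the core and the non-leaf PMD handling enabled while switching the page table walks off. A small decoding sketch (the macro names mirror the capability list in this message; the rest is illustrative):

  #include <stdio.h>

  #define LRU_GEN_CORE            0x0001
  #define LRU_GEN_MM_WALK         0x0002
  #define LRU_GEN_NONLEAF_YOUNG   0x0004

  int main(void)
  {
          unsigned int enabled = 0x0005;  /* as in the "echo 5" example above */

          printf("core: %s\n", (enabled & LRU_GEN_CORE) ? "on" : "off");
          printf("page table walks: %s\n", (enabled & LRU_GEN_MM_WALK) ? "on" : "off");
          printf("non-leaf PMD young: %s\n", (enabled & LRU_GEN_NONLEAF_YOUNG) ? "on" : "off");
          return 0;
  }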
||||
|
||||
NB: the page table walks happen on the scale of seconds under heavy memory
|
||||
pressure, in which case the mmap_lock contention is a lesser concern,
|
||||
compared with the LRU lock contention and the I/O congestion. So far the
|
||||
only well-known case of the mmap_lock contention happens on Android, due
|
||||
to Scudo [1] which allocates several thousand VMAs for merely a few
|
||||
hundred MBs. The SPF and the Maple Tree also have provided their own
|
||||
assessments [2][3]. However, if walking page tables does worsen the
|
||||
mmap_lock contention, the kill switch can be used to disable it. In this
|
||||
case the multi-gen LRU will suffer a minor performance degradation, as
|
||||
shown previously.
|
||||
|
||||
Clearing the accessed bit in non-leaf PMD entries can also be disabled,
|
||||
since this behavior was not tested on x86 varieties other than Intel and
|
||||
AMD.
|
||||
|
||||
[1] https://source.android.com/devices/tech/debug/scudo
|
||||
[2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/
|
||||
[3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/
|
||||
|
||||
Link: https://lkml.kernel.org/r/20220918080010.2920238-11-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Acked-by: Brian Geffon <bgeffon@google.com>
|
||||
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
||||
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
Acked-by: Steven Barrett <steven@liquorix.net>
|
||||
Acked-by: Suleiman Souhlal <suleiman@google.com>
|
||||
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
|
||||
Tested-by: Donald Carr <d@chaos-reins.com>
|
||||
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
|
||||
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
||||
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
|
||||
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
|
||||
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
|
||||
Cc: Andi Kleen <ak@linux.intel.com>
|
||||
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
|
||||
Cc: Barry Song <baohua@kernel.org>
|
||||
Cc: Catalin Marinas <catalin.marinas@arm.com>
|
||||
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Cc: Hillf Danton <hdanton@sina.com>
|
||||
Cc: Jens Axboe <axboe@kernel.dk>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
Cc: Matthew Wilcox <willy@infradead.org>
|
||||
Cc: Mel Gorman <mgorman@suse.de>
|
||||
Cc: Miaohe Lin <linmiaohe@huawei.com>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@linux.ibm.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
|
||||
Cc: Tejun Heo <tj@kernel.org>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: Will Deacon <will@kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/cgroup.h | 15 ++-
|
||||
include/linux/mm_inline.h | 15 ++-
|
||||
include/linux/mmzone.h | 9 ++
|
||||
kernel/cgroup/cgroup-internal.h | 1 -
|
||||
mm/Kconfig | 6 +
|
||||
mm/vmscan.c | 228 +++++++++++++++++++++++++++++++-
|
||||
6 files changed, 265 insertions(+), 9 deletions(-)
|
||||
|
||||
--- a/include/linux/cgroup.h
|
||||
+++ b/include/linux/cgroup.h
|
||||
@@ -433,6 +433,18 @@ static inline void cgroup_put(struct cgr
|
||||
css_put(&cgrp->self);
|
||||
}
|
||||
|
||||
+extern struct mutex cgroup_mutex;
|
||||
+
|
||||
+static inline void cgroup_lock(void)
|
||||
+{
|
||||
+ mutex_lock(&cgroup_mutex);
|
||||
+}
|
||||
+
|
||||
+static inline void cgroup_unlock(void)
|
||||
+{
|
||||
+ mutex_unlock(&cgroup_mutex);
|
||||
+}
|
||||
+
|
||||
/**
|
||||
* task_css_set_check - obtain a task's css_set with extra access conditions
|
||||
* @task: the task to obtain css_set for
|
||||
@@ -447,7 +459,6 @@ static inline void cgroup_put(struct cgr
|
||||
* as locks used during the cgroup_subsys::attach() methods.
|
||||
*/
|
||||
#ifdef CONFIG_PROVE_RCU
|
||||
-extern struct mutex cgroup_mutex;
|
||||
extern spinlock_t css_set_lock;
|
||||
#define task_css_set_check(task, __c) \
|
||||
rcu_dereference_check((task)->cgroups, \
|
||||
@@ -708,6 +719,8 @@ struct cgroup;
|
||||
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
|
||||
static inline void css_get(struct cgroup_subsys_state *css) {}
|
||||
static inline void css_put(struct cgroup_subsys_state *css) {}
|
||||
+static inline void cgroup_lock(void) {}
|
||||
+static inline void cgroup_unlock(void) {}
|
||||
static inline int cgroup_attach_task_all(struct task_struct *from,
|
||||
struct task_struct *t) { return 0; }
|
||||
static inline int cgroupstats_build(struct cgroupstats *stats,
|
||||
--- a/include/linux/mm_inline.h
|
||||
+++ b/include/linux/mm_inline.h
|
||||
@@ -91,10 +91,21 @@ static __always_inline enum lru_list pag
|
||||
|
||||
#ifdef CONFIG_LRU_GEN
|
||||
|
||||
+#ifdef CONFIG_LRU_GEN_ENABLED
|
||||
static inline bool lru_gen_enabled(void)
|
||||
{
|
||||
- return true;
|
||||
+ DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
|
||||
+
|
||||
+ return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
|
||||
+}
|
||||
+#else
|
||||
+static inline bool lru_gen_enabled(void)
|
||||
+{
|
||||
+ DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
|
||||
+
|
||||
+ return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
|
||||
}
|
||||
+#endif
|
||||
|
||||
static inline bool lru_gen_in_fault(void)
|
||||
{
|
||||
@@ -207,7 +218,7 @@ static inline bool lru_gen_add_page(stru
|
||||
|
||||
VM_WARN_ON_ONCE_PAGE(gen != -1, page);
|
||||
|
||||
- if (PageUnevictable(page))
|
||||
+ if (PageUnevictable(page) || !lrugen->enabled)
|
||||
return false;
|
||||
/*
|
||||
* There are three common cases for this page:
|
||||
--- a/include/linux/mmzone.h
|
||||
+++ b/include/linux/mmzone.h
|
||||
@@ -364,6 +364,13 @@ enum {
|
||||
LRU_GEN_FILE,
|
||||
};
|
||||
|
||||
+enum {
|
||||
+ LRU_GEN_CORE,
|
||||
+ LRU_GEN_MM_WALK,
|
||||
+ LRU_GEN_NONLEAF_YOUNG,
|
||||
+ NR_LRU_GEN_CAPS
|
||||
+};
|
||||
+
|
||||
#define MIN_LRU_BATCH BITS_PER_LONG
|
||||
#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)
|
||||
|
||||
@@ -405,6 +412,8 @@ struct lru_gen_struct {
|
||||
/* can be modified without holding the LRU lock */
|
||||
atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
|
||||
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
|
||||
+ /* whether the multi-gen LRU is enabled */
|
||||
+ bool enabled;
|
||||
};
|
||||
|
||||
enum {
|
||||
--- a/kernel/cgroup/cgroup-internal.h
|
||||
+++ b/kernel/cgroup/cgroup-internal.h
|
||||
@@ -165,7 +165,6 @@ struct cgroup_mgctx {
|
||||
#define DEFINE_CGROUP_MGCTX(name) \
|
||||
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
|
||||
|
||||
-extern struct mutex cgroup_mutex;
|
||||
extern spinlock_t css_set_lock;
|
||||
extern struct cgroup_subsys *cgroup_subsys[];
|
||||
extern struct list_head cgroup_roots;
|
||||
--- a/mm/Kconfig
|
||||
+++ b/mm/Kconfig
|
||||
@@ -906,6 +906,12 @@ config LRU_GEN
|
||||
help
|
||||
A high performance LRU implementation to overcommit memory.
|
||||
|
||||
+config LRU_GEN_ENABLED
|
||||
+ bool "Enable by default"
|
||||
+ depends on LRU_GEN
|
||||
+ help
|
||||
+ This option enables the multi-gen LRU by default.
|
||||
+
|
||||
config LRU_GEN_STATS
|
||||
bool "Full stats for debugging"
|
||||
depends on LRU_GEN
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -52,6 +52,7 @@
|
||||
#include <linux/psi.h>
|
||||
#include <linux/pagewalk.h>
|
||||
#include <linux/shmem_fs.h>
|
||||
+#include <linux/ctype.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/div64.h>
|
||||
@@ -2841,6 +2842,14 @@ static bool can_age_anon_pages(struct pg
|
||||
|
||||
#ifdef CONFIG_LRU_GEN
|
||||
|
||||
+#ifdef CONFIG_LRU_GEN_ENABLED
|
||||
+DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
|
||||
+#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap])
|
||||
+#else
|
||||
+DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
|
||||
+#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
|
||||
+#endif
|
||||
+
|
||||
/******************************************************************************
|
||||
* shorthand helpers
|
||||
******************************************************************************/
|
||||
@@ -3717,7 +3726,8 @@ static void walk_pmd_range_locked(pud_t
|
||||
goto next;
|
||||
|
||||
if (!pmd_trans_huge(pmd[i])) {
|
||||
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
|
||||
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
|
||||
+ get_cap(LRU_GEN_NONLEAF_YOUNG))
|
||||
pmdp_test_and_clear_young(vma, addr, pmd + i);
|
||||
goto next;
|
||||
}
|
||||
@@ -3815,10 +3825,12 @@ restart:
|
||||
walk->mm_stats[MM_NONLEAF_TOTAL]++;
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
|
||||
- if (!pmd_young(val))
|
||||
- continue;
|
||||
+ if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
|
||||
+ if (!pmd_young(val))
|
||||
+ continue;
|
||||
|
||||
- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
|
||||
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
|
||||
+ }
|
||||
#endif
|
||||
if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
|
||||
continue;
|
||||
@@ -4080,7 +4092,7 @@ static bool try_to_inc_max_seq(struct lr
|
||||
* handful of PTEs. Spreading the work out over a period of time usually
|
||||
* is less efficient, but it avoids bursty page faults.
|
||||
*/
|
||||
- if (!arch_has_hw_pte_young()) {
|
||||
+ if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
|
||||
success = iterate_mm_list_nowalk(lruvec, max_seq);
|
||||
goto done;
|
||||
}
|
||||
@@ -4846,6 +4858,208 @@ done:
|
||||
}
|
||||
|
||||
/******************************************************************************
|
||||
+ * state change
|
||||
+ ******************************************************************************/
|
||||
+
|
||||
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
|
||||
+{
|
||||
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+
|
||||
+ if (lrugen->enabled) {
|
||||
+ enum lru_list lru;
|
||||
+
|
||||
+ for_each_evictable_lru(lru) {
|
||||
+ if (!list_empty(&lruvec->lists[lru]))
|
||||
+ return false;
|
||||
+ }
|
||||
+ } else {
|
||||
+ int gen, type, zone;
|
||||
+
|
||||
+ for_each_gen_type_zone(gen, type, zone) {
|
||||
+ if (!list_empty(&lrugen->lists[gen][type][zone]))
|
||||
+ return false;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static bool fill_evictable(struct lruvec *lruvec)
|
||||
+{
|
||||
+ enum lru_list lru;
|
||||
+ int remaining = MAX_LRU_BATCH;
|
||||
+
|
||||
+ for_each_evictable_lru(lru) {
|
||||
+ int type = is_file_lru(lru);
|
||||
+ bool active = is_active_lru(lru);
|
||||
+ struct list_head *head = &lruvec->lists[lru];
|
||||
+
|
||||
+ while (!list_empty(head)) {
|
||||
+ bool success;
|
||||
+ struct page *page = lru_to_page(head);
|
||||
+
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageActive(page) != active, page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(page_lru_gen(page) != -1, page);
|
||||
+
|
||||
+ del_page_from_lru_list(page, lruvec);
|
||||
+ success = lru_gen_add_page(lruvec, page, false);
|
||||
+ VM_WARN_ON_ONCE(!success);
|
||||
+
|
||||
+ if (!--remaining)
|
||||
+ return false;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static bool drain_evictable(struct lruvec *lruvec)
|
||||
+{
|
||||
+ int gen, type, zone;
|
||||
+ int remaining = MAX_LRU_BATCH;
|
||||
+
|
||||
+ for_each_gen_type_zone(gen, type, zone) {
|
||||
+ struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
|
||||
+
|
||||
+ while (!list_empty(head)) {
|
||||
+ bool success;
|
||||
+ struct page *page = lru_to_page(head);
|
||||
+
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);
|
||||
+
|
||||
+ success = lru_gen_del_page(lruvec, page, false);
|
||||
+ VM_WARN_ON_ONCE(!success);
|
||||
+ add_page_to_lru_list(page, lruvec);
|
||||
+
|
||||
+ if (!--remaining)
|
||||
+ return false;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static void lru_gen_change_state(bool enabled)
|
||||
+{
|
||||
+ static DEFINE_MUTEX(state_mutex);
|
||||
+
|
||||
+ struct mem_cgroup *memcg;
|
||||
+
|
||||
+ cgroup_lock();
|
||||
+ cpus_read_lock();
|
||||
+ get_online_mems();
|
||||
+ mutex_lock(&state_mutex);
|
||||
+
|
||||
+ if (enabled == lru_gen_enabled())
|
||||
+ goto unlock;
|
||||
+
|
||||
+ if (enabled)
|
||||
+ static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
|
||||
+ else
|
||||
+ static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
|
||||
+
|
||||
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
|
||||
+ do {
|
||||
+ int nid;
|
||||
+
|
||||
+ for_each_node(nid) {
|
||||
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
|
||||
+
|
||||
+ if (!lruvec)
|
||||
+ continue;
|
||||
+
|
||||
+ spin_lock_irq(&lruvec->lru_lock);
|
||||
+
|
||||
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
|
||||
+ VM_WARN_ON_ONCE(!state_is_valid(lruvec));
|
||||
+
|
||||
+ lruvec->lrugen.enabled = enabled;
|
||||
+
|
||||
+ while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
|
||||
+ spin_unlock_irq(&lruvec->lru_lock);
|
||||
+ cond_resched();
|
||||
+ spin_lock_irq(&lruvec->lru_lock);
|
||||
+ }
|
||||
+
|
||||
+ spin_unlock_irq(&lruvec->lru_lock);
|
||||
+ }
|
||||
+
|
||||
+ cond_resched();
|
||||
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
|
||||
+unlock:
|
||||
+ mutex_unlock(&state_mutex);
|
||||
+ put_online_mems();
|
||||
+ cpus_read_unlock();
|
||||
+ cgroup_unlock();
|
||||
+}
|
||||
+
|
||||
+/******************************************************************************
|
||||
+ * sysfs interface
|
||||
+ ******************************************************************************/
|
||||
+
|
||||
+static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
+{
|
||||
+ unsigned int caps = 0;
|
||||
+
|
||||
+ if (get_cap(LRU_GEN_CORE))
|
||||
+ caps |= BIT(LRU_GEN_CORE);
|
||||
+
|
||||
+ if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
|
||||
+ caps |= BIT(LRU_GEN_MM_WALK);
|
||||
+
|
||||
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
|
||||
+ caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
|
||||
+
|
||||
+ return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
|
||||
+}
|
||||
+
|
||||
+static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
+ const char *buf, size_t len)
|
||||
+{
|
||||
+ int i;
|
||||
+ unsigned int caps;
|
||||
+
|
||||
+ if (tolower(*buf) == 'n')
|
||||
+ caps = 0;
|
||||
+ else if (tolower(*buf) == 'y')
|
||||
+ caps = -1;
|
||||
+ else if (kstrtouint(buf, 0, &caps))
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
|
||||
+ bool enabled = caps & BIT(i);
|
||||
+
|
||||
+ if (i == LRU_GEN_CORE)
|
||||
+ lru_gen_change_state(enabled);
|
||||
+ else if (enabled)
|
||||
+ static_branch_enable(&lru_gen_caps[i]);
|
||||
+ else
|
||||
+ static_branch_disable(&lru_gen_caps[i]);
|
||||
+ }
|
||||
+
|
||||
+ return len;
|
||||
+}
|
||||
+
|
||||
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
|
||||
+ enabled, 0644, show_enabled, store_enabled
|
||||
+);
|
||||
+
|
||||
+static struct attribute *lru_gen_attrs[] = {
|
||||
+ &lru_gen_enabled_attr.attr,
|
||||
+ NULL
|
||||
+};
|
||||
+
|
||||
+static struct attribute_group lru_gen_attr_group = {
|
||||
+ .name = "lru_gen",
|
||||
+ .attrs = lru_gen_attrs,
|
||||
+};
|
||||
+
|
||||
+/******************************************************************************
|
||||
* initialization
|
||||
******************************************************************************/
|
||||
|
||||
@@ -4855,6 +5069,7 @@ void lru_gen_init_lruvec(struct lruvec *
|
||||
struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
|
||||
lrugen->max_seq = MIN_NR_GENS + 1;
|
||||
+ lrugen->enabled = lru_gen_enabled();
|
||||
|
||||
for_each_gen_type_zone(gen, type, zone)
|
||||
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
|
||||
@@ -4894,6 +5109,9 @@ static int __init init_lru_gen(void)
|
||||
BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
|
||||
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
|
||||
|
||||
+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
|
||||
+ pr_err("lru_gen: failed to create sysfs group\n");
|
||||
+
|
||||
return 0;
|
||||
};
|
||||
late_initcall(init_lru_gen);
|
|
@@ -1,161 +0,0 @@
|
|||
From f59c618ed70a1e48accc4cad91a200966f2569c9 Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Tue, 2 Feb 2021 01:27:45 -0700
|
||||
Subject: [PATCH 10/10] mm: multigenerational lru: documentation
|
||||
|
||||
Add Documentation/vm/multigen_lru.rst.
|
||||
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
||||
Change-Id: I1902178bcbb5adfa0a748c4d284a6456059bdd7e
|
||||
---
|
||||
Documentation/vm/index.rst | 1 +
|
||||
Documentation/vm/multigen_lru.rst | 132 ++++++++++++++++++++++++++++++
|
||||
2 files changed, 133 insertions(+)
|
||||
create mode 100644 Documentation/vm/multigen_lru.rst
|
||||
|
||||
--- a/Documentation/vm/index.rst
|
||||
+++ b/Documentation/vm/index.rst
|
||||
@@ -17,6 +17,7 @@ various features of the Linux memory man
|
||||
|
||||
swap_numa
|
||||
zswap
|
||||
+ multigen_lru
|
||||
|
||||
Kernel developers MM documentation
|
||||
==================================
|
||||
--- /dev/null
|
||||
+++ b/Documentation/vm/multigen_lru.rst
|
||||
@@ -0,0 +1,132 @@
|
||||
+.. SPDX-License-Identifier: GPL-2.0
|
||||
+
|
||||
+=====================
|
||||
+Multigenerational LRU
|
||||
+=====================
|
||||
+
|
||||
+Quick Start
|
||||
+===========
|
||||
+Build Configurations
|
||||
+--------------------
|
||||
+:Required: Set ``CONFIG_LRU_GEN=y``.
|
||||
+
|
||||
+:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by
|
||||
+ default.
|
||||
+
|
||||
+Runtime Configurations
|
||||
+----------------------
|
||||
+:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enable`` if the
|
||||
+ feature was not turned on by default.
|
||||
+
|
||||
+:Optional: Write ``N`` to ``/sys/kernel/mm/lru_gen/min_ttl_ms`` to
|
||||
+ protect the working set of ``N`` milliseconds. The OOM killer is
|
||||
+ invoked if this working set cannot be kept in memory.
|
||||
+
|
||||
+:Optional: Read ``/sys/kernel/debug/lru_gen`` to confirm the feature
|
||||
+ is turned on. This file has the following output:
|
||||
+
|
||||
+::
|
||||
+
|
||||
+ memcg memcg_id memcg_path
|
||||
+ node node_id
|
||||
+ min_gen birth_time anon_size file_size
|
||||
+ ...
|
||||
+ max_gen birth_time anon_size file_size
|
||||
+
|
||||
+``min_gen`` is the oldest generation number and ``max_gen`` is the
|
||||
+youngest generation number. ``birth_time`` is in milliseconds.
|
||||
+``anon_size`` and ``file_size`` are in pages.
|
||||
+
|
||||
+Phones/Laptops/Workstations
|
||||
+---------------------------
|
||||
+No additional configurations required.
|
||||
+
|
||||
+Servers/Data Centers
|
||||
+--------------------
|
||||
+:To support more generations: Change ``CONFIG_NR_LRU_GENS`` to a
|
||||
+ larger number.
|
||||
+
|
||||
+:To support more tiers: Change ``CONFIG_TIERS_PER_GEN`` to a larger
|
||||
+ number.
|
||||
+
|
||||
+:To support full stats: Set ``CONFIG_LRU_GEN_STATS=y``.
|
||||
+
|
||||
+:Working set estimation: Write ``+ memcg_id node_id max_gen
|
||||
+ [swappiness] [use_bloom_filter]`` to ``/sys/kernel/debug/lru_gen`` to
|
||||
+ invoke the aging, which scans PTEs for accessed pages and then
|
||||
+ creates the next generation ``max_gen+1``. A swap file and a non-zero
|
||||
+ ``swappiness``, which overrides ``vm.swappiness``, are required to
|
||||
+ scan PTEs mapping anon pages. Set ``use_bloom_filter`` to 0 to
|
||||
+ override the default behavior which only scans PTE tables found
|
||||
+ populated.
|
||||
+
|
||||
+:Proactive reclaim: Write ``- memcg_id node_id min_gen [swappiness]
|
||||
+ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to invoke the
|
||||
+ eviction, which evicts generations less than or equal to ``min_gen``.
|
||||
+ ``min_gen`` should be less than ``max_gen-1`` as ``max_gen`` and
|
||||
+ ``max_gen-1`` are not fully aged and therefore cannot be evicted.
|
||||
+ Use ``nr_to_reclaim`` to limit the number of pages to evict. Multiple
|
||||
+ command lines are supported, so does concatenation with delimiters
|
||||
+ ``,`` and ``;``.
|
||||
+
|
||||
+Framework
|
||||
+=========
|
||||
+For each ``lruvec``, evictable pages are divided into multiple
|
||||
+generations. The youngest generation number is stored in
|
||||
+``lrugen->max_seq`` for both anon and file types as they are aged on
|
||||
+an equal footing. The oldest generation numbers are stored in
|
||||
+``lrugen->min_seq[]`` separately for anon and file types as clean
|
||||
+file pages can be evicted regardless of swap and writeback
|
||||
+constraints. These three variables are monotonically increasing.
|
||||
+Generation numbers are truncated into
|
||||
+``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into
|
||||
+``page->flags``. The sliding window technique is used to prevent
|
||||
+truncated generation numbers from overlapping. Each truncated
|
||||
+generation number is an index to an array of per-type and per-zone
|
||||
+lists ``lrugen->lists``.
|
||||
+
|
||||
+Each generation is divided into multiple tiers. Tiers represent
|
||||
+different ranges of numbers of accesses from file descriptors only.
|
||||
+Pages accessed ``N`` times via file descriptors belong to tier
|
||||
+``order_base_2(N)``. Each generation contains at most
|
||||
+``CONFIG_TIERS_PER_GEN`` tiers, and they require additional
|
||||
+``CONFIG_TIERS_PER_GEN-2`` bits in ``page->flags``. In contrast to
|
||||
+moving between generations which requires list operations, moving
|
||||
+between tiers only involves operations on ``page->flags`` and
|
||||
+therefore has a negligible cost. A feedback loop modeled after the PID
|
||||
+controller monitors refaulted % across all tiers and decides when to
|
||||
+protect pages from which tiers.
|
||||
+
|
||||
+The framework comprises two conceptually independent components: the
|
||||
+aging and the eviction, which can be invoked separately from user
|
||||
+space for the purpose of working set estimation and proactive reclaim.
|
||||
+
|
||||
+Aging
|
||||
+-----
|
||||
+The aging produces young generations. Given an ``lruvec``, the aging
|
||||
+traverses ``lruvec_memcg()->mm_list`` and calls ``walk_page_range()``
|
||||
+to scan PTEs for accessed pages (a ``mm_struct`` list is maintained
|
||||
+for each ``memcg``). Upon finding one, the aging updates its
|
||||
+generation number to ``max_seq`` (modulo ``CONFIG_NR_LRU_GENS``).
|
||||
+After each round of traversal, the aging increments ``max_seq``. The
|
||||
+aging is due when ``min_seq[]`` reaches ``max_seq-1``.
|
||||
+
|
||||
+Eviction
|
||||
+--------
|
||||
+The eviction consumes old generations. Given an ``lruvec``, the
|
||||
+eviction scans pages on the per-zone lists indexed by anon and file
|
||||
+``min_seq[]`` (modulo ``CONFIG_NR_LRU_GENS``). It first tries to
|
||||
+select a type based on the values of ``min_seq[]``. If they are
|
||||
+equal, it selects the type that has a lower refaulted %. The eviction
|
||||
+sorts a page according to its updated generation number if the aging
|
||||
+has found this page accessed. It also moves a page to the next
|
||||
+generation if this page is from an upper tier that has a higher
|
||||
+refaulted % than the base tier. The eviction increments ``min_seq[]``
|
||||
+of a selected type when it finds all the per-zone lists indexed by
|
||||
+``min_seq[]`` of this selected type are empty.
|
||||
+
|
||||
+To-do List
|
||||
+==========
|
||||
+KVM Optimization
|
||||
+----------------
|
||||
+Support shadow page table walk.
|
|
@@ -0,0 +1,226 @@
|
|||
From 73d1ff551760f0c79c47ab70faa4c2ca91413f5c Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Sun, 18 Sep 2022 02:00:08 -0600
|
||||
Subject: [PATCH 11/29] mm: multi-gen LRU: thrashing prevention
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Add /sys/kernel/mm/lru_gen/min_ttl_ms for thrashing prevention, as
|
||||
requested by many desktop users [1].
|
||||
|
||||
When set to value N, it prevents the working set of N milliseconds from
|
||||
getting evicted. The OOM killer is triggered if this working set cannot
|
||||
be kept in memory. Based on the average human detectable lag (~100ms),
|
||||
N=1000 usually eliminates intolerable lags due to thrashing. Larger
|
||||
values like N=3000 make lags less noticeable at the risk of premature OOM
|
||||
kills.
|
||||
|
||||
Compared with the size-based approach [2], this time-based approach
|
||||
has the following advantages:
|
||||
|
||||
1. It is easier to configure because it is agnostic to applications
|
||||
and memory sizes.
|
||||
2. It is more reliable because it is directly wired to the OOM killer.
|
||||
|
||||
[1] https://lore.kernel.org/r/Ydza%2FzXKY9ATRoh6@google.com/
|
||||
[2] https://lore.kernel.org/r/20101028191523.GA14972@google.com/
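
The knob described above is added by the sysfs hunk below as /sys/kernel/mm/lru_gen/min_ttl_ms and takes milliseconds; 0, the default, disables the protection. A short sketch of the N=1000 setting suggested in the message (the value is only the example given above):

    # protect the working set of the last second, at the risk of premature OOM kills
    echo 1000 > /sys/kernel/mm/lru_gen/min_ttl_ms
    cat /sys/kernel/mm/lru_gen/min_ttl_ms
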
|
||||
|
||||
Link: https://lkml.kernel.org/r/20220918080010.2920238-12-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Acked-by: Brian Geffon <bgeffon@google.com>
|
||||
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
||||
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
Acked-by: Steven Barrett <steven@liquorix.net>
|
||||
Acked-by: Suleiman Souhlal <suleiman@google.com>
|
||||
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
|
||||
Tested-by: Donald Carr <d@chaos-reins.com>
|
||||
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
|
||||
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
||||
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
|
||||
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
|
||||
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
|
||||
Cc: Andi Kleen <ak@linux.intel.com>
|
||||
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
|
||||
Cc: Barry Song <baohua@kernel.org>
|
||||
Cc: Catalin Marinas <catalin.marinas@arm.com>
|
||||
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Cc: Hillf Danton <hdanton@sina.com>
|
||||
Cc: Jens Axboe <axboe@kernel.dk>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
Cc: Matthew Wilcox <willy@infradead.org>
|
||||
Cc: Mel Gorman <mgorman@suse.de>
|
||||
Cc: Miaohe Lin <linmiaohe@huawei.com>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@linux.ibm.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
|
||||
Cc: Tejun Heo <tj@kernel.org>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: Will Deacon <will@kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/mmzone.h | 2 ++
|
||||
mm/vmscan.c | 74 ++++++++++++++++++++++++++++++++++++++++--
|
||||
2 files changed, 73 insertions(+), 3 deletions(-)
|
||||
|
||||
--- a/include/linux/mmzone.h
|
||||
+++ b/include/linux/mmzone.h
|
||||
@@ -399,6 +399,8 @@ struct lru_gen_struct {
|
||||
unsigned long max_seq;
|
||||
/* the eviction increments the oldest generation numbers */
|
||||
unsigned long min_seq[ANON_AND_FILE];
|
||||
+ /* the birth time of each generation in jiffies */
|
||||
+ unsigned long timestamps[MAX_NR_GENS];
|
||||
/* the multi-gen LRU lists, lazily sorted on eviction */
|
||||
struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
||||
/* the multi-gen LRU sizes, eventually consistent */
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -4064,6 +4064,7 @@ static void inc_max_seq(struct lruvec *l
|
||||
for (type = 0; type < ANON_AND_FILE; type++)
|
||||
reset_ctrl_pos(lruvec, type, false);
|
||||
|
||||
+ WRITE_ONCE(lrugen->timestamps[next], jiffies);
|
||||
/* make sure preceding modifications appear */
|
||||
smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
|
||||
|
||||
@@ -4193,7 +4194,7 @@ static bool should_run_aging(struct lruv
|
||||
return false;
|
||||
}
|
||||
|
||||
-static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
||||
+static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
|
||||
{
|
||||
bool need_aging;
|
||||
unsigned long nr_to_scan;
|
||||
@@ -4207,16 +4208,36 @@ static void age_lruvec(struct lruvec *lr
|
||||
mem_cgroup_calculate_protection(NULL, memcg);
|
||||
|
||||
if (mem_cgroup_below_min(memcg))
|
||||
- return;
|
||||
+ return false;
|
||||
|
||||
need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
|
||||
+
|
||||
+ if (min_ttl) {
|
||||
+ int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
|
||||
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
|
||||
+
|
||||
+ if (time_is_after_jiffies(birth + min_ttl))
|
||||
+ return false;
|
||||
+
|
||||
+ /* the size is likely too small to be helpful */
|
||||
+ if (!nr_to_scan && sc->priority != DEF_PRIORITY)
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
if (need_aging)
|
||||
try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
|
||||
+
|
||||
+ return true;
|
||||
}
|
||||
|
||||
+/* to protect the working set of the last N jiffies */
|
||||
+static unsigned long lru_gen_min_ttl __read_mostly;
|
||||
+
|
||||
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
|
||||
{
|
||||
struct mem_cgroup *memcg;
|
||||
+ bool success = false;
|
||||
+ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
|
||||
|
||||
VM_WARN_ON_ONCE(!current_is_kswapd());
|
||||
|
||||
@@ -4239,12 +4260,32 @@ static void lru_gen_age_node(struct pgli
|
||||
do {
|
||||
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
|
||||
|
||||
- age_lruvec(lruvec, sc);
|
||||
+ if (age_lruvec(lruvec, sc, min_ttl))
|
||||
+ success = true;
|
||||
|
||||
cond_resched();
|
||||
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
|
||||
|
||||
clear_mm_walk();
|
||||
+
|
||||
+ /* check the order to exclude compaction-induced reclaim */
|
||||
+ if (success || !min_ttl || sc->order)
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * The main goal is to OOM kill if every generation from all memcgs is
|
||||
+ * younger than min_ttl. However, another possibility is all memcgs are
|
||||
+ * either below min or empty.
|
||||
+ */
|
||||
+ if (mutex_trylock(&oom_lock)) {
|
||||
+ struct oom_control oc = {
|
||||
+ .gfp_mask = sc->gfp_mask,
|
||||
+ };
|
||||
+
|
||||
+ out_of_memory(&oc);
|
||||
+
|
||||
+ mutex_unlock(&oom_lock);
|
||||
+ }
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -5002,6 +5043,28 @@ unlock:
|
||||
* sysfs interface
|
||||
******************************************************************************/
|
||||
|
||||
+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
+{
|
||||
+ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
|
||||
+}
|
||||
+
|
||||
+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
+ const char *buf, size_t len)
|
||||
+{
|
||||
+ unsigned int msecs;
|
||||
+
|
||||
+ if (kstrtouint(buf, 0, &msecs))
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
|
||||
+
|
||||
+ return len;
|
||||
+}
|
||||
+
|
||||
+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
|
||||
+ min_ttl_ms, 0644, show_min_ttl, store_min_ttl
|
||||
+);
|
||||
+
|
||||
static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
unsigned int caps = 0;
|
||||
@@ -5050,6 +5113,7 @@ static struct kobj_attribute lru_gen_ena
|
||||
);
|
||||
|
||||
static struct attribute *lru_gen_attrs[] = {
|
||||
+ &lru_gen_min_ttl_attr.attr,
|
||||
&lru_gen_enabled_attr.attr,
|
||||
NULL
|
||||
};
|
||||
@@ -5065,12 +5129,16 @@ static struct attribute_group lru_gen_at
|
||||
|
||||
void lru_gen_init_lruvec(struct lruvec *lruvec)
|
||||
{
|
||||
+ int i;
|
||||
int gen, type, zone;
|
||||
struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
|
||||
lrugen->max_seq = MIN_NR_GENS + 1;
|
||||
lrugen->enabled = lru_gen_enabled();
|
||||
|
||||
+ for (i = 0; i <= MIN_NR_GENS + 1; i++)
|
||||
+ lrugen->timestamps[i] = jiffies;
|
||||
+
|
||||
for_each_gen_type_zone(gen, type, zone)
|
||||
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
|
||||
|
|
@@ -0,0 +1,579 @@
|
|||
From 530716d008ca26315f246cd70dc1cefc636beaa4 Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Sun, 18 Sep 2022 02:00:09 -0600
|
||||
Subject: [PATCH 12/29] mm: multi-gen LRU: debugfs interface
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Add /sys/kernel/debug/lru_gen for working set estimation and proactive
|
||||
reclaim. These techniques are commonly used to optimize job scheduling
|
||||
(bin packing) in data centers [1][2].
|
||||
|
||||
Compared with the page table-based approach and the PFN-based
|
||||
approach, this lruvec-based approach has the following advantages:
|
||||
1. It offers better choices because it is aware of memcgs, NUMA nodes,
|
||||
shared mappings and unmapped page cache.
|
||||
2. It is more scalable because it is O(nr_hot_pages), whereas the
|
||||
PFN-based approach is O(nr_total_pages).
|
||||
|
||||
Add /sys/kernel/debug/lru_gen_full for debugging.
|
||||
|
||||
[1] https://dl.acm.org/doi/10.1145/3297858.3304053
|
||||
[2] https://dl.acm.org/doi/10.1145/3503222.3507731
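
For orientation, the write handler added below parses command lines of the form "<+|-> memcg_id node_id seq [swappiness] [opt]", where opt is force_scan for '+' (aging) and nr_to_reclaim for '-' (eviction); multiple commands can be separated by ',', ';' or newlines. A hedged sketch, with the memcg/node/seq numbers purely illustrative:

    # dump the per-memcg, per-node generation summary (lru_gen_full adds the stats)
    cat /sys/kernel/debug/lru_gen
    cat /sys/kernel/debug/lru_gen_full
    # aging: create the next generation for memcg 1 on node 0, swappiness 1,
    # force_scan 0 (seq must equal the current max_gen shown by the read above)
    echo '+ 1 0 7 1 0' > /sys/kernel/debug/lru_gen
    # eviction: reclaim up to 4096 pages from generations <= 5 of the same lruvec
    echo '- 1 0 5 1 4096' > /sys/kernel/debug/lru_gen
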
|
||||
|
||||
Link: https://lkml.kernel.org/r/20220918080010.2920238-13-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Reviewed-by: Qi Zheng <zhengqi.arch@bytedance.com>
|
||||
Acked-by: Brian Geffon <bgeffon@google.com>
|
||||
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
||||
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
Acked-by: Steven Barrett <steven@liquorix.net>
|
||||
Acked-by: Suleiman Souhlal <suleiman@google.com>
|
||||
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
|
||||
Tested-by: Donald Carr <d@chaos-reins.com>
|
||||
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
|
||||
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
||||
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
|
||||
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
|
||||
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
|
||||
Cc: Andi Kleen <ak@linux.intel.com>
|
||||
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
|
||||
Cc: Barry Song <baohua@kernel.org>
|
||||
Cc: Catalin Marinas <catalin.marinas@arm.com>
|
||||
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Cc: Hillf Danton <hdanton@sina.com>
|
||||
Cc: Jens Axboe <axboe@kernel.dk>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
Cc: Matthew Wilcox <willy@infradead.org>
|
||||
Cc: Mel Gorman <mgorman@suse.de>
|
||||
Cc: Miaohe Lin <linmiaohe@huawei.com>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@linux.ibm.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Tejun Heo <tj@kernel.org>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: Will Deacon <will@kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/nodemask.h | 1 +
|
||||
mm/vmscan.c | 411 ++++++++++++++++++++++++++++++++++++++-
|
||||
2 files changed, 402 insertions(+), 10 deletions(-)
|
||||
|
||||
--- a/include/linux/nodemask.h
|
||||
+++ b/include/linux/nodemask.h
|
||||
@@ -485,6 +485,7 @@ static inline int num_node_state(enum no
|
||||
#define first_online_node 0
|
||||
#define first_memory_node 0
|
||||
#define next_online_node(nid) (MAX_NUMNODES)
|
||||
+#define next_memory_node(nid) (MAX_NUMNODES)
|
||||
#define nr_node_ids 1U
|
||||
#define nr_online_nodes 1U
|
||||
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -53,6 +53,7 @@
|
||||
#include <linux/pagewalk.h>
|
||||
#include <linux/shmem_fs.h>
|
||||
#include <linux/ctype.h>
|
||||
+#include <linux/debugfs.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/div64.h>
|
||||
@@ -3968,12 +3969,40 @@ static void clear_mm_walk(void)
|
||||
kfree(walk);
|
||||
}
|
||||
|
||||
-static void inc_min_seq(struct lruvec *lruvec, int type)
|
||||
+static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
|
||||
{
|
||||
+ int zone;
|
||||
+ int remaining = MAX_LRU_BATCH;
|
||||
struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
|
||||
+
|
||||
+ if (type == LRU_GEN_ANON && !can_swap)
|
||||
+ goto done;
|
||||
+
|
||||
+ /* prevent cold/hot inversion if force_scan is true */
|
||||
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
|
||||
+ struct list_head *head = &lrugen->lists[old_gen][type][zone];
|
||||
+
|
||||
+ while (!list_empty(head)) {
|
||||
+ struct page *page = lru_to_page(head);
|
||||
+
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);
|
||||
|
||||
+ new_gen = page_inc_gen(lruvec, page, false);
|
||||
+ list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]);
|
||||
+
|
||||
+ if (!--remaining)
|
||||
+ return false;
|
||||
+ }
|
||||
+ }
|
||||
+done:
|
||||
reset_ctrl_pos(lruvec, type, true);
|
||||
WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
|
||||
+
|
||||
+ return true;
|
||||
}
|
||||
|
||||
static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
|
||||
@@ -4019,7 +4048,7 @@ next:
|
||||
return success;
|
||||
}
|
||||
|
||||
-static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
|
||||
+static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
|
||||
{
|
||||
int prev, next;
|
||||
int type, zone;
|
||||
@@ -4033,9 +4062,13 @@ static void inc_max_seq(struct lruvec *l
|
||||
if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
|
||||
continue;
|
||||
|
||||
- VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap);
|
||||
+ VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
|
||||
|
||||
- inc_min_seq(lruvec, type);
|
||||
+ while (!inc_min_seq(lruvec, type, can_swap)) {
|
||||
+ spin_unlock_irq(&lruvec->lru_lock);
|
||||
+ cond_resched();
|
||||
+ spin_lock_irq(&lruvec->lru_lock);
|
||||
+ }
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -4072,7 +4105,7 @@ static void inc_max_seq(struct lruvec *l
|
||||
}
|
||||
|
||||
static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
|
||||
- struct scan_control *sc, bool can_swap)
|
||||
+ struct scan_control *sc, bool can_swap, bool force_scan)
|
||||
{
|
||||
bool success;
|
||||
struct lru_gen_mm_walk *walk;
|
||||
@@ -4093,7 +4126,7 @@ static bool try_to_inc_max_seq(struct lr
|
||||
* handful of PTEs. Spreading the work out over a period of time usually
|
||||
* is less efficient, but it avoids bursty page faults.
|
||||
*/
|
||||
- if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
|
||||
+ if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
|
||||
success = iterate_mm_list_nowalk(lruvec, max_seq);
|
||||
goto done;
|
||||
}
|
||||
@@ -4107,7 +4140,7 @@ static bool try_to_inc_max_seq(struct lr
|
||||
walk->lruvec = lruvec;
|
||||
walk->max_seq = max_seq;
|
||||
walk->can_swap = can_swap;
|
||||
- walk->force_scan = false;
|
||||
+ walk->force_scan = force_scan;
|
||||
|
||||
do {
|
||||
success = iterate_mm_list(lruvec, walk, &mm);
|
||||
@@ -4127,7 +4160,7 @@ done:
|
||||
|
||||
VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
|
||||
|
||||
- inc_max_seq(lruvec, can_swap);
|
||||
+ inc_max_seq(lruvec, can_swap, force_scan);
|
||||
/* either this sees any waiters or they will see updated max_seq */
|
||||
if (wq_has_sleeper(&lruvec->mm_state.wait))
|
||||
wake_up_all(&lruvec->mm_state.wait);
|
||||
@@ -4225,7 +4258,7 @@ static bool age_lruvec(struct lruvec *lr
|
||||
}
|
||||
|
||||
if (need_aging)
|
||||
- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
|
||||
+ try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -4784,7 +4817,7 @@ static unsigned long get_nr_to_scan(stru
|
||||
if (current_is_kswapd())
|
||||
return 0;
|
||||
|
||||
- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap))
|
||||
+ if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
|
||||
return nr_to_scan;
|
||||
done:
|
||||
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
|
||||
@@ -5124,6 +5157,361 @@ static struct attribute_group lru_gen_at
|
||||
};
|
||||
|
||||
/******************************************************************************
|
||||
+ * debugfs interface
|
||||
+ ******************************************************************************/
|
||||
+
|
||||
+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
|
||||
+{
|
||||
+ struct mem_cgroup *memcg;
|
||||
+ loff_t nr_to_skip = *pos;
|
||||
+
|
||||
+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
|
||||
+ if (!m->private)
|
||||
+ return ERR_PTR(-ENOMEM);
|
||||
+
|
||||
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
|
||||
+ do {
|
||||
+ int nid;
|
||||
+
|
||||
+ for_each_node_state(nid, N_MEMORY) {
|
||||
+ if (!nr_to_skip--)
|
||||
+ return get_lruvec(memcg, nid);
|
||||
+ }
|
||||
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
|
||||
+
|
||||
+ return NULL;
|
||||
+}
|
||||
+
|
||||
+static void lru_gen_seq_stop(struct seq_file *m, void *v)
|
||||
+{
|
||||
+ if (!IS_ERR_OR_NULL(v))
|
||||
+ mem_cgroup_iter_break(NULL, lruvec_memcg(v));
|
||||
+
|
||||
+ kvfree(m->private);
|
||||
+ m->private = NULL;
|
||||
+}
|
||||
+
|
||||
+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
|
||||
+{
|
||||
+ int nid = lruvec_pgdat(v)->node_id;
|
||||
+ struct mem_cgroup *memcg = lruvec_memcg(v);
|
||||
+
|
||||
+ ++*pos;
|
||||
+
|
||||
+ nid = next_memory_node(nid);
|
||||
+ if (nid == MAX_NUMNODES) {
|
||||
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
|
||||
+ if (!memcg)
|
||||
+ return NULL;
|
||||
+
|
||||
+ nid = first_memory_node;
|
||||
+ }
|
||||
+
|
||||
+ return get_lruvec(memcg, nid);
|
||||
+}
|
||||
+
|
||||
+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
|
||||
+ unsigned long max_seq, unsigned long *min_seq,
|
||||
+ unsigned long seq)
|
||||
+{
|
||||
+ int i;
|
||||
+ int type, tier;
|
||||
+ int hist = lru_hist_from_seq(seq);
|
||||
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+
|
||||
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
|
||||
+ seq_printf(m, " %10d", tier);
|
||||
+ for (type = 0; type < ANON_AND_FILE; type++) {
|
||||
+ const char *s = " ";
|
||||
+ unsigned long n[3] = {};
|
||||
+
|
||||
+ if (seq == max_seq) {
|
||||
+ s = "RT ";
|
||||
+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
|
||||
+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
|
||||
+ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
|
||||
+ s = "rep";
|
||||
+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
|
||||
+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
|
||||
+ if (tier)
|
||||
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
|
||||
+ }
|
||||
+
|
||||
+ for (i = 0; i < 3; i++)
|
||||
+ seq_printf(m, " %10lu%c", n[i], s[i]);
|
||||
+ }
|
||||
+ seq_putc(m, '\n');
|
||||
+ }
|
||||
+
|
||||
+ seq_puts(m, " ");
|
||||
+ for (i = 0; i < NR_MM_STATS; i++) {
|
||||
+ const char *s = " ";
|
||||
+ unsigned long n = 0;
|
||||
+
|
||||
+ if (seq == max_seq && NR_HIST_GENS == 1) {
|
||||
+ s = "LOYNFA";
|
||||
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
|
||||
+ } else if (seq != max_seq && NR_HIST_GENS > 1) {
|
||||
+ s = "loynfa";
|
||||
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
|
||||
+ }
|
||||
+
|
||||
+ seq_printf(m, " %10lu%c", n, s[i]);
|
||||
+ }
|
||||
+ seq_putc(m, '\n');
|
||||
+}
|
||||
+
|
||||
+static int lru_gen_seq_show(struct seq_file *m, void *v)
|
||||
+{
|
||||
+ unsigned long seq;
|
||||
+ bool full = !debugfs_real_fops(m->file)->write;
|
||||
+ struct lruvec *lruvec = v;
|
||||
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ int nid = lruvec_pgdat(lruvec)->node_id;
|
||||
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
+ DEFINE_MAX_SEQ(lruvec);
|
||||
+ DEFINE_MIN_SEQ(lruvec);
|
||||
+
|
||||
+ if (nid == first_memory_node) {
|
||||
+ const char *path = memcg ? m->private : "";
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+ if (memcg)
|
||||
+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
|
||||
+#endif
|
||||
+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
|
||||
+ }
|
||||
+
|
||||
+ seq_printf(m, " node %5d\n", nid);
|
||||
+
|
||||
+ if (!full)
|
||||
+ seq = min_seq[LRU_GEN_ANON];
|
||||
+ else if (max_seq >= MAX_NR_GENS)
|
||||
+ seq = max_seq - MAX_NR_GENS + 1;
|
||||
+ else
|
||||
+ seq = 0;
|
||||
+
|
||||
+ for (; seq <= max_seq; seq++) {
|
||||
+ int type, zone;
|
||||
+ int gen = lru_gen_from_seq(seq);
|
||||
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
|
||||
+
|
||||
+ seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));
|
||||
+
|
||||
+ for (type = 0; type < ANON_AND_FILE; type++) {
|
||||
+ unsigned long size = 0;
|
||||
+ char mark = full && seq < min_seq[type] ? 'x' : ' ';
|
||||
+
|
||||
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
|
||||
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
|
||||
+
|
||||
+ seq_printf(m, " %10lu%c", size, mark);
|
||||
+ }
|
||||
+
|
||||
+ seq_putc(m, '\n');
|
||||
+
|
||||
+ if (full)
|
||||
+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static const struct seq_operations lru_gen_seq_ops = {
|
||||
+ .start = lru_gen_seq_start,
|
||||
+ .stop = lru_gen_seq_stop,
|
||||
+ .next = lru_gen_seq_next,
|
||||
+ .show = lru_gen_seq_show,
|
||||
+};
|
||||
+
|
||||
+static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
|
||||
+ bool can_swap, bool force_scan)
|
||||
+{
|
||||
+ DEFINE_MAX_SEQ(lruvec);
|
||||
+ DEFINE_MIN_SEQ(lruvec);
|
||||
+
|
||||
+ if (seq < max_seq)
|
||||
+ return 0;
|
||||
+
|
||||
+ if (seq > max_seq)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
|
||||
+ return -ERANGE;
|
||||
+
|
||||
+ try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
|
||||
+ int swappiness, unsigned long nr_to_reclaim)
|
||||
+{
|
||||
+ DEFINE_MAX_SEQ(lruvec);
|
||||
+
|
||||
+ if (seq + MIN_NR_GENS > max_seq)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ sc->nr_reclaimed = 0;
|
||||
+
|
||||
+ while (!signal_pending(current)) {
|
||||
+ DEFINE_MIN_SEQ(lruvec);
|
||||
+
|
||||
+ if (seq < min_seq[!swappiness])
|
||||
+ return 0;
|
||||
+
|
||||
+ if (sc->nr_reclaimed >= nr_to_reclaim)
|
||||
+ return 0;
|
||||
+
|
||||
+ if (!evict_pages(lruvec, sc, swappiness, NULL))
|
||||
+ return 0;
|
||||
+
|
||||
+ cond_resched();
|
||||
+ }
|
||||
+
|
||||
+ return -EINTR;
|
||||
+}
|
||||
+
|
||||
+static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
|
||||
+ struct scan_control *sc, int swappiness, unsigned long opt)
|
||||
+{
|
||||
+ struct lruvec *lruvec;
|
||||
+ int err = -EINVAL;
|
||||
+ struct mem_cgroup *memcg = NULL;
|
||||
+
|
||||
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ if (!mem_cgroup_disabled()) {
|
||||
+ rcu_read_lock();
|
||||
+ memcg = mem_cgroup_from_id(memcg_id);
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+ if (memcg && !css_tryget(&memcg->css))
|
||||
+ memcg = NULL;
|
||||
+#endif
|
||||
+ rcu_read_unlock();
|
||||
+
|
||||
+ if (!memcg)
|
||||
+ return -EINVAL;
|
||||
+ }
|
||||
+
|
||||
+ if (memcg_id != mem_cgroup_id(memcg))
|
||||
+ goto done;
|
||||
+
|
||||
+ lruvec = get_lruvec(memcg, nid);
|
||||
+
|
||||
+ if (swappiness < 0)
|
||||
+ swappiness = get_swappiness(lruvec, sc);
|
||||
+ else if (swappiness > 200)
|
||||
+ goto done;
|
||||
+
|
||||
+ switch (cmd) {
|
||||
+ case '+':
|
||||
+ err = run_aging(lruvec, seq, sc, swappiness, opt);
|
||||
+ break;
|
||||
+ case '-':
|
||||
+ err = run_eviction(lruvec, seq, sc, swappiness, opt);
|
||||
+ break;
|
||||
+ }
|
||||
+done:
|
||||
+ mem_cgroup_put(memcg);
|
||||
+
|
||||
+ return err;
|
||||
+}
|
||||
+
|
||||
+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
|
||||
+ size_t len, loff_t *pos)
|
||||
+{
|
||||
+ void *buf;
|
||||
+ char *cur, *next;
|
||||
+ unsigned int flags;
|
||||
+ struct blk_plug plug;
|
||||
+ int err = -EINVAL;
|
||||
+ struct scan_control sc = {
|
||||
+ .may_writepage = true,
|
||||
+ .may_unmap = true,
|
||||
+ .may_swap = true,
|
||||
+ .reclaim_idx = MAX_NR_ZONES - 1,
|
||||
+ .gfp_mask = GFP_KERNEL,
|
||||
+ };
|
||||
+
|
||||
+ buf = kvmalloc(len + 1, GFP_KERNEL);
|
||||
+ if (!buf)
|
||||
+ return -ENOMEM;
|
||||
+
|
||||
+ if (copy_from_user(buf, src, len)) {
|
||||
+ kvfree(buf);
|
||||
+ return -EFAULT;
|
||||
+ }
|
||||
+
|
||||
+ set_task_reclaim_state(current, &sc.reclaim_state);
|
||||
+ flags = memalloc_noreclaim_save();
|
||||
+ blk_start_plug(&plug);
|
||||
+ if (!set_mm_walk(NULL)) {
|
||||
+ err = -ENOMEM;
|
||||
+ goto done;
|
||||
+ }
|
||||
+
|
||||
+ next = buf;
|
||||
+ next[len] = '\0';
|
||||
+
|
||||
+ while ((cur = strsep(&next, ",;\n"))) {
|
||||
+ int n;
|
||||
+ int end;
|
||||
+ char cmd;
|
||||
+ unsigned int memcg_id;
|
||||
+ unsigned int nid;
|
||||
+ unsigned long seq;
|
||||
+ unsigned int swappiness = -1;
|
||||
+ unsigned long opt = -1;
|
||||
+
|
||||
+ cur = skip_spaces(cur);
|
||||
+ if (!*cur)
|
||||
+ continue;
|
||||
+
|
||||
+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
|
||||
+ &seq, &end, &swappiness, &end, &opt, &end);
|
||||
+ if (n < 4 || cur[end]) {
|
||||
+ err = -EINVAL;
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
|
||||
+ if (err)
|
||||
+ break;
|
||||
+ }
|
||||
+done:
|
||||
+ clear_mm_walk();
|
||||
+ blk_finish_plug(&plug);
|
||||
+ memalloc_noreclaim_restore(flags);
|
||||
+ set_task_reclaim_state(current, NULL);
|
||||
+
|
||||
+ kvfree(buf);
|
||||
+
|
||||
+ return err ? : len;
|
||||
+}
|
||||
+
|
||||
+static int lru_gen_seq_open(struct inode *inode, struct file *file)
|
||||
+{
|
||||
+ return seq_open(file, &lru_gen_seq_ops);
|
||||
+}
|
||||
+
|
||||
+static const struct file_operations lru_gen_rw_fops = {
|
||||
+ .open = lru_gen_seq_open,
|
||||
+ .read = seq_read,
|
||||
+ .write = lru_gen_seq_write,
|
||||
+ .llseek = seq_lseek,
|
||||
+ .release = seq_release,
|
||||
+};
|
||||
+
|
||||
+static const struct file_operations lru_gen_ro_fops = {
|
||||
+ .open = lru_gen_seq_open,
|
||||
+ .read = seq_read,
|
||||
+ .llseek = seq_lseek,
|
||||
+ .release = seq_release,
|
||||
+};
|
||||
+
|
||||
+/******************************************************************************
|
||||
* initialization
|
||||
******************************************************************************/
|
||||
|
||||
@@ -5180,6 +5568,9 @@ static int __init init_lru_gen(void)
|
||||
if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
|
||||
pr_err("lru_gen: failed to create sysfs group\n");
|
||||
|
||||
+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
|
||||
+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
|
||||
+
|
||||
return 0;
|
||||
};
|
||||
late_initcall(init_lru_gen);
|
|
@@ -1,7 +1,7 @@
|
|||
From 14aa8b2d5c2ebead01b542f62d68029023054774 Mon Sep 17 00:00:00 2001
|
||||
From 92d430e8955c976eacb7cc91d7ff849c0dd009af Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Wed, 28 Sep 2022 13:36:58 -0600
|
||||
Subject: [PATCH 1/1] mm/mglru: don't sync disk for each aging cycle
|
||||
Subject: [PATCH 13/29] mm/mglru: don't sync disk for each aging cycle
|
||||
|
||||
wakeup_flusher_threads() was added under the assumption that if a system
|
||||
runs out of clean cold pages, it might want to write back dirty pages more
|
||||
|
@@ -21,9 +21,9 @@ Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
|||
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -4007,8 +4007,6 @@ static bool try_to_inc_max_seq(struct lr
|
||||
if (wq_has_sleeper(&lruvec->mm_walk.wait))
|
||||
wake_up_all(&lruvec->mm_walk.wait);
|
||||
@@ -4165,8 +4165,6 @@ done:
|
||||
if (wq_has_sleeper(&lruvec->mm_state.wait))
|
||||
wake_up_all(&lruvec->mm_state.wait);
|
||||
|
||||
- wakeup_flusher_threads(WB_REASON_VMSCAN);
|
||||
-
|
|
@@ -0,0 +1,124 @@
|
|||
From 6f315879ad750391a0b1fab8c9170bc054a5f5d7 Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Tue, 15 Nov 2022 18:38:07 -0700
|
||||
Subject: [PATCH 14/29] mm: multi-gen LRU: retry pages written back while
|
||||
isolated
|
||||
|
||||
The page reclaim isolates a batch of pages from the tail of one of the
|
||||
LRU lists and works on those pages one by one. For a suitable
|
||||
swap-backed page, if the swap device is async, it queues that page for
|
||||
writeback. After the page reclaim finishes an entire batch, it puts back
|
||||
the pages it queued for writeback to the head of the original LRU list.
|
||||
|
||||
In the meantime, the page writeback flushes the queued pages also by
|
||||
batches. Its batching logic is independent from that of the page reclaim.
|
||||
For each of the pages it writes back, the page writeback calls
|
||||
rotate_reclaimable_page() which tries to rotate a page to the tail.
|
||||
|
||||
rotate_reclaimable_page() only works for a page after the page reclaim
|
||||
has put it back. If an async swap device is fast enough, the page
|
||||
writeback can finish with that page while the page reclaim is still
|
||||
working on the rest of the batch containing it. In this case, that page
|
||||
will remain at the head and the page reclaim will not retry it before
|
||||
reaching there.
|
||||
|
||||
This patch adds a retry to evict_pages(). After evict_pages() has
|
||||
finished an entire batch and before it puts back pages it cannot free
|
||||
immediately, it retries those that may have missed the rotation.
|
||||
|
||||
Before this patch, ~60% of pages swapped to an Intel Optane missed
|
||||
rotate_reclaimable_page(). After this patch, ~99% of missed pages were
|
||||
reclaimed upon retry.
|
||||
|
||||
This problem affects relatively slow async swap devices like Samsung 980
|
||||
Pro much less and does not affect sync swap devices like zram or zswap at
|
||||
all.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20221116013808.3995280-1-yuzhao@google.com
|
||||
Fixes: ac35a4902374 ("mm: multi-gen LRU: minimal implementation")
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Cc: "Yin, Fengwei" <fengwei.yin@intel.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/vmscan.c | 48 +++++++++++++++++++++++++++++++++++++-----------
|
||||
1 file changed, 37 insertions(+), 11 deletions(-)
|
||||
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -4723,10 +4723,13 @@ static int evict_pages(struct lruvec *lr
|
||||
int scanned;
|
||||
int reclaimed;
|
||||
LIST_HEAD(list);
|
||||
+ LIST_HEAD(clean);
|
||||
struct page *page;
|
||||
+ struct page *next;
|
||||
enum vm_event_item item;
|
||||
struct reclaim_stat stat;
|
||||
struct lru_gen_mm_walk *walk;
|
||||
+ bool skip_retry = false;
|
||||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
|
||||
|
||||
@@ -4743,20 +4746,37 @@ static int evict_pages(struct lruvec *lr
|
||||
|
||||
if (list_empty(&list))
|
||||
return scanned;
|
||||
-
|
||||
+retry:
|
||||
reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
|
||||
+ sc->nr_reclaimed += reclaimed;
|
||||
|
||||
- list_for_each_entry(page, &list, lru) {
|
||||
- /* restore LRU_REFS_FLAGS cleared by isolate_page() */
|
||||
- if (PageWorkingset(page))
|
||||
- SetPageReferenced(page);
|
||||
+ list_for_each_entry_safe_reverse(page, next, &list, lru) {
|
||||
+ if (!page_evictable(page)) {
|
||||
+ list_del(&page->lru);
|
||||
+ putback_lru_page(page);
|
||||
+ continue;
|
||||
+ }
|
||||
|
||||
- /* don't add rejected pages to the oldest generation */
|
||||
if (PageReclaim(page) &&
|
||||
- (PageDirty(page) || PageWriteback(page)))
|
||||
- ClearPageActive(page);
|
||||
- else
|
||||
- SetPageActive(page);
|
||||
+ (PageDirty(page) || PageWriteback(page))) {
|
||||
+ /* restore LRU_REFS_FLAGS cleared by isolate_page() */
|
||||
+ if (PageWorkingset(page))
|
||||
+ SetPageReferenced(page);
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ if (skip_retry || PageActive(page) || PageReferenced(page) ||
|
||||
+ page_mapped(page) || PageLocked(page) ||
|
||||
+ PageDirty(page) || PageWriteback(page)) {
|
||||
+ /* don't add rejected pages to the oldest generation */
|
||||
+ set_mask_bits(&page->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
|
||||
+ BIT(PG_active));
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ /* retry pages that may have missed rotate_reclaimable_page() */
|
||||
+ list_move(&page->lru, &clean);
|
||||
+ sc->nr_scanned -= thp_nr_pages(page);
|
||||
}
|
||||
|
||||
spin_lock_irq(&lruvec->lru_lock);
|
||||
@@ -4778,7 +4798,13 @@ static int evict_pages(struct lruvec *lr
|
||||
mem_cgroup_uncharge_list(&list);
|
||||
free_unref_page_list(&list);
|
||||
|
||||
- sc->nr_reclaimed += reclaimed;
|
||||
+ INIT_LIST_HEAD(&list);
|
||||
+ list_splice_init(&clean, &list);
|
||||
+
|
||||
+ if (!list_empty(&list)) {
|
||||
+ skip_retry = true;
|
||||
+ goto retry;
|
||||
+ }
|
||||
|
||||
if (need_swapping && type == LRU_GEN_ANON)
|
||||
*need_swapping = true;
|
|
@@ -0,0 +1,49 @@
|
|||
From 255bb0ac393f1c2818cd75af45a9226300ab3daf Mon Sep 17 00:00:00 2001
|
||||
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
|
||||
Date: Wed, 26 Oct 2022 15:48:30 +0200
|
||||
Subject: [PATCH 15/29] mm: multi-gen LRU: move lru_gen_add_mm() out of IRQ-off
|
||||
region
|
||||
|
||||
lru_gen_add_mm() has been added within an IRQ-off region in the commit
|
||||
mentioned below. The other invocations of lru_gen_add_mm() are not within
|
||||
an IRQ-off region.
|
||||
|
||||
The invocation within IRQ-off region is problematic on PREEMPT_RT because
|
||||
the function is using a spin_lock_t which must not be used within
|
||||
IRQ-disabled regions.
|
||||
|
||||
The other invocations of lru_gen_add_mm() occur while
|
||||
task_struct::alloc_lock is acquired. Move lru_gen_add_mm() after
|
||||
interrupts are enabled and before task_unlock().
|
||||
|
||||
Link: https://lkml.kernel.org/r/20221026134830.711887-1-bigeasy@linutronix.de
|
||||
Fixes: bd74fdaea1460 ("mm: multi-gen LRU: support page table walks")
|
||||
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
|
||||
Acked-by: Yu Zhao <yuzhao@google.com>
|
||||
Cc: Al Viro <viro@zeniv.linux.org.uk>
|
||||
Cc: "Eric W . Biederman" <ebiederm@xmission.com>
|
||||
Cc: Kees Cook <keescook@chromium.org>
|
||||
Cc: Thomas Gleixner <tglx@linutronix.de>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
fs/exec.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
--- a/fs/exec.c
|
||||
+++ b/fs/exec.c
|
||||
@@ -1013,7 +1013,6 @@ static int exec_mmap(struct mm_struct *m
|
||||
active_mm = tsk->active_mm;
|
||||
tsk->active_mm = mm;
|
||||
tsk->mm = mm;
|
||||
- lru_gen_add_mm(mm);
|
||||
/*
|
||||
* This prevents preemption while active_mm is being loaded and
|
||||
* it and mm are being updated, which could cause problems for
|
||||
@@ -1028,6 +1027,7 @@ static int exec_mmap(struct mm_struct *m
|
||||
local_irq_enable();
|
||||
tsk->mm->vmacache_seqnum = 0;
|
||||
vmacache_flush(tsk);
|
||||
+ lru_gen_add_mm(mm);
|
||||
task_unlock(tsk);
|
||||
lru_gen_use_mm(mm);
|
||||
if (old_mm) {
|
|
@@ -0,0 +1,96 @@
|
|||
From c5ec455ebd2b488d91de9d8915a0c8036a2a04dd Mon Sep 17 00:00:00 2001
|
||||
From: Juergen Gross <jgross@suse.com>
|
||||
Date: Wed, 30 Nov 2022 14:49:41 -0800
|
||||
Subject: [PATCH 17/29] mm: add dummy pmd_young() for architectures not having
|
||||
it
|
||||
|
||||
In order to avoid #ifdeffery add a dummy pmd_young() implementation as a
|
||||
fallback. This is required for the later patch "mm: introduce
|
||||
arch_has_hw_nonleaf_pmd_young()".
|
||||
|
||||
Link: https://lkml.kernel.org/r/fd3ac3cd-7349-6bbd-890a-71a9454ca0b3@suse.com
|
||||
Signed-off-by: Juergen Gross <jgross@suse.com>
|
||||
Acked-by: Yu Zhao <yuzhao@google.com>
|
||||
Cc: Borislav Petkov <bp@alien8.de>
|
||||
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
|
||||
Cc: "H. Peter Anvin" <hpa@zytor.com>
|
||||
Cc: Ingo Molnar <mingo@redhat.com>
|
||||
Cc: Sander Eikelenboom <linux@eikelenboom.it>
|
||||
Cc: Thomas Gleixner <tglx@linutronix.de>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
arch/mips/include/asm/pgtable.h | 1 +
|
||||
arch/riscv/include/asm/pgtable.h | 1 +
|
||||
arch/s390/include/asm/pgtable.h | 1 +
|
||||
arch/sparc/include/asm/pgtable_64.h | 1 +
|
||||
arch/x86/include/asm/pgtable.h | 1 +
|
||||
include/linux/pgtable.h | 7 +++++++
|
||||
6 files changed, 12 insertions(+)
|
||||
|
||||
--- a/arch/mips/include/asm/pgtable.h
|
||||
+++ b/arch/mips/include/asm/pgtable.h
|
||||
@@ -632,6 +632,7 @@ static inline pmd_t pmd_mkdirty(pmd_t pm
|
||||
return pmd;
|
||||
}
|
||||
|
||||
+#define pmd_young pmd_young
|
||||
static inline int pmd_young(pmd_t pmd)
|
||||
{
|
||||
return !!(pmd_val(pmd) & _PAGE_ACCESSED);
|
||||
--- a/arch/riscv/include/asm/pgtable.h
|
||||
+++ b/arch/riscv/include/asm/pgtable.h
|
||||
@@ -531,6 +531,7 @@ static inline int pmd_dirty(pmd_t pmd)
|
||||
return pte_dirty(pmd_pte(pmd));
|
||||
}
|
||||
|
||||
+#define pmd_young pmd_young
|
||||
static inline int pmd_young(pmd_t pmd)
|
||||
{
|
||||
return pte_young(pmd_pte(pmd));
|
||||
--- a/arch/s390/include/asm/pgtable.h
|
||||
+++ b/arch/s390/include/asm/pgtable.h
|
||||
@@ -748,6 +748,7 @@ static inline int pmd_dirty(pmd_t pmd)
|
||||
return (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) != 0;
|
||||
}
|
||||
|
||||
+#define pmd_young pmd_young
|
||||
static inline int pmd_young(pmd_t pmd)
|
||||
{
|
||||
return (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) != 0;
|
||||
--- a/arch/sparc/include/asm/pgtable_64.h
|
||||
+++ b/arch/sparc/include/asm/pgtable_64.h
|
||||
@@ -712,6 +712,7 @@ static inline unsigned long pmd_dirty(pm
|
||||
return pte_dirty(pte);
|
||||
}
|
||||
|
||||
+#define pmd_young pmd_young
|
||||
static inline unsigned long pmd_young(pmd_t pmd)
|
||||
{
|
||||
pte_t pte = __pte(pmd_val(pmd));
|
||||
--- a/arch/x86/include/asm/pgtable.h
|
||||
+++ b/arch/x86/include/asm/pgtable.h
|
||||
@@ -136,6 +136,7 @@ static inline int pmd_dirty(pmd_t pmd)
|
||||
return pmd_flags(pmd) & _PAGE_DIRTY;
|
||||
}
|
||||
|
||||
+#define pmd_young pmd_young
|
||||
static inline int pmd_young(pmd_t pmd)
|
||||
{
|
||||
return pmd_flags(pmd) & _PAGE_ACCESSED;
|
||||
--- a/include/linux/pgtable.h
|
||||
+++ b/include/linux/pgtable.h
|
||||
@@ -164,6 +164,13 @@ static inline pte_t *virt_to_kpte(unsign
|
||||
return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr);
|
||||
}
|
||||
|
||||
+#ifndef pmd_young
|
||||
+static inline int pmd_young(pmd_t pmd)
|
||||
+{
|
||||
+ return 0;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
|
||||
extern int ptep_set_access_flags(struct vm_area_struct *vma,
|
||||
unsigned long address, pte_t *ptep,
@@ -0,0 +1,113 @@
From 46cbda7b65998a5af4493f745d94417af697bd68 Mon Sep 17 00:00:00 2001
|
||||
From: Juergen Gross <jgross@suse.com>
|
||||
Date: Wed, 23 Nov 2022 07:45:10 +0100
|
||||
Subject: [PATCH 18/29] mm: introduce arch_has_hw_nonleaf_pmd_young()
|
||||
|
||||
When running as a Xen PV guest, commit eed9a328aa1a ("mm: x86: add
|
||||
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG") can cause a protection violation in
|
||||
pmdp_test_and_clear_young():
|
||||
|
||||
BUG: unable to handle page fault for address: ffff8880083374d0
|
||||
#PF: supervisor write access in kernel mode
|
||||
#PF: error_code(0x0003) - permissions violation
|
||||
PGD 3026067 P4D 3026067 PUD 3027067 PMD 7fee5067 PTE 8010000008337065
|
||||
Oops: 0003 [#1] PREEMPT SMP NOPTI
|
||||
CPU: 7 PID: 158 Comm: kswapd0 Not tainted 6.1.0-rc5-20221118-doflr+ #1
|
||||
RIP: e030:pmdp_test_and_clear_young+0x25/0x40
|
||||
|
||||
This happens because the Xen hypervisor can't emulate direct writes to
|
||||
page table entries other than PTEs.
|
||||
|
||||
This can easily be fixed by introducing arch_has_hw_nonleaf_pmd_young()
|
||||
similar to arch_has_hw_pte_young() and test that instead of
|
||||
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20221123064510.16225-1-jgross@suse.com
|
||||
Fixes: eed9a328aa1a ("mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG")
|
||||
Signed-off-by: Juergen Gross <jgross@suse.com>
|
||||
Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
|
||||
Acked-by: Yu Zhao <yuzhao@google.com>
|
||||
Tested-by: Sander Eikelenboom <linux@eikelenboom.it>
|
||||
Acked-by: David Hildenbrand <david@redhat.com> [core changes]
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
arch/x86/include/asm/pgtable.h | 8 ++++++++
|
||||
include/linux/pgtable.h | 11 +++++++++++
|
||||
mm/vmscan.c | 10 +++++-----
|
||||
3 files changed, 24 insertions(+), 5 deletions(-)
|
||||
|
||||
--- a/arch/x86/include/asm/pgtable.h
|
||||
+++ b/arch/x86/include/asm/pgtable.h
|
||||
@@ -1405,6 +1405,14 @@ static inline bool arch_has_hw_pte_young
|
||||
return true;
|
||||
}
|
||||
|
||||
+#ifdef CONFIG_XEN_PV
|
||||
+#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
|
||||
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
|
||||
+{
|
||||
+ return !cpu_feature_enabled(X86_FEATURE_XENPV);
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
#endif /* __ASSEMBLY__ */
|
||||
|
||||
#endif /* _ASM_X86_PGTABLE_H */
|
||||
--- a/include/linux/pgtable.h
|
||||
+++ b/include/linux/pgtable.h
|
||||
@@ -266,6 +266,17 @@ static inline int pmdp_clear_flush_young
|
||||
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
#endif
|
||||
|
||||
+#ifndef arch_has_hw_nonleaf_pmd_young
|
||||
+/*
|
||||
+ * Return whether the accessed bit in non-leaf PMD entries is supported on the
|
||||
+ * local CPU.
|
||||
+ */
|
||||
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
|
||||
+{
|
||||
+ return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
#ifndef arch_has_hw_pte_young
|
||||
/*
|
||||
* Return whether the accessed bit is supported on the local CPU.
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -3727,7 +3727,7 @@ static void walk_pmd_range_locked(pud_t
|
||||
goto next;
|
||||
|
||||
if (!pmd_trans_huge(pmd[i])) {
|
||||
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
|
||||
+ if (arch_has_hw_nonleaf_pmd_young() &&
|
||||
get_cap(LRU_GEN_NONLEAF_YOUNG))
|
||||
pmdp_test_and_clear_young(vma, addr, pmd + i);
|
||||
goto next;
|
||||
@@ -3825,14 +3825,14 @@ restart:
|
||||
#endif
|
||||
walk->mm_stats[MM_NONLEAF_TOTAL]++;
|
||||
|
||||
-#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
|
||||
- if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
|
||||
+ if (arch_has_hw_nonleaf_pmd_young() &&
|
||||
+ get_cap(LRU_GEN_NONLEAF_YOUNG)) {
|
||||
if (!pmd_young(val))
|
||||
continue;
|
||||
|
||||
walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
|
||||
}
|
||||
-#endif
|
||||
+
|
||||
if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
|
||||
continue;
|
||||
|
||||
@@ -5132,7 +5132,7 @@ static ssize_t show_enabled(struct kobje
|
||||
if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
|
||||
caps |= BIT(LRU_GEN_MM_WALK);
|
||||
|
||||
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
|
||||
+ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
|
||||
caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
@@ -0,0 +1,56 @@
From c7dfefd4bdfba3d5171038d1cc2d4160288e6ee4 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 15 Jan 2023 20:44:05 -0700
Subject: [PATCH 16/29] mm: multi-gen LRU: fix crash during cgroup migration

lru_gen_migrate_mm() assumes lru_gen_add_mm() runs prior to itself. This
isn't true for the following scenario:

    CPU 1                               CPU 2

    clone()
      cgroup_can_fork()
                                        cgroup_procs_write()
      cgroup_post_fork()
                                        task_lock()
                                        lru_gen_migrate_mm()
                                        task_unlock()
      task_lock()
      lru_gen_add_mm()
      task_unlock()

And when the above happens, kernel crashes because of linked list
corruption (mm_struct->lru_gen.list).

Link: https://lore.kernel.org/r/20230115134651.30028-1-msizanoen@qtmlabs.xyz/
Link: https://lkml.kernel.org/r/20230116034405.2960276-1-yuzhao@google.com
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: msizanoen <msizanoen@qtmlabs.xyz>
Tested-by: msizanoen <msizanoen@qtmlabs.xyz>
Cc: <stable@vger.kernel.org> [6.1+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmscan.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3024,13 +3024,16 @@ void lru_gen_migrate_mm(struct mm_struct
 	if (mem_cgroup_disabled())
 		return;
 
+	/* migration can happen before addition */
+	if (!mm->lru_gen.memcg)
+		return;
+
 	rcu_read_lock();
 	memcg = mem_cgroup_from_task(task);
 	rcu_read_unlock();
 	if (memcg == mm->lru_gen.memcg)
 		return;
 
-	VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
 	VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
 
 	lru_gen_del_mm(mm);
@@ -0,0 +1,196 @@
From 6c7f552a48b49a8612786a28a2239fbc24fac289 Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Fri, 30 Dec 2022 14:52:51 -0700
|
||||
Subject: [PATCH 19/29] mm: add vma_has_recency()
|
||||
|
||||
Add vma_has_recency() to indicate whether a VMA may exhibit temporal
|
||||
locality that the LRU algorithm relies on.
|
||||
|
||||
This function returns false for VMAs marked by VM_SEQ_READ or
|
||||
VM_RAND_READ. While the former flag indicates linear access, i.e., a
|
||||
special case of spatial locality, both flags indicate a lack of temporal
|
||||
locality, i.e., the reuse of an area within a relatively small duration.
|
||||
|
||||
"Recency" is chosen over "locality" to avoid confusion between temporal
|
||||
and spatial localities.
|
||||
|
||||
Before this patch, the active/inactive LRU only ignored the accessed bit
|
||||
from VMAs marked by VM_SEQ_READ. After this patch, the active/inactive
|
||||
LRU and MGLRU share the same logic: they both ignore the accessed bit if
|
||||
vma_has_recency() returns false.
|
||||
|
||||
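For context, a mapping picks up VM_SEQ_READ or VM_RAND_READ when user space
calls madvise() on it, so those are the VMAs that lose "recency" here. A
minimal, illustrative user-space sketch (the file path is hypothetical and
error handling is abbreviated):

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/mman.h>
  #include <sys/stat.h>
  #include <unistd.h>

  int main(void)
  {
      struct stat st;
      int fd = open("/mnt/data.bin", O_RDONLY);   /* hypothetical file */

      if (fd < 0 || fstat(fd, &st) < 0) {
          perror("open/fstat");
          return 1;
      }

      void *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
      if (p == MAP_FAILED) {
          perror("mmap");
          return 1;
      }

      /* MADV_SEQUENTIAL sets VM_SEQ_READ on this VMA (MADV_RANDOM would set
         VM_RAND_READ), so vma_has_recency() returns false for it and the
         accessed bit from this mapping is ignored */
      if (madvise(p, st.st_size, MADV_SEQUENTIAL))
          perror("madvise");

      /* ... stream through the mapping ... */

      munmap(p, st.st_size);
      close(fd);
      return 0;
  }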
For the active/inactive LRU, the following fio test showed a [6, 8]%
|
||||
increase in IOPS when randomly accessing mapped files under memory
|
||||
pressure.
|
||||
|
||||
kb=$(awk '/MemTotal/ { print $2 }' /proc/meminfo)
|
||||
kb=$((kb - 8*1024*1024))
|
||||
|
||||
modprobe brd rd_nr=1 rd_size=$kb
|
||||
dd if=/dev/zero of=/dev/ram0 bs=1M
|
||||
|
||||
mkfs.ext4 /dev/ram0
|
||||
mount /dev/ram0 /mnt/
|
||||
swapoff -a
|
||||
|
||||
fio --name=test --directory=/mnt/ --ioengine=mmap --numjobs=8 \
|
||||
--size=8G --rw=randrw --time_based --runtime=10m \
|
||||
--group_reporting
|
||||
|
||||
The discussion that led to this patch is here [1]. Additional test
|
||||
results are available in that thread.
|
||||
|
||||
[1] https://lore.kernel.org/r/Y31s%2FK8T85jh05wH@google.com/
|
||||
|
||||
Link: https://lkml.kernel.org/r/20221230215252.2628425-1-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
|
||||
Cc: Andrea Righi <andrea.righi@canonical.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/mm_inline.h | 9 +++++++++
|
||||
mm/memory.c | 8 ++++----
|
||||
mm/rmap.c | 42 +++++++++++++++++----------------------
|
||||
mm/vmscan.c | 5 ++++-
|
||||
4 files changed, 35 insertions(+), 29 deletions(-)
|
||||
|
||||
--- a/include/linux/mm_inline.h
|
||||
+++ b/include/linux/mm_inline.h
|
||||
@@ -333,4 +333,13 @@ static __always_inline void del_page_fro
|
||||
update_lru_size(lruvec, page_lru(page), page_zonenum(page),
|
||||
-thp_nr_pages(page));
|
||||
}
|
||||
+
|
||||
+static inline bool vma_has_recency(struct vm_area_struct *vma)
|
||||
+{
|
||||
+ if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))
|
||||
+ return false;
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
#endif
|
||||
--- a/mm/memory.c
|
||||
+++ b/mm/memory.c
|
||||
@@ -41,6 +41,7 @@
|
||||
|
||||
#include <linux/kernel_stat.h>
|
||||
#include <linux/mm.h>
|
||||
+#include <linux/mm_inline.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/sched/coredump.h>
|
||||
#include <linux/sched/numa_balancing.h>
|
||||
@@ -1353,8 +1354,7 @@ again:
|
||||
force_flush = 1;
|
||||
set_page_dirty(page);
|
||||
}
|
||||
- if (pte_young(ptent) &&
|
||||
- likely(!(vma->vm_flags & VM_SEQ_READ)))
|
||||
+ if (pte_young(ptent) && likely(vma_has_recency(vma)))
|
||||
mark_page_accessed(page);
|
||||
}
|
||||
rss[mm_counter(page)]--;
|
||||
@@ -4781,8 +4781,8 @@ static inline void mm_account_fault(stru
|
||||
#ifdef CONFIG_LRU_GEN
|
||||
static void lru_gen_enter_fault(struct vm_area_struct *vma)
|
||||
{
|
||||
- /* the LRU algorithm doesn't apply to sequential or random reads */
|
||||
- current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
|
||||
+ /* the LRU algorithm only applies to accesses with recency */
|
||||
+ current->in_lru_fault = vma_has_recency(vma);
|
||||
}
|
||||
|
||||
static void lru_gen_exit_fault(void)
|
||||
--- a/mm/rmap.c
|
||||
+++ b/mm/rmap.c
|
||||
@@ -794,25 +794,14 @@ static bool page_referenced_one(struct p
|
||||
}
|
||||
|
||||
if (pvmw.pte) {
|
||||
- if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
|
||||
- !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
|
||||
+ if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
|
||||
lru_gen_look_around(&pvmw);
|
||||
referenced++;
|
||||
}
|
||||
|
||||
if (ptep_clear_flush_young_notify(vma, address,
|
||||
- pvmw.pte)) {
|
||||
- /*
|
||||
- * Don't treat a reference through
|
||||
- * a sequentially read mapping as such.
|
||||
- * If the page has been used in another mapping,
|
||||
- * we will catch it; if this other mapping is
|
||||
- * already gone, the unmap path will have set
|
||||
- * PG_referenced or activated the page.
|
||||
- */
|
||||
- if (likely(!(vma->vm_flags & VM_SEQ_READ)))
|
||||
- referenced++;
|
||||
- }
|
||||
+ pvmw.pte))
|
||||
+ referenced++;
|
||||
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
|
||||
if (pmdp_clear_flush_young_notify(vma, address,
|
||||
pvmw.pmd))
|
||||
@@ -846,7 +835,20 @@ static bool invalid_page_referenced_vma(
|
||||
struct page_referenced_arg *pra = arg;
|
||||
struct mem_cgroup *memcg = pra->memcg;
|
||||
|
||||
- if (!mm_match_cgroup(vma->vm_mm, memcg))
|
||||
+ /*
|
||||
+ * Ignore references from this mapping if it has no recency. If the
|
||||
+ * page has been used in another mapping, we will catch it; if this
|
||||
+ * other mapping is already gone, the unmap path will have set the
|
||||
+ * referenced flag or activated the page in zap_pte_range().
|
||||
+ */
|
||||
+ if (!vma_has_recency(vma))
|
||||
+ return true;
|
||||
+
|
||||
+ /*
|
||||
+ * If we are reclaiming on behalf of a cgroup, skip counting on behalf
|
||||
+ * of references from different cgroups.
|
||||
+ */
|
||||
+ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
@@ -876,6 +878,7 @@ int page_referenced(struct page *page,
|
||||
.rmap_one = page_referenced_one,
|
||||
.arg = (void *)&pra,
|
||||
.anon_lock = page_lock_anon_vma_read,
|
||||
+ .invalid_vma = invalid_page_referenced_vma,
|
||||
};
|
||||
|
||||
*vm_flags = 0;
|
||||
@@ -891,15 +894,6 @@ int page_referenced(struct page *page,
|
||||
return 1;
|
||||
}
|
||||
|
||||
- /*
|
||||
- * If we are reclaiming on behalf of a cgroup, skip
|
||||
- * counting on behalf of references from different
|
||||
- * cgroups
|
||||
- */
|
||||
- if (memcg) {
|
||||
- rwc.invalid_vma = invalid_page_referenced_vma;
|
||||
- }
|
||||
-
|
||||
rmap_walk(page, &rwc);
|
||||
*vm_flags = pra.vm_flags;
|
||||
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -3486,7 +3486,10 @@ static int should_skip_vma(unsigned long
|
||||
if (is_vm_hugetlb_page(vma))
|
||||
return true;
|
||||
|
||||
- if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
|
||||
+ if (!vma_has_recency(vma))
|
||||
+ return true;
|
||||
+
|
||||
+ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL))
|
||||
return true;
|
||||
|
||||
if (vma == get_gate_vma(vma->vm_mm))
|
|
@@ -0,0 +1,125 @@
|
|||
From 686c3d4f71de9e0e7a27f03a5617a712385f90cd Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Fri, 30 Dec 2022 14:52:52 -0700
|
||||
Subject: [PATCH 20/29] mm: support POSIX_FADV_NOREUSE
|
||||
|
||||
This patch adds POSIX_FADV_NOREUSE to vma_has_recency() so that the LRU
|
||||
algorithm can ignore access to mapped files marked by this flag.
|
||||
|
||||
The advantages of POSIX_FADV_NOREUSE are:
|
||||
1. Unlike MADV_SEQUENTIAL and MADV_RANDOM, it does not alter the
|
||||
default readahead behavior.
|
||||
2. Unlike MADV_SEQUENTIAL and MADV_RANDOM, it does not split VMAs and
|
||||
therefore does not take mmap_lock.
|
||||
3. Unlike MADV_COLD, setting it has a negligible cost, regardless of
|
||||
how many pages it affects.
|
||||
|
||||
Its limitations are:
|
||||
1. Like POSIX_FADV_RANDOM and POSIX_FADV_SEQUENTIAL, it currently does
|
||||
not support range. IOW, its scope is the entire file.
|
||||
2. It currently does not ignore access through file descriptors.
|
||||
Specifically, for the active/inactive LRU, given a file page shared
|
||||
by two users and one of them having set POSIX_FADV_NOREUSE on the
|
||||
file, this page will be activated upon the second user accessing
|
||||
it. This corner case can be covered by checking POSIX_FADV_NOREUSE
|
||||
before calling mark_page_accessed() on the read path. But it is
|
||||
considered not worth the effort.
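
To make the intended usage concrete, a minimal user-space sketch of opting a
whole file into POSIX_FADV_NOREUSE before mapping it; the path and mapping
size are illustrative and error handling is abbreviated:

  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/mman.h>
  #include <unistd.h>

  int main(void)
  {
      int fd = open("/mnt/video.y4m", O_RDONLY);  /* hypothetical file */
      int err;

      if (fd < 0) {
          perror("open");
          return 1;
      }

      /* offset == 0 and len == 0 cover the whole file, matching the
         "entire file" scope noted in limitation 1 above */
      err = posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);
      if (err)
          fprintf(stderr, "posix_fadvise: %s\n", strerror(err));

      /* with FMODE_NOREUSE set on the open file, vma_has_recency() returns
         false for mappings of it, so the LRU ignores the accessed bit coming
         from this mapping */
      void *p = mmap(NULL, 1 << 20, PROT_READ, MAP_PRIVATE, fd, 0);
      if (p != MAP_FAILED)
          munmap(p, 1 << 20);

      close(fd);
      return 0;
  }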
|
||||
|
||||
There have been a few attempts to support POSIX_FADV_NOREUSE, e.g., [1].
|
||||
This time the goal is to fill a niche: a few desktop applications, e.g.,
|
||||
large file transferring and video encoding/decoding, want fast file
|
||||
streaming with mmap() rather than direct IO. Among those applications, an
|
||||
SVT-AV1 regression was reported when running with MGLRU [2]. The
|
||||
following test can reproduce that regression.
|
||||
|
||||
kb=$(awk '/MemTotal/ { print $2 }' /proc/meminfo)
|
||||
kb=$((kb - 8*1024*1024))
|
||||
|
||||
modprobe brd rd_nr=1 rd_size=$kb
|
||||
dd if=/dev/zero of=/dev/ram0 bs=1M
|
||||
|
||||
mkfs.ext4 /dev/ram0
|
||||
mount /dev/ram0 /mnt/
|
||||
swapoff -a
|
||||
|
||||
fallocate -l 8G /mnt/swapfile
|
||||
mkswap /mnt/swapfile
|
||||
swapon /mnt/swapfile
|
||||
|
||||
wget http://ultravideo.cs.tut.fi/video/Bosphorus_3840x2160_120fps_420_8bit_YUV_Y4M.7z
|
||||
7z e -o/mnt/ Bosphorus_3840x2160_120fps_420_8bit_YUV_Y4M.7z
|
||||
SvtAv1EncApp --preset 12 -w 3840 -h 2160 \
|
||||
-i /mnt/Bosphorus_3840x2160.y4m
|
||||
|
||||
For MGLRU, the following change showed a [9-11]% increase in FPS,
|
||||
which makes it on par with the active/inactive LRU.
|
||||
|
||||
patch Source/App/EncApp/EbAppMain.c <<EOF
|
||||
31a32
|
||||
> #include <fcntl.h>
|
||||
35d35
|
||||
< #include <fcntl.h> /* _O_BINARY */
|
||||
117a118
|
||||
> posix_fadvise(config->mmap.fd, 0, 0, POSIX_FADV_NOREUSE);
|
||||
EOF
|
||||
|
||||
[1] https://lore.kernel.org/r/1308923350-7932-1-git-send-email-andrea@betterlinux.com/
|
||||
[2] https://openbenchmarking.org/result/2209259-PTS-MGLRU8GB57
|
||||
|
||||
Link: https://lkml.kernel.org/r/20221230215252.2628425-2-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
|
||||
Cc: Andrea Righi <andrea.righi@canonical.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/fs.h | 2 ++
|
||||
include/linux/mm_inline.h | 3 +++
|
||||
mm/fadvise.c | 5 ++++-
|
||||
3 files changed, 9 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/include/linux/fs.h
|
||||
+++ b/include/linux/fs.h
|
||||
@@ -167,6 +167,8 @@ typedef int (dio_iodone_t)(struct kiocb
|
||||
/* File is stream-like */
|
||||
#define FMODE_STREAM ((__force fmode_t)0x200000)
|
||||
|
||||
+#define FMODE_NOREUSE ((__force fmode_t)0x400000)
|
||||
+
|
||||
/* File was opened by fanotify and shouldn't generate fanotify events */
|
||||
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
|
||||
|
||||
--- a/include/linux/mm_inline.h
|
||||
+++ b/include/linux/mm_inline.h
|
||||
@@ -339,6 +339,9 @@ static inline bool vma_has_recency(struc
|
||||
if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))
|
||||
return false;
|
||||
|
||||
+ if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE))
|
||||
+ return false;
|
||||
+
|
||||
return true;
|
||||
}
|
||||
|
||||
--- a/mm/fadvise.c
|
||||
+++ b/mm/fadvise.c
|
||||
@@ -80,7 +80,7 @@ int generic_fadvise(struct file *file, l
|
||||
case POSIX_FADV_NORMAL:
|
||||
file->f_ra.ra_pages = bdi->ra_pages;
|
||||
spin_lock(&file->f_lock);
|
||||
- file->f_mode &= ~FMODE_RANDOM;
|
||||
+ file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE);
|
||||
spin_unlock(&file->f_lock);
|
||||
break;
|
||||
case POSIX_FADV_RANDOM:
|
||||
@@ -107,6 +107,9 @@ int generic_fadvise(struct file *file, l
|
||||
force_page_cache_readahead(mapping, file, start_index, nrpages);
|
||||
break;
|
||||
case POSIX_FADV_NOREUSE:
|
||||
+ spin_lock(&file->f_lock);
|
||||
+ file->f_mode |= FMODE_NOREUSE;
|
||||
+ spin_unlock(&file->f_lock);
|
||||
break;
|
||||
case POSIX_FADV_DONTNEED:
|
||||
if (!inode_write_congested(mapping->host))
|
|
@@ -0,0 +1,348 @@
|
|||
From 348fdbada9fb3f0bf1a53651be46319105af187f Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Wed, 21 Dec 2022 21:18:59 -0700
|
||||
Subject: [PATCH 21/29] mm: multi-gen LRU: rename lru_gen_struct to
|
||||
lru_gen_page
|
||||
|
||||
Patch series "mm: multi-gen LRU: memcg LRU", v3.
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs,
|
||||
since each node and memcg combination has an LRU of pages (see
|
||||
mem_cgroup_lruvec()).
|
||||
|
||||
Its goal is to improve the scalability of global reclaim, which is
|
||||
critical to system-wide memory overcommit in data centers. Note that
|
||||
memcg reclaim is currently out of scope.
|
||||
|
||||
Its memory bloat is a pointer to each lruvec and negligible to each
|
||||
pglist_data. In terms of traversing memcgs during global reclaim, it
|
||||
improves the best-case complexity from O(n) to O(1) and does not affect
|
||||
the worst-case complexity O(n). Therefore, on average, it has a sublinear
|
||||
complexity in contrast to the current linear complexity.
|
||||
|
||||
The basic structure of an memcg LRU can be understood by an analogy to
|
||||
the active/inactive LRU (of pages):
|
||||
1. It has the young and the old (generations), i.e., the counterparts
|
||||
to the active and the inactive;
|
||||
2. The increment of max_seq triggers promotion, i.e., the counterpart
|
||||
to activation;
|
||||
3. Other events trigger similar operations, e.g., offlining an memcg
|
||||
triggers demotion, i.e., the counterpart to deactivation.
|
||||
|
||||
In terms of global reclaim, it has two distinct features:
|
||||
1. Sharding, which allows each thread to start at a random memcg (in
|
||||
the old generation) and improves parallelism;
|
||||
2. Eventual fairness, which allows direct reclaim to bail out at will
|
||||
and reduces latency without affecting fairness over some time.
|
||||
|
||||
The commit message in patch 6 details the workflow:
|
||||
https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com/
|
||||
|
||||
The following is a simple test to quickly verify its effectiveness.
|
||||
|
||||
Test design:
|
||||
1. Create multiple memcgs.
|
||||
2. Each memcg contains a job (fio).
|
||||
3. All jobs access the same amount of memory randomly.
|
||||
4. The system does not experience global memory pressure.
|
||||
5. Periodically write to the root memory.reclaim.
|
||||
|
||||
Desired outcome:
|
||||
1. All memcgs have similar pgsteal counts, i.e., stddev(pgsteal)
|
||||
over mean(pgsteal) is close to 0%.
|
||||
2. The total pgsteal is close to the total requested through
|
||||
memory.reclaim, i.e., sum(pgsteal) over sum(requested) is close
|
||||
to 100%.
|
||||
|
||||
Actual outcome [1]:
|
||||
                                        MGLRU off      MGLRU on
  stddev(pgsteal) / mean(pgsteal)       75%            20%
  sum(pgsteal) / sum(requested)         425%           95%
|
||||
|
||||
####################################################################
|
||||
MEMCGS=128
|
||||
|
||||
for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
|
||||
mkdir /sys/fs/cgroup/memcg$memcg
|
||||
done
|
||||
|
||||
start() {
|
||||
echo $BASHPID > /sys/fs/cgroup/memcg$memcg/cgroup.procs
|
||||
|
||||
fio -name=memcg$memcg --numjobs=1 --ioengine=mmap \
|
||||
--filename=/dev/zero --size=1920M --rw=randrw \
|
||||
--rate=64m,64m --random_distribution=random \
|
||||
--fadvise_hint=0 --time_based --runtime=10h \
|
||||
--group_reporting --minimal
|
||||
}
|
||||
|
||||
for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
|
||||
start &
|
||||
done
|
||||
|
||||
sleep 600
|
||||
|
||||
for ((i = 0; i < 600; i++)); do
|
||||
echo 256m >/sys/fs/cgroup/memory.reclaim
|
||||
sleep 6
|
||||
done
|
||||
|
||||
for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
|
||||
grep "pgsteal " /sys/fs/cgroup/memcg$memcg/memory.stat
|
||||
done
|
||||
####################################################################
|
||||
|
||||
[1]: This was obtained from running the above script (touches less
|
||||
than 256GB memory) on an EPYC 7B13 with 512GB DRAM for over an
|
||||
hour.
|
||||
|
||||
This patch (of 8):
|
||||
|
||||
The new name lru_gen_page will be more distinct from the coming
|
||||
lru_gen_memcg.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20221222041905.2431096-1-yuzhao@google.com
|
||||
Link: https://lkml.kernel.org/r/20221222041905.2431096-2-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Roman Gushchin <roman.gushchin@linux.dev>
|
||||
Cc: Suren Baghdasaryan <surenb@google.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/mm_inline.h | 4 ++--
|
||||
include/linux/mmzone.h | 6 +++---
|
||||
mm/vmscan.c | 34 +++++++++++++++++-----------------
|
||||
mm/workingset.c | 4 ++--
|
||||
4 files changed, 24 insertions(+), 24 deletions(-)
|
||||
|
||||
--- a/include/linux/mm_inline.h
|
||||
+++ b/include/linux/mm_inline.h
|
||||
@@ -168,7 +168,7 @@ static inline void lru_gen_update_size(s
|
||||
int zone = page_zonenum(page);
|
||||
int delta = thp_nr_pages(page);
|
||||
enum lru_list lru = type * LRU_INACTIVE_FILE;
|
||||
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
|
||||
VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
|
||||
VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
|
||||
@@ -214,7 +214,7 @@ static inline bool lru_gen_add_page(stru
|
||||
int gen = page_lru_gen(page);
|
||||
int type = page_is_file_lru(page);
|
||||
int zone = page_zonenum(page);
|
||||
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
|
||||
VM_WARN_ON_ONCE_PAGE(gen != -1, page);
|
||||
|
||||
--- a/include/linux/mmzone.h
|
||||
+++ b/include/linux/mmzone.h
|
||||
@@ -394,7 +394,7 @@ enum {
|
||||
* The number of pages in each generation is eventually consistent and therefore
|
||||
* can be transiently negative when reset_batch_size() is pending.
|
||||
*/
|
||||
-struct lru_gen_struct {
|
||||
+struct lru_gen_page {
|
||||
/* the aging increments the youngest generation number */
|
||||
unsigned long max_seq;
|
||||
/* the eviction increments the oldest generation numbers */
|
||||
@@ -451,7 +451,7 @@ struct lru_gen_mm_state {
|
||||
struct lru_gen_mm_walk {
|
||||
/* the lruvec under reclaim */
|
||||
struct lruvec *lruvec;
|
||||
- /* unstable max_seq from lru_gen_struct */
|
||||
+ /* unstable max_seq from lru_gen_page */
|
||||
unsigned long max_seq;
|
||||
/* the next address within an mm to scan */
|
||||
unsigned long next_addr;
|
||||
@@ -514,7 +514,7 @@ struct lruvec {
|
||||
unsigned long flags;
|
||||
#ifdef CONFIG_LRU_GEN
|
||||
/* evictable pages divided into generations */
|
||||
- struct lru_gen_struct lrugen;
|
||||
+ struct lru_gen_page lrugen;
|
||||
/* to concurrently iterate lru_gen_mm_list */
|
||||
struct lru_gen_mm_state mm_state;
|
||||
#endif
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -2910,7 +2910,7 @@ static int get_nr_gens(struct lruvec *lr
|
||||
|
||||
static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
|
||||
{
|
||||
- /* see the comment on lru_gen_struct */
|
||||
+ /* see the comment on lru_gen_page */
|
||||
return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
|
||||
get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
|
||||
get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
|
||||
@@ -3316,7 +3316,7 @@ struct ctrl_pos {
|
||||
static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
|
||||
struct ctrl_pos *pos)
|
||||
{
|
||||
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
|
||||
|
||||
pos->refaulted = lrugen->avg_refaulted[type][tier] +
|
||||
@@ -3331,7 +3331,7 @@ static void read_ctrl_pos(struct lruvec
|
||||
static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
|
||||
{
|
||||
int hist, tier;
|
||||
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
|
||||
unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
|
||||
|
||||
@@ -3408,7 +3408,7 @@ static int page_update_gen(struct page *
|
||||
static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
||||
{
|
||||
int type = page_is_file_lru(page);
|
||||
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
|
||||
unsigned long new_flags, old_flags = READ_ONCE(page->flags);
|
||||
|
||||
@@ -3453,7 +3453,7 @@ static void update_batch_size(struct lru
|
||||
static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
|
||||
{
|
||||
int gen, type, zone;
|
||||
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
|
||||
walk->batched = 0;
|
||||
|
||||
@@ -3979,7 +3979,7 @@ static bool inc_min_seq(struct lruvec *l
|
||||
{
|
||||
int zone;
|
||||
int remaining = MAX_LRU_BATCH;
|
||||
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
|
||||
|
||||
if (type == LRU_GEN_ANON && !can_swap)
|
||||
@@ -4015,7 +4015,7 @@ static bool try_to_inc_min_seq(struct lr
|
||||
{
|
||||
int gen, type, zone;
|
||||
bool success = false;
|
||||
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
DEFINE_MIN_SEQ(lruvec);
|
||||
|
||||
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
|
||||
@@ -4036,7 +4036,7 @@ next:
|
||||
;
|
||||
}
|
||||
|
||||
- /* see the comment on lru_gen_struct */
|
||||
+ /* see the comment on lru_gen_page */
|
||||
if (can_swap) {
|
||||
min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
|
||||
min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
|
||||
@@ -4058,7 +4058,7 @@ static void inc_max_seq(struct lruvec *l
|
||||
{
|
||||
int prev, next;
|
||||
int type, zone;
|
||||
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
|
||||
spin_lock_irq(&lruvec->lru_lock);
|
||||
|
||||
@@ -4116,7 +4116,7 @@ static bool try_to_inc_max_seq(struct lr
|
||||
bool success;
|
||||
struct lru_gen_mm_walk *walk;
|
||||
struct mm_struct *mm = NULL;
|
||||
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
|
||||
VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
|
||||
|
||||
@@ -4181,7 +4181,7 @@ static bool should_run_aging(struct lruv
|
||||
unsigned long old = 0;
|
||||
unsigned long young = 0;
|
||||
unsigned long total = 0;
|
||||
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
|
||||
for (type = !can_swap; type < ANON_AND_FILE; type++) {
|
||||
@@ -4466,7 +4466,7 @@ static bool sort_page(struct lruvec *lru
|
||||
int delta = thp_nr_pages(page);
|
||||
int refs = page_lru_refs(page);
|
||||
int tier = lru_tier_from_refs(refs);
|
||||
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
|
||||
VM_WARN_ON_ONCE_PAGE(gen >= MAX_NR_GENS, page);
|
||||
|
||||
@@ -4566,7 +4566,7 @@ static int scan_pages(struct lruvec *lru
|
||||
int scanned = 0;
|
||||
int isolated = 0;
|
||||
int remaining = MAX_LRU_BATCH;
|
||||
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
|
||||
VM_WARN_ON_ONCE(!list_empty(list));
|
||||
@@ -4967,7 +4967,7 @@ done:
|
||||
|
||||
static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
|
||||
{
|
||||
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
|
||||
if (lrugen->enabled) {
|
||||
enum lru_list lru;
|
||||
@@ -5247,7 +5247,7 @@ static void lru_gen_seq_show_full(struct
|
||||
int i;
|
||||
int type, tier;
|
||||
int hist = lru_hist_from_seq(seq);
|
||||
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
|
||||
for (tier = 0; tier < MAX_NR_TIERS; tier++) {
|
||||
seq_printf(m, " %10d", tier);
|
||||
@@ -5296,7 +5296,7 @@ static int lru_gen_seq_show(struct seq_f
|
||||
unsigned long seq;
|
||||
bool full = !debugfs_real_fops(m->file)->write;
|
||||
struct lruvec *lruvec = v;
|
||||
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
int nid = lruvec_pgdat(lruvec)->node_id;
|
||||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
DEFINE_MAX_SEQ(lruvec);
|
||||
@@ -5549,7 +5549,7 @@ void lru_gen_init_lruvec(struct lruvec *
|
||||
{
|
||||
int i;
|
||||
int gen, type, zone;
|
||||
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
|
||||
lrugen->max_seq = MIN_NR_GENS + 1;
|
||||
lrugen->enabled = lru_gen_enabled();
|
||||
--- a/mm/workingset.c
|
||||
+++ b/mm/workingset.c
|
||||
@@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct pag
|
||||
unsigned long token;
|
||||
unsigned long min_seq;
|
||||
struct lruvec *lruvec;
|
||||
- struct lru_gen_struct *lrugen;
|
||||
+ struct lru_gen_page *lrugen;
|
||||
int type = page_is_file_lru(page);
|
||||
int delta = thp_nr_pages(page);
|
||||
int refs = page_lru_refs(page);
|
||||
@@ -252,7 +252,7 @@ static void lru_gen_refault(struct page
|
||||
unsigned long token;
|
||||
unsigned long min_seq;
|
||||
struct lruvec *lruvec;
|
||||
- struct lru_gen_struct *lrugen;
|
||||
+ struct lru_gen_page *lrugen;
|
||||
struct mem_cgroup *memcg;
|
||||
struct pglist_data *pgdat;
|
||||
int type = page_is_file_lru(page);
|
|
@@ -0,0 +1,162 @@
|
|||
From afd37e73db04c7e6b47411120ac5f6a7eca51fec Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Wed, 21 Dec 2022 21:19:00 -0700
|
||||
Subject: [PATCH 22/29] mm: multi-gen LRU: rename lrugen->lists[] to
|
||||
lrugen->pages[]
|
||||
|
||||
lru_gen_page will be chained into per-node lists by the coming
|
||||
lrugen->list.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20221222041905.2431096-3-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Roman Gushchin <roman.gushchin@linux.dev>
|
||||
Cc: Suren Baghdasaryan <surenb@google.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/mm_inline.h | 4 ++--
|
||||
include/linux/mmzone.h | 8 ++++----
|
||||
mm/vmscan.c | 20 ++++++++++----------
|
||||
3 files changed, 16 insertions(+), 16 deletions(-)
|
||||
|
||||
--- a/include/linux/mm_inline.h
|
||||
+++ b/include/linux/mm_inline.h
|
||||
@@ -246,9 +246,9 @@ static inline bool lru_gen_add_page(stru
|
||||
lru_gen_update_size(lruvec, page, -1, gen);
|
||||
/* for rotate_reclaimable_page() */
|
||||
if (reclaiming)
|
||||
- list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
|
||||
+ list_add_tail(&page->lru, &lrugen->pages[gen][type][zone]);
|
||||
else
|
||||
- list_add(&page->lru, &lrugen->lists[gen][type][zone]);
|
||||
+ list_add(&page->lru, &lrugen->pages[gen][type][zone]);
|
||||
|
||||
return true;
|
||||
}
|
||||
--- a/include/linux/mmzone.h
|
||||
+++ b/include/linux/mmzone.h
|
||||
@@ -302,7 +302,7 @@ enum lruvec_flags {
|
||||
* They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
|
||||
* offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
|
||||
* corresponding generation. The gen counter in page->flags stores gen+1 while
|
||||
- * a page is on one of lrugen->lists[]. Otherwise it stores 0.
|
||||
+ * a page is on one of lrugen->pages[]. Otherwise it stores 0.
|
||||
*
|
||||
* A page is added to the youngest generation on faulting. The aging needs to
|
||||
* check the accessed bit at least twice before handing this page over to the
|
||||
@@ -314,8 +314,8 @@ enum lruvec_flags {
|
||||
* rest of generations, if they exist, are considered inactive. See
|
||||
* lru_gen_is_active().
|
||||
*
|
||||
- * PG_active is always cleared while a page is on one of lrugen->lists[] so that
|
||||
- * the aging needs not to worry about it. And it's set again when a page
|
||||
+ * PG_active is always cleared while a page is on one of lrugen->pages[] so
|
||||
+ * that the aging needs not to worry about it. And it's set again when a page
|
||||
* considered active is isolated for non-reclaiming purposes, e.g., migration.
|
||||
* See lru_gen_add_page() and lru_gen_del_page().
|
||||
*
|
||||
@@ -402,7 +402,7 @@ struct lru_gen_page {
|
||||
/* the birth time of each generation in jiffies */
|
||||
unsigned long timestamps[MAX_NR_GENS];
|
||||
/* the multi-gen LRU lists, lazily sorted on eviction */
|
||||
- struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
||||
+ struct list_head pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
||||
/* the multi-gen LRU sizes, eventually consistent */
|
||||
long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
||||
/* the exponential moving average of refaulted */
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -3987,7 +3987,7 @@ static bool inc_min_seq(struct lruvec *l
|
||||
|
||||
/* prevent cold/hot inversion if force_scan is true */
|
||||
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
|
||||
- struct list_head *head = &lrugen->lists[old_gen][type][zone];
|
||||
+ struct list_head *head = &lrugen->pages[old_gen][type][zone];
|
||||
|
||||
while (!list_empty(head)) {
|
||||
struct page *page = lru_to_page(head);
|
||||
@@ -3998,7 +3998,7 @@ static bool inc_min_seq(struct lruvec *l
|
||||
VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);
|
||||
|
||||
new_gen = page_inc_gen(lruvec, page, false);
|
||||
- list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]);
|
||||
+ list_move_tail(&page->lru, &lrugen->pages[new_gen][type][zone]);
|
||||
|
||||
if (!--remaining)
|
||||
return false;
|
||||
@@ -4026,7 +4026,7 @@ static bool try_to_inc_min_seq(struct lr
|
||||
gen = lru_gen_from_seq(min_seq[type]);
|
||||
|
||||
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
|
||||
- if (!list_empty(&lrugen->lists[gen][type][zone]))
|
||||
+ if (!list_empty(&lrugen->pages[gen][type][zone]))
|
||||
goto next;
|
||||
}
|
||||
|
||||
@@ -4491,7 +4491,7 @@ static bool sort_page(struct lruvec *lru
|
||||
|
||||
/* promoted */
|
||||
if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
|
||||
- list_move(&page->lru, &lrugen->lists[gen][type][zone]);
|
||||
+ list_move(&page->lru, &lrugen->pages[gen][type][zone]);
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -4500,7 +4500,7 @@ static bool sort_page(struct lruvec *lru
|
||||
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
|
||||
|
||||
gen = page_inc_gen(lruvec, page, false);
|
||||
- list_move_tail(&page->lru, &lrugen->lists[gen][type][zone]);
|
||||
+ list_move_tail(&page->lru, &lrugen->pages[gen][type][zone]);
|
||||
|
||||
WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
|
||||
lrugen->protected[hist][type][tier - 1] + delta);
|
||||
@@ -4512,7 +4512,7 @@ static bool sort_page(struct lruvec *lru
|
||||
if (PageLocked(page) || PageWriteback(page) ||
|
||||
(type == LRU_GEN_FILE && PageDirty(page))) {
|
||||
gen = page_inc_gen(lruvec, page, true);
|
||||
- list_move(&page->lru, &lrugen->lists[gen][type][zone]);
|
||||
+ list_move(&page->lru, &lrugen->pages[gen][type][zone]);
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -4579,7 +4579,7 @@ static int scan_pages(struct lruvec *lru
|
||||
for (zone = sc->reclaim_idx; zone >= 0; zone--) {
|
||||
LIST_HEAD(moved);
|
||||
int skipped = 0;
|
||||
- struct list_head *head = &lrugen->lists[gen][type][zone];
|
||||
+ struct list_head *head = &lrugen->pages[gen][type][zone];
|
||||
|
||||
while (!list_empty(head)) {
|
||||
struct page *page = lru_to_page(head);
|
||||
@@ -4980,7 +4980,7 @@ static bool __maybe_unused state_is_vali
|
||||
int gen, type, zone;
|
||||
|
||||
for_each_gen_type_zone(gen, type, zone) {
|
||||
- if (!list_empty(&lrugen->lists[gen][type][zone]))
|
||||
+ if (!list_empty(&lrugen->pages[gen][type][zone]))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -5025,7 +5025,7 @@ static bool drain_evictable(struct lruve
|
||||
int remaining = MAX_LRU_BATCH;
|
||||
|
||||
for_each_gen_type_zone(gen, type, zone) {
|
||||
- struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
|
||||
+ struct list_head *head = &lruvec->lrugen.pages[gen][type][zone];
|
||||
|
||||
while (!list_empty(head)) {
|
||||
bool success;
|
||||
@@ -5558,7 +5558,7 @@ void lru_gen_init_lruvec(struct lruvec *
|
||||
lrugen->timestamps[i] = jiffies;
|
||||
|
||||
for_each_gen_type_zone(gen, type, zone)
|
||||
- INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
|
||||
+ INIT_LIST_HEAD(&lrugen->pages[gen][type][zone]);
|
||||
|
||||
lruvec->mm_state.seq = MIN_NR_GENS;
|
||||
init_waitqueue_head(&lruvec->mm_state.wait);
|
|
@@ -0,0 +1,188 @@
|
|||
From ce45f1c4b32cf69b166f56ef5bc6c761e06ed4e5 Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Wed, 21 Dec 2022 21:19:01 -0700
|
||||
Subject: [PATCH 23/29] mm: multi-gen LRU: remove eviction fairness safeguard
|
||||
|
||||
Recall that the eviction consumes the oldest generation: first it
|
||||
bucket-sorts pages whose gen counters were updated by the aging and
|
||||
reclaims the rest; then it increments lrugen->min_seq.
|
||||
|
||||
The current eviction fairness safeguard for global reclaim has a
|
||||
dilemma: when there are multiple eligible memcgs, should it continue
|
||||
or stop upon meeting the reclaim goal? If it continues, it overshoots
|
||||
and increases direct reclaim latency; if it stops, it loses fairness
|
||||
between memcgs it has taken memory away from and those it has yet to.
|
||||
|
||||
With memcg LRU, the eviction, while ensuring eventual fairness, will
|
||||
stop upon meeting its goal. Therefore the current eviction fairness
|
||||
safeguard for global reclaim will not be needed.
|
||||
|
||||
Note that memcg LRU only applies to global reclaim. For memcg reclaim,
|
||||
the eviction will continue, even if it is overshooting. This becomes
|
||||
unconditional due to code simplification.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20221222041905.2431096-4-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Roman Gushchin <roman.gushchin@linux.dev>
|
||||
Cc: Suren Baghdasaryan <surenb@google.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/vmscan.c | 82 +++++++++++++++--------------------------------------
|
||||
1 file changed, 23 insertions(+), 59 deletions(-)
|
||||
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -443,6 +443,11 @@ static bool cgroup_reclaim(struct scan_c
|
||||
return sc->target_mem_cgroup;
|
||||
}
|
||||
|
||||
+static bool global_reclaim(struct scan_control *sc)
|
||||
+{
|
||||
+ return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup);
|
||||
+}
|
||||
+
|
||||
/**
|
||||
* writeback_throttling_sane - is the usual dirty throttling mechanism available?
|
||||
* @sc: scan_control in question
|
||||
@@ -493,6 +498,11 @@ static bool cgroup_reclaim(struct scan_c
|
||||
return false;
|
||||
}
|
||||
|
||||
+static bool global_reclaim(struct scan_control *sc)
|
||||
+{
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
static bool writeback_throttling_sane(struct scan_control *sc)
|
||||
{
|
||||
return true;
|
||||
@@ -4722,8 +4732,7 @@ static int isolate_pages(struct lruvec *
|
||||
return scanned;
|
||||
}
|
||||
|
||||
-static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
|
||||
- bool *need_swapping)
|
||||
+static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
|
||||
{
|
||||
int type;
|
||||
int scanned;
|
||||
@@ -4812,9 +4821,6 @@ retry:
|
||||
goto retry;
|
||||
}
|
||||
|
||||
- if (need_swapping && type == LRU_GEN_ANON)
|
||||
- *need_swapping = true;
|
||||
-
|
||||
return scanned;
|
||||
}
|
||||
|
||||
@@ -4853,68 +4859,26 @@ done:
|
||||
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
|
||||
}
|
||||
|
||||
-static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
|
||||
- struct scan_control *sc, bool need_swapping)
|
||||
+static unsigned long get_nr_to_reclaim(struct scan_control *sc)
|
||||
{
|
||||
- int i;
|
||||
- DEFINE_MAX_SEQ(lruvec);
|
||||
-
|
||||
- if (!current_is_kswapd()) {
|
||||
- /* age each memcg once to ensure fairness */
|
||||
- if (max_seq - seq > 1)
|
||||
- return true;
|
||||
-
|
||||
- /* over-swapping can increase allocation latency */
|
||||
- if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
|
||||
- return true;
|
||||
-
|
||||
- /* give this thread a chance to exit and free its memory */
|
||||
- if (fatal_signal_pending(current)) {
|
||||
- sc->nr_reclaimed += MIN_LRU_BATCH;
|
||||
- return true;
|
||||
- }
|
||||
-
|
||||
- if (cgroup_reclaim(sc))
|
||||
- return false;
|
||||
- } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
|
||||
- return false;
|
||||
-
|
||||
- /* keep scanning at low priorities to ensure fairness */
|
||||
- if (sc->priority > DEF_PRIORITY - 2)
|
||||
- return false;
|
||||
-
|
||||
- /*
|
||||
- * A minimum amount of work was done under global memory pressure. For
|
||||
- * kswapd, it may be overshooting. For direct reclaim, the target isn't
|
||||
- * met, and yet the allocation may still succeed, since kswapd may have
|
||||
- * caught up. In either case, it's better to stop now, and restart if
|
||||
- * necessary.
|
||||
- */
|
||||
- for (i = 0; i <= sc->reclaim_idx; i++) {
|
||||
- unsigned long wmark;
|
||||
- struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
|
||||
-
|
||||
- if (!managed_zone(zone))
|
||||
- continue;
|
||||
-
|
||||
- wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
|
||||
- if (wmark > zone_page_state(zone, NR_FREE_PAGES))
|
||||
- return false;
|
||||
- }
|
||||
+ /* don't abort memcg reclaim to ensure fairness */
|
||||
+ if (!global_reclaim(sc))
|
||||
+ return -1;
|
||||
|
||||
- sc->nr_reclaimed += MIN_LRU_BATCH;
|
||||
+ /* discount the previous progress for kswapd */
|
||||
+ if (current_is_kswapd())
|
||||
+ return sc->nr_to_reclaim + sc->last_reclaimed;
|
||||
|
||||
- return true;
|
||||
+ return max(sc->nr_to_reclaim, compact_gap(sc->order));
|
||||
}
|
||||
|
||||
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
||||
{
|
||||
struct blk_plug plug;
|
||||
bool need_aging = false;
|
||||
- bool need_swapping = false;
|
||||
unsigned long scanned = 0;
|
||||
unsigned long reclaimed = sc->nr_reclaimed;
|
||||
- DEFINE_MAX_SEQ(lruvec);
|
||||
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
|
||||
|
||||
lru_add_drain();
|
||||
|
||||
@@ -4938,7 +4902,7 @@ static void lru_gen_shrink_lruvec(struct
|
||||
if (!nr_to_scan)
|
||||
goto done;
|
||||
|
||||
- delta = evict_pages(lruvec, sc, swappiness, &need_swapping);
|
||||
+ delta = evict_pages(lruvec, sc, swappiness);
|
||||
if (!delta)
|
||||
goto done;
|
||||
|
||||
@@ -4946,7 +4910,7 @@ static void lru_gen_shrink_lruvec(struct
|
||||
if (scanned >= nr_to_scan)
|
||||
break;
|
||||
|
||||
- if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
|
||||
+ if (sc->nr_reclaimed >= nr_to_reclaim)
|
||||
break;
|
||||
|
||||
cond_resched();
|
||||
@@ -5393,7 +5357,7 @@ static int run_eviction(struct lruvec *l
|
||||
if (sc->nr_reclaimed >= nr_to_reclaim)
|
||||
return 0;
|
||||
|
||||
- if (!evict_pages(lruvec, sc, swappiness, NULL))
|
||||
+ if (!evict_pages(lruvec, sc, swappiness))
|
||||
return 0;
|
||||
|
||||
cond_resched();
|
|
@@ -0,0 +1,287 @@
|
|||
From e20b7386fccc18c791796eb1dc1a91eee3ccf801 Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Wed, 21 Dec 2022 21:19:02 -0700
|
||||
Subject: [PATCH 24/29] mm: multi-gen LRU: remove aging fairness safeguard
|
||||
|
||||
Recall that the aging produces the youngest generation: first it scans
|
||||
for accessed pages and updates their gen counters; then it increments
|
||||
lrugen->max_seq.
|
||||
|
||||
The current aging fairness safeguard for kswapd uses two passes to
|
||||
ensure the fairness to multiple eligible memcgs. On the first pass,
|
||||
which is shared with the eviction, it checks whether all eligible
|
||||
memcgs are low on cold pages. If so, it requires a second pass, on
|
||||
which it ages all those memcgs at the same time.
|
||||
|
||||
With memcg LRU, the aging, while ensuring eventual fairness, will run
|
||||
when necessary. Therefore the current aging fairness safeguard for
|
||||
kswapd will not be needed.
|
||||
|
||||
Note that memcg LRU only applies to global reclaim. For memcg reclaim,
|
||||
the aging can be unfair to different memcgs, i.e., their
|
||||
lrugen->max_seq can be incremented at different paces.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20221222041905.2431096-5-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Roman Gushchin <roman.gushchin@linux.dev>
|
||||
Cc: Suren Baghdasaryan <surenb@google.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/vmscan.c | 126 ++++++++++++++++++++++++----------------------------
|
||||
1 file changed, 59 insertions(+), 67 deletions(-)
|
||||
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -131,7 +131,6 @@ struct scan_control {
|
||||
|
||||
#ifdef CONFIG_LRU_GEN
|
||||
/* help kswapd make better choices among multiple memcgs */
|
||||
- unsigned int memcgs_need_aging:1;
|
||||
unsigned long last_reclaimed;
|
||||
#endif
|
||||
|
||||
@@ -4184,7 +4183,7 @@ done:
|
||||
return true;
|
||||
}
|
||||
|
||||
-static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
|
||||
+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
|
||||
struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
|
||||
{
|
||||
int gen, type, zone;
|
||||
@@ -4193,6 +4192,13 @@ static bool should_run_aging(struct lruv
|
||||
unsigned long total = 0;
|
||||
struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
+ DEFINE_MIN_SEQ(lruvec);
|
||||
+
|
||||
+ /* whether this lruvec is completely out of cold pages */
|
||||
+ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
|
||||
+ *nr_to_scan = 0;
|
||||
+ return true;
|
||||
+ }
|
||||
|
||||
for (type = !can_swap; type < ANON_AND_FILE; type++) {
|
||||
unsigned long seq;
|
||||
@@ -4221,8 +4227,6 @@ static bool should_run_aging(struct lruv
|
||||
* stalls when the number of generations reaches MIN_NR_GENS. Hence, the
|
||||
* ideal number of generations is MIN_NR_GENS+1.
|
||||
*/
|
||||
- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
|
||||
- return true;
|
||||
if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
|
||||
return false;
|
||||
|
||||
@@ -4241,40 +4245,54 @@ static bool should_run_aging(struct lruv
|
||||
return false;
|
||||
}
|
||||
|
||||
-static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
|
||||
+static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
|
||||
{
|
||||
- bool need_aging;
|
||||
- unsigned long nr_to_scan;
|
||||
- int swappiness = get_swappiness(lruvec, sc);
|
||||
+ int gen, type, zone;
|
||||
+ unsigned long total = 0;
|
||||
+ bool can_swap = get_swappiness(lruvec, sc);
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
DEFINE_MAX_SEQ(lruvec);
|
||||
DEFINE_MIN_SEQ(lruvec);
|
||||
|
||||
- VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
|
||||
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
|
||||
+ unsigned long seq;
|
||||
|
||||
- mem_cgroup_calculate_protection(NULL, memcg);
|
||||
+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
|
||||
+ gen = lru_gen_from_seq(seq);
|
||||
|
||||
- if (mem_cgroup_below_min(memcg))
|
||||
- return false;
|
||||
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
|
||||
+ total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
|
||||
+ }
|
||||
+ }
|
||||
|
||||
- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
|
||||
+ /* whether the size is big enough to be helpful */
|
||||
+ return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
|
||||
+}
|
||||
|
||||
- if (min_ttl) {
|
||||
- int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
|
||||
- unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
|
||||
+static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
|
||||
+ unsigned long min_ttl)
|
||||
+{
|
||||
+ int gen;
|
||||
+ unsigned long birth;
|
||||
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
+ DEFINE_MIN_SEQ(lruvec);
|
||||
|
||||
- if (time_is_after_jiffies(birth + min_ttl))
|
||||
- return false;
|
||||
+ VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
|
||||
|
||||
- /* the size is likely too small to be helpful */
|
||||
- if (!nr_to_scan && sc->priority != DEF_PRIORITY)
|
||||
- return false;
|
||||
- }
|
||||
+ /* see the comment on lru_gen_page */
|
||||
+ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
|
||||
+ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
|
||||
|
||||
- if (need_aging)
|
||||
- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
|
||||
+ if (time_is_after_jiffies(birth + min_ttl))
|
||||
+ return false;
|
||||
|
||||
- return true;
|
||||
+ if (!lruvec_is_sizable(lruvec, sc))
|
||||
+ return false;
|
||||
+
|
||||
+ mem_cgroup_calculate_protection(NULL, memcg);
|
||||
+
|
||||
+ return !mem_cgroup_below_min(memcg);
|
||||
}
|
||||
|
||||
/* to protect the working set of the last N jiffies */
|
||||
@@ -4283,46 +4301,32 @@ static unsigned long lru_gen_min_ttl __r
|
||||
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
|
||||
{
|
||||
struct mem_cgroup *memcg;
|
||||
- bool success = false;
|
||||
unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
|
||||
|
||||
VM_WARN_ON_ONCE(!current_is_kswapd());
|
||||
|
||||
sc->last_reclaimed = sc->nr_reclaimed;
|
||||
|
||||
- /*
|
||||
- * To reduce the chance of going into the aging path, which can be
|
||||
- * costly, optimistically skip it if the flag below was cleared in the
|
||||
- * eviction path. This improves the overall performance when multiple
|
||||
- * memcgs are available.
|
||||
- */
|
||||
- if (!sc->memcgs_need_aging) {
|
||||
- sc->memcgs_need_aging = true;
|
||||
+ /* check the order to exclude compaction-induced reclaim */
|
||||
+ if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
|
||||
return;
|
||||
- }
|
||||
-
|
||||
- set_mm_walk(pgdat);
|
||||
|
||||
memcg = mem_cgroup_iter(NULL, NULL, NULL);
|
||||
do {
|
||||
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
|
||||
|
||||
- if (age_lruvec(lruvec, sc, min_ttl))
|
||||
- success = true;
|
||||
+ if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
|
||||
+ mem_cgroup_iter_break(NULL, memcg);
|
||||
+ return;
|
||||
+ }
|
||||
|
||||
cond_resched();
|
||||
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
|
||||
|
||||
- clear_mm_walk();
|
||||
-
|
||||
- /* check the order to exclude compaction-induced reclaim */
|
||||
- if (success || !min_ttl || sc->order)
|
||||
- return;
|
||||
-
|
||||
/*
|
||||
* The main goal is to OOM kill if every generation from all memcgs is
|
||||
* younger than min_ttl. However, another possibility is all memcgs are
|
||||
- * either below min or empty.
|
||||
+ * either too small or below min.
|
||||
*/
|
||||
if (mutex_trylock(&oom_lock)) {
|
||||
struct oom_control oc = {
|
||||
@@ -4830,33 +4834,27 @@ retry:
|
||||
* reclaim.
|
||||
*/
|
||||
static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
|
||||
- bool can_swap, bool *need_aging)
|
||||
+ bool can_swap)
|
||||
{
|
||||
unsigned long nr_to_scan;
|
||||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
DEFINE_MAX_SEQ(lruvec);
|
||||
- DEFINE_MIN_SEQ(lruvec);
|
||||
|
||||
if (mem_cgroup_below_min(memcg) ||
|
||||
(mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
|
||||
return 0;
|
||||
|
||||
- *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
|
||||
- if (!*need_aging)
|
||||
+ if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
|
||||
return nr_to_scan;
|
||||
|
||||
/* skip the aging path at the default priority */
|
||||
if (sc->priority == DEF_PRIORITY)
|
||||
- goto done;
|
||||
+ return nr_to_scan;
|
||||
|
||||
- /* leave the work to lru_gen_age_node() */
|
||||
- if (current_is_kswapd())
|
||||
- return 0;
|
||||
+ try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);
|
||||
|
||||
- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
|
||||
- return nr_to_scan;
|
||||
-done:
|
||||
- return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
|
||||
+ /* skip this lruvec as it's low on cold pages */
|
||||
+ return 0;
|
||||
}
|
||||
|
||||
static unsigned long get_nr_to_reclaim(struct scan_control *sc)
|
||||
@@ -4875,9 +4873,7 @@ static unsigned long get_nr_to_reclaim(s
|
||||
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
||||
{
|
||||
struct blk_plug plug;
|
||||
- bool need_aging = false;
|
||||
unsigned long scanned = 0;
|
||||
- unsigned long reclaimed = sc->nr_reclaimed;
|
||||
unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
|
||||
|
||||
lru_add_drain();
|
||||
@@ -4898,13 +4894,13 @@ static void lru_gen_shrink_lruvec(struct
|
||||
else
|
||||
swappiness = 0;
|
||||
|
||||
- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
|
||||
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
|
||||
if (!nr_to_scan)
|
||||
- goto done;
|
||||
+ break;
|
||||
|
||||
delta = evict_pages(lruvec, sc, swappiness);
|
||||
if (!delta)
|
||||
- goto done;
|
||||
+ break;
|
||||
|
||||
scanned += delta;
|
||||
if (scanned >= nr_to_scan)
|
||||
@@ -4916,10 +4912,6 @@ static void lru_gen_shrink_lruvec(struct
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
- /* see the comment in lru_gen_age_node() */
|
||||
- if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
|
||||
- sc->memcgs_need_aging = false;
|
||||
-done:
|
||||
clear_mm_walk();
|
||||
|
||||
blk_finish_plug(&plug);
@@ -0,0 +1,161 @@
From 107d54931df3c28d81648122e219bf0034ef4e99 Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Wed, 21 Dec 2022 21:19:03 -0700
|
||||
Subject: [PATCH 25/29] mm: multi-gen LRU: shuffle should_run_aging()
|
||||
|
||||
Move should_run_aging() next to its only caller left.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20221222041905.2431096-6-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Roman Gushchin <roman.gushchin@linux.dev>
|
||||
Cc: Suren Baghdasaryan <surenb@google.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/vmscan.c | 124 ++++++++++++++++++++++++++--------------------------
|
||||
1 file changed, 62 insertions(+), 62 deletions(-)
|
||||
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -4183,68 +4183,6 @@ done:
|
||||
return true;
|
||||
}
|
||||
|
||||
-static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
|
||||
- struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
|
||||
-{
|
||||
- int gen, type, zone;
|
||||
- unsigned long old = 0;
|
||||
- unsigned long young = 0;
|
||||
- unsigned long total = 0;
|
||||
- struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
- struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
- DEFINE_MIN_SEQ(lruvec);
|
||||
-
|
||||
- /* whether this lruvec is completely out of cold pages */
|
||||
- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
|
||||
- *nr_to_scan = 0;
|
||||
- return true;
|
||||
- }
|
||||
-
|
||||
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
|
||||
- unsigned long seq;
|
||||
-
|
||||
- for (seq = min_seq[type]; seq <= max_seq; seq++) {
|
||||
- unsigned long size = 0;
|
||||
-
|
||||
- gen = lru_gen_from_seq(seq);
|
||||
-
|
||||
- for (zone = 0; zone < MAX_NR_ZONES; zone++)
|
||||
- size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
|
||||
-
|
||||
- total += size;
|
||||
- if (seq == max_seq)
|
||||
- young += size;
|
||||
- else if (seq + MIN_NR_GENS == max_seq)
|
||||
- old += size;
|
||||
- }
|
||||
- }
|
||||
-
|
||||
- /* try to scrape all its memory if this memcg was deleted */
|
||||
- *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
|
||||
-
|
||||
- /*
|
||||
- * The aging tries to be lazy to reduce the overhead, while the eviction
|
||||
- * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
|
||||
- * ideal number of generations is MIN_NR_GENS+1.
|
||||
- */
|
||||
- if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
|
||||
- return false;
|
||||
-
|
||||
- /*
|
||||
- * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
|
||||
- * of the total number of pages for each generation. A reasonable range
|
||||
- * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
|
||||
- * aging cares about the upper bound of hot pages, while the eviction
|
||||
- * cares about the lower bound of cold pages.
|
||||
- */
|
||||
- if (young * MIN_NR_GENS > total)
|
||||
- return true;
|
||||
- if (old * (MIN_NR_GENS + 2) < total)
|
||||
- return true;
|
||||
-
|
||||
- return false;
|
||||
-}
|
||||
-
|
||||
static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
|
||||
{
|
||||
int gen, type, zone;
|
||||
@@ -4828,6 +4766,68 @@ retry:
|
||||
return scanned;
|
||||
}
|
||||
|
||||
+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
|
||||
+ struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
|
||||
+{
|
||||
+ int gen, type, zone;
|
||||
+ unsigned long old = 0;
|
||||
+ unsigned long young = 0;
|
||||
+ unsigned long total = 0;
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
+ DEFINE_MIN_SEQ(lruvec);
|
||||
+
|
||||
+ /* whether this lruvec is completely out of cold pages */
|
||||
+ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
|
||||
+ *nr_to_scan = 0;
|
||||
+ return true;
|
||||
+ }
|
||||
+
|
||||
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
|
||||
+ unsigned long seq;
|
||||
+
|
||||
+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
|
||||
+ unsigned long size = 0;
|
||||
+
|
||||
+ gen = lru_gen_from_seq(seq);
|
||||
+
|
||||
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
|
||||
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
|
||||
+
|
||||
+ total += size;
|
||||
+ if (seq == max_seq)
|
||||
+ young += size;
|
||||
+ else if (seq + MIN_NR_GENS == max_seq)
|
||||
+ old += size;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ /* try to scrape all its memory if this memcg was deleted */
|
||||
+ *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
|
||||
+
|
||||
+ /*
|
||||
+ * The aging tries to be lazy to reduce the overhead, while the eviction
|
||||
+ * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
|
||||
+ * ideal number of generations is MIN_NR_GENS+1.
|
||||
+ */
|
||||
+ if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
|
||||
+ return false;
|
||||
+
|
||||
+ /*
|
||||
+ * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
|
||||
+ * of the total number of pages for each generation. A reasonable range
|
||||
+ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
|
||||
+ * aging cares about the upper bound of hot pages, while the eviction
|
||||
+ * cares about the lower bound of cold pages.
|
||||
+ */
|
||||
+ if (young * MIN_NR_GENS > total)
|
||||
+ return true;
|
||||
+ if (old * (MIN_NR_GENS + 2) < total)
|
||||
+ return true;
|
||||
+
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* For future optimizations:
|
||||
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
@@ -0,0 +1,868 @@
From fa6363828d314e837c5f79e97ea5e8c0d2f7f062 Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Wed, 21 Dec 2022 21:19:04 -0700
|
||||
Subject: [PATCH 26/29] mm: multi-gen LRU: per-node lru_gen_page lists
|
||||
|
||||
For each node, memcgs are divided into two generations: the old and
|
||||
the young. For each generation, memcgs are randomly sharded into
|
||||
multiple bins to improve scalability. For each bin, an RCU hlist_nulls
|
||||
is virtually divided into three segments: the head, the tail and the
|
||||
default.
|
||||
|
||||
An onlining memcg is added to the tail of a random bin in the old
|
||||
generation. The eviction starts at the head of a random bin in the old
|
||||
generation. The per-node memcg generation counter, whose remainder (mod
|
||||
2) indexes the old generation, is incremented when all its bins become
|
||||
empty.
|
||||
|
||||
There are four operations:
|
||||
1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in
|
||||
its current generation (old or young) and updates its "seg" to
|
||||
"head";
|
||||
2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in
|
||||
its current generation (old or young) and updates its "seg" to
|
||||
"tail";
|
||||
3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in
|
||||
the old generation, updates its "gen" to "old" and resets its "seg"
|
||||
to "default";
|
||||
4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin
|
||||
in the young generation, updates its "gen" to "young" and resets
|
||||
its "seg" to "default".
|
||||
|
||||
The events that trigger the above operations are:
|
||||
1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
|
||||
2. The first attempt to reclaim an memcg below low, which triggers
|
||||
MEMCG_LRU_TAIL;
|
||||
3. The first attempt to reclaim an memcg below reclaimable size
|
||||
threshold, which triggers MEMCG_LRU_TAIL;
|
||||
4. The second attempt to reclaim an memcg below reclaimable size
|
||||
threshold, which triggers MEMCG_LRU_YOUNG;
|
||||
5. Attempting to reclaim an memcg below min, which triggers
|
||||
MEMCG_LRU_YOUNG;
|
||||
6. Finishing the aging on the eviction path, which triggers
|
||||
MEMCG_LRU_YOUNG;
|
||||
7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
|
||||
|
||||
Note that memcg LRU only applies to global reclaim, and the
|
||||
round-robin incrementing of their max_seq counters ensures the
|
||||
eventual fairness to all eligible memcgs. For memcg reclaim, it still
|
||||
relies on mem_cgroup_iter().
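
To make the two-generation memcg LRU described above easier to picture, here is a minimal, self-contained userspace sketch (not kernel code): it models only the per-node seq counter and the per-generation memcg counts, omits the bins and the RCU hlist_nulls entirely, and uses invented names (node_memcg_lru, memcg_online, memcg_make_young) purely for illustration.

#include <stdio.h>

#define NR_GENS 2	/* stands in for MEMCG_NR_GENS */

struct node_memcg_lru {
	unsigned long seq;	/* its remainder (mod NR_GENS) indexes the old generation */
	int nr_memcgs[NR_GENS];	/* how many memcgs sit in each generation */
};

static int old_gen(const struct node_memcg_lru *lru)
{
	return lru->seq % NR_GENS;
}

/* an onlining memcg joins the old generation */
static void memcg_online(struct node_memcg_lru *lru)
{
	lru->nr_memcgs[old_gen(lru)]++;
}

/*
 * MEMCG_LRU_YOUNG in miniature: move one memcg from the old generation to
 * the young one; once the old generation is empty, advance seq so the two
 * generations swap roles.
 */
static void memcg_make_young(struct node_memcg_lru *lru)
{
	int old = old_gen(lru);
	int young = (lru->seq + 1) % NR_GENS;

	if (!lru->nr_memcgs[old])
		return;

	lru->nr_memcgs[old]--;
	lru->nr_memcgs[young]++;

	if (!lru->nr_memcgs[old])
		lru->seq++;	/* the old generation emptied */
}

int main(void)
{
	struct node_memcg_lru lru = { 0 };
	unsigned long start;
	int i;

	for (i = 0; i < 3; i++)
		memcg_online(&lru);

	start = lru.seq;
	while (lru.seq == start)	/* drain the old generation once */
		memcg_make_young(&lru);

	printf("seq=%lu, the old generation is now index %d\n",
	       lru.seq, old_gen(&lru));
	return 0;
}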
|
||||
|
||||
Link: https://lkml.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Roman Gushchin <roman.gushchin@linux.dev>
|
||||
Cc: Suren Baghdasaryan <surenb@google.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/memcontrol.h | 10 +
|
||||
include/linux/mm_inline.h | 17 ++
|
||||
include/linux/mmzone.h | 117 +++++++++++-
|
||||
mm/memcontrol.c | 16 ++
|
||||
mm/page_alloc.c | 1 +
|
||||
mm/vmscan.c | 373 +++++++++++++++++++++++++++++++++----
|
||||
6 files changed, 499 insertions(+), 35 deletions(-)
|
||||
|
||||
--- a/include/linux/memcontrol.h
|
||||
+++ b/include/linux/memcontrol.h
|
||||
@@ -818,6 +818,11 @@ static inline void obj_cgroup_put(struct
|
||||
percpu_ref_put(&objcg->refcnt);
|
||||
}
|
||||
|
||||
+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+ return !memcg || css_tryget(&memcg->css);
|
||||
+}
|
||||
+
|
||||
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
|
||||
{
|
||||
if (memcg)
|
||||
@@ -1283,6 +1288,11 @@ struct mem_cgroup *mem_cgroup_from_css(s
|
||||
return NULL;
|
||||
}
|
||||
|
||||
+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
|
||||
{
|
||||
}
|
||||
--- a/include/linux/mm_inline.h
|
||||
+++ b/include/linux/mm_inline.h
|
||||
@@ -112,6 +112,18 @@ static inline bool lru_gen_in_fault(void
|
||||
return current->in_lru_fault;
|
||||
}
|
||||
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
|
||||
+{
|
||||
+ return READ_ONCE(lruvec->lrugen.seg);
|
||||
+}
|
||||
+#else
|
||||
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
|
||||
+{
|
||||
+ return 0;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
static inline int lru_gen_from_seq(unsigned long seq)
|
||||
{
|
||||
return seq % MAX_NR_GENS;
|
||||
@@ -287,6 +299,11 @@ static inline bool lru_gen_in_fault(void
|
||||
return false;
|
||||
}
|
||||
|
||||
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
|
||||
+{
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
||||
{
|
||||
return false;
|
||||
--- a/include/linux/mmzone.h
|
||||
+++ b/include/linux/mmzone.h
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/list.h>
|
||||
+#include <linux/list_nulls.h>
|
||||
#include <linux/wait.h>
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/cache.h>
|
||||
@@ -357,6 +358,15 @@ struct page_vma_mapped_walk;
|
||||
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
|
||||
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
|
||||
|
||||
+/* see the comment on MEMCG_NR_GENS */
|
||||
+enum {
|
||||
+ MEMCG_LRU_NOP,
|
||||
+ MEMCG_LRU_HEAD,
|
||||
+ MEMCG_LRU_TAIL,
|
||||
+ MEMCG_LRU_OLD,
|
||||
+ MEMCG_LRU_YOUNG,
|
||||
+};
|
||||
+
|
||||
#ifdef CONFIG_LRU_GEN
|
||||
|
||||
enum {
|
||||
@@ -416,6 +426,14 @@ struct lru_gen_page {
|
||||
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
|
||||
/* whether the multi-gen LRU is enabled */
|
||||
bool enabled;
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+ /* the memcg generation this lru_gen_page belongs to */
|
||||
+ u8 gen;
|
||||
+ /* the list segment this lru_gen_page belongs to */
|
||||
+ u8 seg;
|
||||
+ /* per-node lru_gen_page list for global reclaim */
|
||||
+ struct hlist_nulls_node list;
|
||||
+#endif
|
||||
};
|
||||
|
||||
enum {
|
||||
@@ -469,12 +487,87 @@ void lru_gen_init_lruvec(struct lruvec *
|
||||
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
+
|
||||
+/*
|
||||
+ * For each node, memcgs are divided into two generations: the old and the
|
||||
+ * young. For each generation, memcgs are randomly sharded into multiple bins
|
||||
+ * to improve scalability. For each bin, the hlist_nulls is virtually divided
|
||||
+ * into three segments: the head, the tail and the default.
|
||||
+ *
|
||||
+ * An onlining memcg is added to the tail of a random bin in the old generation.
|
||||
+ * The eviction starts at the head of a random bin in the old generation. The
|
||||
+ * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes
|
||||
+ * the old generation, is incremented when all its bins become empty.
|
||||
+ *
|
||||
+ * There are four operations:
|
||||
+ * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its
|
||||
+ * current generation (old or young) and updates its "seg" to "head";
|
||||
+ * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its
|
||||
+ * current generation (old or young) and updates its "seg" to "tail";
|
||||
+ * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old
|
||||
+ * generation, updates its "gen" to "old" and resets its "seg" to "default";
|
||||
+ * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the
|
||||
+ * young generation, updates its "gen" to "young" and resets its "seg" to
|
||||
+ * "default".
|
||||
+ *
|
||||
+ * The events that trigger the above operations are:
|
||||
+ * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
|
||||
+ * 2. The first attempt to reclaim an memcg below low, which triggers
|
||||
+ * MEMCG_LRU_TAIL;
|
||||
+ * 3. The first attempt to reclaim an memcg below reclaimable size threshold,
|
||||
+ * which triggers MEMCG_LRU_TAIL;
|
||||
+ * 4. The second attempt to reclaim an memcg below reclaimable size threshold,
|
||||
+ * which triggers MEMCG_LRU_YOUNG;
|
||||
+ * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG;
|
||||
+ * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
|
||||
+ * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
|
||||
+ *
|
||||
+ * Note that memcg LRU only applies to global reclaim, and the round-robin
|
||||
+ * incrementing of their max_seq counters ensures the eventual fairness to all
|
||||
+ * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
|
||||
+ */
|
||||
+#define MEMCG_NR_GENS 2
|
||||
+#define MEMCG_NR_BINS 8
|
||||
+
|
||||
+struct lru_gen_memcg {
|
||||
+ /* the per-node memcg generation counter */
|
||||
+ unsigned long seq;
|
||||
+ /* each memcg has one lru_gen_page per node */
|
||||
+ unsigned long nr_memcgs[MEMCG_NR_GENS];
|
||||
+ /* per-node lru_gen_page list for global reclaim */
|
||||
+ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
|
||||
+ /* protects the above */
|
||||
+ spinlock_t lock;
|
||||
+};
|
||||
+
|
||||
+void lru_gen_init_pgdat(struct pglist_data *pgdat);
|
||||
+
|
||||
void lru_gen_init_memcg(struct mem_cgroup *memcg);
|
||||
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
|
||||
-#endif
|
||||
+void lru_gen_online_memcg(struct mem_cgroup *memcg);
|
||||
+void lru_gen_offline_memcg(struct mem_cgroup *memcg);
|
||||
+void lru_gen_release_memcg(struct mem_cgroup *memcg);
|
||||
+void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
|
||||
+
|
||||
+#else /* !CONFIG_MEMCG */
|
||||
+
|
||||
+#define MEMCG_NR_GENS 1
|
||||
+
|
||||
+struct lru_gen_memcg {
|
||||
+};
|
||||
+
|
||||
+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+#endif /* CONFIG_MEMCG */
|
||||
|
||||
#else /* !CONFIG_LRU_GEN */
|
||||
|
||||
+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
|
||||
{
|
||||
}
|
||||
@@ -484,6 +577,7 @@ static inline void lru_gen_look_around(s
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
+
|
||||
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
|
||||
{
|
||||
}
|
||||
@@ -491,7 +585,24 @@ static inline void lru_gen_init_memcg(st
|
||||
static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
|
||||
{
|
||||
}
|
||||
-#endif
|
||||
+
|
||||
+static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+#endif /* CONFIG_MEMCG */
|
||||
|
||||
#endif /* CONFIG_LRU_GEN */
|
||||
|
||||
@@ -1105,6 +1216,8 @@ typedef struct pglist_data {
|
||||
#ifdef CONFIG_LRU_GEN
|
||||
/* kswap mm walk data */
|
||||
struct lru_gen_mm_walk mm_walk;
|
||||
+ /* lru_gen_page list */
|
||||
+ struct lru_gen_memcg memcg_lru;
|
||||
#endif
|
||||
|
||||
ZONE_PADDING(_pad2_)
|
||||
--- a/mm/memcontrol.c
|
||||
+++ b/mm/memcontrol.c
|
||||
@@ -549,6 +549,16 @@ static void mem_cgroup_update_tree(struc
|
||||
struct mem_cgroup_per_node *mz;
|
||||
struct mem_cgroup_tree_per_node *mctz;
|
||||
|
||||
+ if (lru_gen_enabled()) {
|
||||
+ struct lruvec *lruvec = &mem_cgroup_page_nodeinfo(memcg, page)->lruvec;
|
||||
+
|
||||
+ /* see the comment on MEMCG_NR_GENS */
|
||||
+ if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
|
||||
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
|
||||
+
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
mctz = soft_limit_tree_from_page(page);
|
||||
if (!mctz)
|
||||
return;
|
||||
@@ -3433,6 +3443,9 @@ unsigned long mem_cgroup_soft_limit_recl
|
||||
unsigned long excess;
|
||||
unsigned long nr_scanned;
|
||||
|
||||
+ if (lru_gen_enabled())
|
||||
+ return 0;
|
||||
+
|
||||
if (order > 0)
|
||||
return 0;
|
||||
|
||||
@@ -5321,6 +5334,7 @@ static int mem_cgroup_css_online(struct
|
||||
if (unlikely(mem_cgroup_is_root(memcg)))
|
||||
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
|
||||
2UL*HZ);
|
||||
+ lru_gen_online_memcg(memcg);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -5347,6 +5361,7 @@ static void mem_cgroup_css_offline(struc
|
||||
memcg_offline_kmem(memcg);
|
||||
reparent_shrinker_deferred(memcg);
|
||||
wb_memcg_offline(memcg);
|
||||
+ lru_gen_offline_memcg(memcg);
|
||||
|
||||
drain_all_stock(memcg);
|
||||
|
||||
@@ -5358,6 +5373,7 @@ static void mem_cgroup_css_released(stru
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
||||
|
||||
invalidate_reclaim_iterators(memcg);
|
||||
+ lru_gen_release_memcg(memcg);
|
||||
}
|
||||
|
||||
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
|
||||
--- a/mm/page_alloc.c
|
||||
+++ b/mm/page_alloc.c
|
||||
@@ -7645,6 +7645,7 @@ static void __init free_area_init_node(i
|
||||
pgdat_set_deferred_range(pgdat);
|
||||
|
||||
free_area_init_core(pgdat);
|
||||
+ lru_gen_init_pgdat(pgdat);
|
||||
}
|
||||
|
||||
void __init free_area_init_memoryless_node(int nid)
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -54,6 +54,8 @@
|
||||
#include <linux/shmem_fs.h>
|
||||
#include <linux/ctype.h>
|
||||
#include <linux/debugfs.h>
|
||||
+#include <linux/rculist_nulls.h>
|
||||
+#include <linux/random.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/div64.h>
|
||||
@@ -129,11 +131,6 @@ struct scan_control {
|
||||
/* Always discard instead of demoting to lower tier memory */
|
||||
unsigned int no_demotion:1;
|
||||
|
||||
-#ifdef CONFIG_LRU_GEN
|
||||
- /* help kswapd make better choices among multiple memcgs */
|
||||
- unsigned long last_reclaimed;
|
||||
-#endif
|
||||
-
|
||||
/* Allocation order */
|
||||
s8 order;
|
||||
|
||||
@@ -2880,6 +2877,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_ca
|
||||
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
|
||||
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
|
||||
|
||||
+#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
|
||||
+#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
|
||||
+
|
||||
static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
|
||||
{
|
||||
struct pglist_data *pgdat = NODE_DATA(nid);
|
||||
@@ -4169,8 +4169,7 @@ done:
|
||||
if (sc->priority <= DEF_PRIORITY - 2)
|
||||
wait_event_killable(lruvec->mm_state.wait,
|
||||
max_seq < READ_ONCE(lrugen->max_seq));
|
||||
-
|
||||
- return max_seq < READ_ONCE(lrugen->max_seq);
|
||||
+ return false;
|
||||
}
|
||||
|
||||
VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
|
||||
@@ -4243,8 +4242,6 @@ static void lru_gen_age_node(struct pgli
|
||||
|
||||
VM_WARN_ON_ONCE(!current_is_kswapd());
|
||||
|
||||
- sc->last_reclaimed = sc->nr_reclaimed;
|
||||
-
|
||||
/* check the order to exclude compaction-induced reclaim */
|
||||
if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
|
||||
return;
|
||||
@@ -4833,8 +4830,7 @@ static bool should_run_aging(struct lruv
|
||||
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
|
||||
* reclaim.
|
||||
*/
|
||||
-static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
|
||||
- bool can_swap)
|
||||
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
|
||||
{
|
||||
unsigned long nr_to_scan;
|
||||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
@@ -4851,10 +4847,8 @@ static unsigned long get_nr_to_scan(stru
|
||||
if (sc->priority == DEF_PRIORITY)
|
||||
return nr_to_scan;
|
||||
|
||||
- try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);
|
||||
-
|
||||
/* skip this lruvec as it's low on cold pages */
|
||||
- return 0;
|
||||
+ return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
|
||||
}
|
||||
|
||||
static unsigned long get_nr_to_reclaim(struct scan_control *sc)
|
||||
@@ -4863,29 +4857,18 @@ static unsigned long get_nr_to_reclaim(s
|
||||
if (!global_reclaim(sc))
|
||||
return -1;
|
||||
|
||||
- /* discount the previous progress for kswapd */
|
||||
- if (current_is_kswapd())
|
||||
- return sc->nr_to_reclaim + sc->last_reclaimed;
|
||||
-
|
||||
return max(sc->nr_to_reclaim, compact_gap(sc->order));
|
||||
}
|
||||
|
||||
-static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
||||
+static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
||||
{
|
||||
- struct blk_plug plug;
|
||||
+ long nr_to_scan;
|
||||
unsigned long scanned = 0;
|
||||
unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
|
||||
|
||||
- lru_add_drain();
|
||||
-
|
||||
- blk_start_plug(&plug);
|
||||
-
|
||||
- set_mm_walk(lruvec_pgdat(lruvec));
|
||||
-
|
||||
while (true) {
|
||||
int delta;
|
||||
int swappiness;
|
||||
- unsigned long nr_to_scan;
|
||||
|
||||
if (sc->may_swap)
|
||||
swappiness = get_swappiness(lruvec, sc);
|
||||
@@ -4895,7 +4878,7 @@ static void lru_gen_shrink_lruvec(struct
|
||||
swappiness = 0;
|
||||
|
||||
nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
|
||||
- if (!nr_to_scan)
|
||||
+ if (nr_to_scan <= 0)
|
||||
break;
|
||||
|
||||
delta = evict_pages(lruvec, sc, swappiness);
|
||||
@@ -4912,10 +4895,250 @@ static void lru_gen_shrink_lruvec(struct
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
+ /* whether try_to_inc_max_seq() was successful */
|
||||
+ return nr_to_scan < 0;
|
||||
+}
|
||||
+
|
||||
+static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
|
||||
+{
|
||||
+ bool success;
|
||||
+ unsigned long scanned = sc->nr_scanned;
|
||||
+ unsigned long reclaimed = sc->nr_reclaimed;
|
||||
+ int seg = lru_gen_memcg_seg(lruvec);
|
||||
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
|
||||
+
|
||||
+ /* see the comment on MEMCG_NR_GENS */
|
||||
+ if (!lruvec_is_sizable(lruvec, sc))
|
||||
+ return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
|
||||
+
|
||||
+ mem_cgroup_calculate_protection(NULL, memcg);
|
||||
+
|
||||
+ if (mem_cgroup_below_min(memcg))
|
||||
+ return MEMCG_LRU_YOUNG;
|
||||
+
|
||||
+ if (mem_cgroup_below_low(memcg)) {
|
||||
+ /* see the comment on MEMCG_NR_GENS */
|
||||
+ if (seg != MEMCG_LRU_TAIL)
|
||||
+ return MEMCG_LRU_TAIL;
|
||||
+
|
||||
+ memcg_memory_event(memcg, MEMCG_LOW);
|
||||
+ }
|
||||
+
|
||||
+ success = try_to_shrink_lruvec(lruvec, sc);
|
||||
+
|
||||
+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
|
||||
+
|
||||
+ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
|
||||
+ sc->nr_reclaimed - reclaimed);
|
||||
+
|
||||
+ sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
|
||||
+ current->reclaim_state->reclaimed_slab = 0;
|
||||
+
|
||||
+ return success ? MEMCG_LRU_YOUNG : 0;
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+
|
||||
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
|
||||
+{
|
||||
+ int gen;
|
||||
+ int bin;
|
||||
+ int first_bin;
|
||||
+ struct lruvec *lruvec;
|
||||
+ struct lru_gen_page *lrugen;
|
||||
+ const struct hlist_nulls_node *pos;
|
||||
+ int op = 0;
|
||||
+ struct mem_cgroup *memcg = NULL;
|
||||
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
|
||||
+
|
||||
+ bin = first_bin = prandom_u32_max(MEMCG_NR_BINS);
|
||||
+restart:
|
||||
+ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
|
||||
+
|
||||
+ rcu_read_lock();
|
||||
+
|
||||
+ hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
|
||||
+ if (op)
|
||||
+ lru_gen_rotate_memcg(lruvec, op);
|
||||
+
|
||||
+ mem_cgroup_put(memcg);
|
||||
+
|
||||
+ lruvec = container_of(lrugen, struct lruvec, lrugen);
|
||||
+ memcg = lruvec_memcg(lruvec);
|
||||
+
|
||||
+ if (!mem_cgroup_tryget(memcg)) {
|
||||
+ op = 0;
|
||||
+ memcg = NULL;
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ rcu_read_unlock();
|
||||
+
|
||||
+ op = shrink_one(lruvec, sc);
|
||||
+
|
||||
+ if (sc->nr_reclaimed >= nr_to_reclaim)
|
||||
+ goto success;
|
||||
+
|
||||
+ rcu_read_lock();
|
||||
+ }
|
||||
+
|
||||
+ rcu_read_unlock();
|
||||
+
|
||||
+ /* restart if raced with lru_gen_rotate_memcg() */
|
||||
+ if (gen != get_nulls_value(pos))
|
||||
+ goto restart;
|
||||
+
|
||||
+ /* try the rest of the bins of the current generation */
|
||||
+ bin = get_memcg_bin(bin + 1);
|
||||
+ if (bin != first_bin)
|
||||
+ goto restart;
|
||||
+success:
|
||||
+ if (op)
|
||||
+ lru_gen_rotate_memcg(lruvec, op);
|
||||
+
|
||||
+ mem_cgroup_put(memcg);
|
||||
+}
|
||||
+
|
||||
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
||||
+{
|
||||
+ struct blk_plug plug;
|
||||
+
|
||||
+ VM_WARN_ON_ONCE(global_reclaim(sc));
|
||||
+
|
||||
+ lru_add_drain();
|
||||
+
|
||||
+ blk_start_plug(&plug);
|
||||
+
|
||||
+ set_mm_walk(lruvec_pgdat(lruvec));
|
||||
+
|
||||
+ if (try_to_shrink_lruvec(lruvec, sc))
|
||||
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
|
||||
+
|
||||
+ clear_mm_walk();
|
||||
+
|
||||
+ blk_finish_plug(&plug);
|
||||
+}
|
||||
+
|
||||
+#else /* !CONFIG_MEMCG */
|
||||
+
|
||||
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
|
||||
+{
|
||||
+ BUILD_BUG();
|
||||
+}
|
||||
+
|
||||
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
||||
+{
|
||||
+ BUILD_BUG();
|
||||
+}
|
||||
+
|
||||
+#endif
|
||||
+
|
||||
+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
|
||||
+{
|
||||
+ int priority;
|
||||
+ unsigned long reclaimable;
|
||||
+ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
|
||||
+
|
||||
+ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
|
||||
+ return;
|
||||
+ /*
|
||||
+ * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
|
||||
+ * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
|
||||
+ * estimated reclaimed_to_scanned_ratio = inactive / total.
|
||||
+ */
|
||||
+ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
|
||||
+ if (get_swappiness(lruvec, sc))
|
||||
+ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
|
||||
+
|
||||
+ reclaimable /= MEMCG_NR_GENS;
|
||||
+
|
||||
+ /* round down reclaimable and round up sc->nr_to_reclaim */
|
||||
+ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
|
||||
+
|
||||
+ sc->priority = clamp(priority, 0, DEF_PRIORITY);
|
||||
+}
|
||||
+
|
||||
+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
|
||||
+{
|
||||
+ struct blk_plug plug;
|
||||
+ unsigned long reclaimed = sc->nr_reclaimed;
|
||||
+
|
||||
+ VM_WARN_ON_ONCE(!global_reclaim(sc));
|
||||
+
|
||||
+ lru_add_drain();
|
||||
+
|
||||
+ blk_start_plug(&plug);
|
||||
+
|
||||
+ set_mm_walk(pgdat);
|
||||
+
|
||||
+ set_initial_priority(pgdat, sc);
|
||||
+
|
||||
+ if (current_is_kswapd())
|
||||
+ sc->nr_reclaimed = 0;
|
||||
+
|
||||
+ if (mem_cgroup_disabled())
|
||||
+ shrink_one(&pgdat->__lruvec, sc);
|
||||
+ else
|
||||
+ shrink_many(pgdat, sc);
|
||||
+
|
||||
+ if (current_is_kswapd())
|
||||
+ sc->nr_reclaimed += reclaimed;
|
||||
+
|
||||
clear_mm_walk();
|
||||
|
||||
blk_finish_plug(&plug);
|
||||
+
|
||||
+ /* kswapd should never fail */
|
||||
+ pgdat->kswapd_failures = 0;
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
|
||||
+{
|
||||
+ int seg;
|
||||
+ int old, new;
|
||||
+ int bin = prandom_u32_max(MEMCG_NR_BINS);
|
||||
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
|
||||
+
|
||||
+ spin_lock(&pgdat->memcg_lru.lock);
|
||||
+
|
||||
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
|
||||
+
|
||||
+ seg = 0;
|
||||
+ new = old = lruvec->lrugen.gen;
|
||||
+
|
||||
+ /* see the comment on MEMCG_NR_GENS */
|
||||
+ if (op == MEMCG_LRU_HEAD)
|
||||
+ seg = MEMCG_LRU_HEAD;
|
||||
+ else if (op == MEMCG_LRU_TAIL)
|
||||
+ seg = MEMCG_LRU_TAIL;
|
||||
+ else if (op == MEMCG_LRU_OLD)
|
||||
+ new = get_memcg_gen(pgdat->memcg_lru.seq);
|
||||
+ else if (op == MEMCG_LRU_YOUNG)
|
||||
+ new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
|
||||
+ else
|
||||
+ VM_WARN_ON_ONCE(true);
|
||||
+
|
||||
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
|
||||
+
|
||||
+ if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
|
||||
+ hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
|
||||
+ else
|
||||
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
|
||||
+
|
||||
+ pgdat->memcg_lru.nr_memcgs[old]--;
|
||||
+ pgdat->memcg_lru.nr_memcgs[new]++;
|
||||
+
|
||||
+ lruvec->lrugen.gen = new;
|
||||
+ WRITE_ONCE(lruvec->lrugen.seg, seg);
|
||||
+
|
||||
+ if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
|
||||
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
|
||||
+
|
||||
+ spin_unlock(&pgdat->memcg_lru.lock);
|
||||
}
|
||||
+#endif
|
||||
|
||||
/******************************************************************************
|
||||
* state change
|
||||
@@ -5370,11 +5593,11 @@ static int run_cmd(char cmd, int memcg_i
|
||||
|
||||
if (!mem_cgroup_disabled()) {
|
||||
rcu_read_lock();
|
||||
+
|
||||
memcg = mem_cgroup_from_id(memcg_id);
|
||||
-#ifdef CONFIG_MEMCG
|
||||
- if (memcg && !css_tryget(&memcg->css))
|
||||
+ if (!mem_cgroup_tryget(memcg))
|
||||
memcg = NULL;
|
||||
-#endif
|
||||
+
|
||||
rcu_read_unlock();
|
||||
|
||||
if (!memcg)
|
||||
@@ -5521,6 +5744,19 @@ void lru_gen_init_lruvec(struct lruvec *
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
+
|
||||
+void lru_gen_init_pgdat(struct pglist_data *pgdat)
|
||||
+{
|
||||
+ int i, j;
|
||||
+
|
||||
+ spin_lock_init(&pgdat->memcg_lru.lock);
|
||||
+
|
||||
+ for (i = 0; i < MEMCG_NR_GENS; i++) {
|
||||
+ for (j = 0; j < MEMCG_NR_BINS; j++)
|
||||
+ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
void lru_gen_init_memcg(struct mem_cgroup *memcg)
|
||||
{
|
||||
INIT_LIST_HEAD(&memcg->mm_list.fifo);
|
||||
@@ -5544,7 +5780,69 @@ void lru_gen_exit_memcg(struct mem_cgrou
|
||||
}
|
||||
}
|
||||
}
|
||||
-#endif
|
||||
+
|
||||
+void lru_gen_online_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+ int gen;
|
||||
+ int nid;
|
||||
+ int bin = prandom_u32_max(MEMCG_NR_BINS);
|
||||
+
|
||||
+ for_each_node(nid) {
|
||||
+ struct pglist_data *pgdat = NODE_DATA(nid);
|
||||
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
|
||||
+
|
||||
+ spin_lock(&pgdat->memcg_lru.lock);
|
||||
+
|
||||
+ VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
|
||||
+
|
||||
+ gen = get_memcg_gen(pgdat->memcg_lru.seq);
|
||||
+
|
||||
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
|
||||
+ pgdat->memcg_lru.nr_memcgs[gen]++;
|
||||
+
|
||||
+ lruvec->lrugen.gen = gen;
|
||||
+
|
||||
+ spin_unlock(&pgdat->memcg_lru.lock);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+void lru_gen_offline_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+ int nid;
|
||||
+
|
||||
+ for_each_node(nid) {
|
||||
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
|
||||
+
|
||||
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+void lru_gen_release_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+ int gen;
|
||||
+ int nid;
|
||||
+
|
||||
+ for_each_node(nid) {
|
||||
+ struct pglist_data *pgdat = NODE_DATA(nid);
|
||||
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
|
||||
+
|
||||
+ spin_lock(&pgdat->memcg_lru.lock);
|
||||
+
|
||||
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
|
||||
+
|
||||
+ gen = lruvec->lrugen.gen;
|
||||
+
|
||||
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
|
||||
+ pgdat->memcg_lru.nr_memcgs[gen]--;
|
||||
+
|
||||
+ if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
|
||||
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
|
||||
+
|
||||
+ spin_unlock(&pgdat->memcg_lru.lock);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+#endif /* CONFIG_MEMCG */
|
||||
|
||||
static int __init init_lru_gen(void)
|
||||
{
|
||||
@@ -5571,6 +5869,10 @@ static void lru_gen_shrink_lruvec(struct
|
||||
{
|
||||
}
|
||||
|
||||
+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
#endif /* CONFIG_LRU_GEN */
|
||||
|
||||
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
||||
@@ -5584,7 +5886,7 @@ static void shrink_lruvec(struct lruvec
|
||||
bool proportional_reclaim;
|
||||
struct blk_plug plug;
|
||||
|
||||
- if (lru_gen_enabled()) {
|
||||
+ if (lru_gen_enabled() && !global_reclaim(sc)) {
|
||||
lru_gen_shrink_lruvec(lruvec, sc);
|
||||
return;
|
||||
}
|
||||
@@ -5826,6 +6128,11 @@ static void shrink_node(pg_data_t *pgdat
|
||||
struct lruvec *target_lruvec;
|
||||
bool reclaimable = false;
|
||||
|
||||
+ if (lru_gen_enabled() && global_reclaim(sc)) {
|
||||
+ lru_gen_shrink_node(pgdat, sc);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
|
||||
|
||||
again:
@@ -0,0 +1,196 @@
From 93147736b5b3a21bea24313bfc7a696829932009 Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Wed, 21 Dec 2022 21:19:05 -0700
|
||||
Subject: [PATCH 27/29] mm: multi-gen LRU: clarify scan_control flags
|
||||
|
||||
Among the flags in scan_control:
|
||||
1. sc->may_swap, which indicates swap constraint due to memsw.max, is
|
||||
supported as usual.
|
||||
2. sc->proactive, which indicates reclaim by memory.reclaim, may not
|
||||
opportunistically skip the aging path, since it is considered less
|
||||
latency sensitive.
|
||||
3. !(sc->gfp_mask & __GFP_IO), which indicates IO constraint, lowers
|
||||
swappiness to prioritize file LRU, since clean file pages are more
|
||||
likely to exist.
|
||||
4. sc->may_writepage and sc->may_unmap, which indicates opportunistic
|
||||
reclaim, are rejected, since unmapped clean pages are already
|
||||
prioritized. Scanning for more of them is likely futile and can
|
||||
cause high reclaim latency when there is a large number of memcgs.
|
||||
|
||||
The rest are handled by the existing code.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20221222041905.2431096-8-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Roman Gushchin <roman.gushchin@linux.dev>
|
||||
Cc: Suren Baghdasaryan <surenb@google.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/vmscan.c | 55 +++++++++++++++++++++++++++--------------------------
|
||||
1 file changed, 28 insertions(+), 27 deletions(-)
|
||||
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -2905,6 +2905,9 @@ static int get_swappiness(struct lruvec
|
||||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
|
||||
|
||||
+ if (!sc->may_swap)
|
||||
+ return 0;
|
||||
+
|
||||
if (!can_demote(pgdat->node_id, sc) &&
|
||||
mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
|
||||
return 0;
|
||||
@@ -3952,7 +3955,7 @@ static void walk_mm(struct lruvec *lruve
|
||||
} while (err == -EAGAIN);
|
||||
}
|
||||
|
||||
-static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
|
||||
+static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
|
||||
{
|
||||
struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
|
||||
|
||||
@@ -3960,7 +3963,7 @@ static struct lru_gen_mm_walk *set_mm_wa
|
||||
VM_WARN_ON_ONCE(walk);
|
||||
|
||||
walk = &pgdat->mm_walk;
|
||||
- } else if (!pgdat && !walk) {
|
||||
+ } else if (!walk && force_alloc) {
|
||||
VM_WARN_ON_ONCE(current_is_kswapd());
|
||||
|
||||
walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
|
||||
@@ -4146,7 +4149,7 @@ static bool try_to_inc_max_seq(struct lr
|
||||
goto done;
|
||||
}
|
||||
|
||||
- walk = set_mm_walk(NULL);
|
||||
+ walk = set_mm_walk(NULL, true);
|
||||
if (!walk) {
|
||||
success = iterate_mm_list_nowalk(lruvec, max_seq);
|
||||
goto done;
|
||||
@@ -4215,8 +4218,6 @@ static bool lruvec_is_reclaimable(struct
|
||||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
DEFINE_MIN_SEQ(lruvec);
|
||||
|
||||
- VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
|
||||
-
|
||||
/* see the comment on lru_gen_page */
|
||||
gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
|
||||
birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
|
||||
@@ -4472,12 +4473,8 @@ static bool isolate_page(struct lruvec *
|
||||
{
|
||||
bool success;
|
||||
|
||||
- /* unmapping inhibited */
|
||||
- if (!sc->may_unmap && page_mapped(page))
|
||||
- return false;
|
||||
-
|
||||
/* swapping inhibited */
|
||||
- if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
|
||||
+ if (!(sc->gfp_mask & __GFP_IO) &&
|
||||
(PageDirty(page) ||
|
||||
(PageAnon(page) && !PageSwapCache(page))))
|
||||
return false;
|
||||
@@ -4574,9 +4571,8 @@ static int scan_pages(struct lruvec *lru
|
||||
__count_vm_events(PGSCAN_ANON + type, isolated);
|
||||
|
||||
/*
|
||||
- * There might not be eligible pages due to reclaim_idx, may_unmap and
|
||||
- * may_writepage. Check the remaining to prevent livelock if it's not
|
||||
- * making progress.
|
||||
+ * There might not be eligible pages due to reclaim_idx. Check the
|
||||
+ * remaining to prevent livelock if it's not making progress.
|
||||
*/
|
||||
return isolated || !remaining ? scanned : 0;
|
||||
}
|
||||
@@ -4836,8 +4832,7 @@ static long get_nr_to_scan(struct lruvec
|
||||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
DEFINE_MAX_SEQ(lruvec);
|
||||
|
||||
- if (mem_cgroup_below_min(memcg) ||
|
||||
- (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
|
||||
+ if (mem_cgroup_below_min(memcg))
|
||||
return 0;
|
||||
|
||||
if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
|
||||
@@ -4865,17 +4860,14 @@ static bool try_to_shrink_lruvec(struct
|
||||
long nr_to_scan;
|
||||
unsigned long scanned = 0;
|
||||
unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
|
||||
+ int swappiness = get_swappiness(lruvec, sc);
|
||||
+
|
||||
+ /* clean file pages are more likely to exist */
|
||||
+ if (swappiness && !(sc->gfp_mask & __GFP_IO))
|
||||
+ swappiness = 1;
|
||||
|
||||
while (true) {
|
||||
int delta;
|
||||
- int swappiness;
|
||||
-
|
||||
- if (sc->may_swap)
|
||||
- swappiness = get_swappiness(lruvec, sc);
|
||||
- else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))
|
||||
- swappiness = 1;
|
||||
- else
|
||||
- swappiness = 0;
|
||||
|
||||
nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
|
||||
if (nr_to_scan <= 0)
|
||||
@@ -5005,12 +4997,13 @@ static void lru_gen_shrink_lruvec(struct
|
||||
struct blk_plug plug;
|
||||
|
||||
VM_WARN_ON_ONCE(global_reclaim(sc));
|
||||
+ VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap);
|
||||
|
||||
lru_add_drain();
|
||||
|
||||
blk_start_plug(&plug);
|
||||
|
||||
- set_mm_walk(lruvec_pgdat(lruvec));
|
||||
+ set_mm_walk(NULL, false);
|
||||
|
||||
if (try_to_shrink_lruvec(lruvec, sc))
|
||||
lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
|
||||
@@ -5066,11 +5059,19 @@ static void lru_gen_shrink_node(struct p
|
||||
|
||||
VM_WARN_ON_ONCE(!global_reclaim(sc));
|
||||
|
||||
+ /*
|
||||
+ * Unmapped clean pages are already prioritized. Scanning for more of
|
||||
+ * them is likely futile and can cause high reclaim latency when there
|
||||
+ * is a large number of memcgs.
|
||||
+ */
|
||||
+ if (!sc->may_writepage || !sc->may_unmap)
|
||||
+ goto done;
|
||||
+
|
||||
lru_add_drain();
|
||||
|
||||
blk_start_plug(&plug);
|
||||
|
||||
- set_mm_walk(pgdat);
|
||||
+ set_mm_walk(pgdat, false);
|
||||
|
||||
set_initial_priority(pgdat, sc);
|
||||
|
||||
@@ -5088,7 +5089,7 @@ static void lru_gen_shrink_node(struct p
|
||||
clear_mm_walk();
|
||||
|
||||
blk_finish_plug(&plug);
|
||||
-
|
||||
+done:
|
||||
/* kswapd should never fail */
|
||||
pgdat->kswapd_failures = 0;
|
||||
}
|
||||
@@ -5656,7 +5657,7 @@ static ssize_t lru_gen_seq_write(struct
|
||||
set_task_reclaim_state(current, &sc.reclaim_state);
|
||||
flags = memalloc_noreclaim_save();
|
||||
blk_start_plug(&plug);
|
||||
- if (!set_mm_walk(NULL)) {
|
||||
+ if (!set_mm_walk(NULL, true)) {
|
||||
err = -ENOMEM;
|
||||
goto done;
|
||||
}
@@ -0,0 +1,34 @@
From cf3297e4c7a928da8b2b2f0baff2f9c69ea57952 Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Wed, 21 Dec 2022 21:19:06 -0700
|
||||
Subject: [PATCH 28/29] mm: multi-gen LRU: simplify arch_has_hw_pte_young()
|
||||
check
|
||||
|
||||
Scanning page tables when hardware does not set the accessed bit has
|
||||
no real use cases.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20221222041905.2431096-9-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Roman Gushchin <roman.gushchin@linux.dev>
|
||||
Cc: Suren Baghdasaryan <surenb@google.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/vmscan.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -4144,7 +4144,7 @@ static bool try_to_inc_max_seq(struct lr
|
||||
* handful of PTEs. Spreading the work out over a period of time usually
|
||||
* is less efficient, but it avoids bursty page faults.
|
||||
*/
|
||||
- if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
|
||||
+ if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) {
|
||||
success = iterate_mm_list_nowalk(lruvec, max_seq);
|
||||
goto done;
|
||||
}
@@ -0,0 +1,88 @@
From cc67f962cc53f6e1dfa92eb85b7b26fe83a3c66f Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Mon, 13 Feb 2023 00:53:22 -0700
|
||||
Subject: [PATCH 29/29] mm: multi-gen LRU: avoid futile retries
|
||||
|
||||
Recall that the per-node memcg LRU has two generations and they alternate
|
||||
when the last memcg (of a given node) is moved from one to the other.
|
||||
Each generation is also sharded into multiple bins to improve scalability.
|
||||
A reclaimer starts with a random bin (in the old generation) and, if it
|
||||
fails, it will retry, i.e., to try the rest of the bins.
|
||||
|
||||
If a reclaimer fails with the last memcg, it should move this memcg to the
|
||||
young generation first, which causes the generations to alternate, and
|
||||
then retry. Otherwise, the retries will be futile because all other bins
|
||||
are empty.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20230213075322.1416966-1-yuzhao@google.com
|
||||
Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Reported-by: T.J. Mercier <tjmercier@google.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/vmscan.c | 25 +++++++++++++++----------
|
||||
1 file changed, 15 insertions(+), 10 deletions(-)
|
||||
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -4934,18 +4934,20 @@ static int shrink_one(struct lruvec *lru
|
||||
|
||||
static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
|
||||
{
|
||||
+ int op;
|
||||
int gen;
|
||||
int bin;
|
||||
int first_bin;
|
||||
struct lruvec *lruvec;
|
||||
struct lru_gen_page *lrugen;
|
||||
+ struct mem_cgroup *memcg;
|
||||
const struct hlist_nulls_node *pos;
|
||||
- int op = 0;
|
||||
- struct mem_cgroup *memcg = NULL;
|
||||
unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
|
||||
|
||||
bin = first_bin = prandom_u32_max(MEMCG_NR_BINS);
|
||||
restart:
|
||||
+ op = 0;
|
||||
+ memcg = NULL;
|
||||
gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
|
||||
|
||||
rcu_read_lock();
|
||||
@@ -4969,14 +4971,22 @@ restart:
|
||||
|
||||
op = shrink_one(lruvec, sc);
|
||||
|
||||
- if (sc->nr_reclaimed >= nr_to_reclaim)
|
||||
- goto success;
|
||||
-
|
||||
rcu_read_lock();
|
||||
+
|
||||
+ if (sc->nr_reclaimed >= nr_to_reclaim)
|
||||
+ break;
|
||||
}
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
+ if (op)
|
||||
+ lru_gen_rotate_memcg(lruvec, op);
|
||||
+
|
||||
+ mem_cgroup_put(memcg);
|
||||
+
|
||||
+ if (sc->nr_reclaimed >= nr_to_reclaim)
|
||||
+ return;
|
||||
+
|
||||
/* restart if raced with lru_gen_rotate_memcg() */
|
||||
if (gen != get_nulls_value(pos))
|
||||
goto restart;
|
||||
@@ -4985,11 +4995,6 @@ restart:
|
||||
bin = get_memcg_bin(bin + 1);
|
||||
if (bin != first_bin)
|
||||
goto restart;
|
||||
-success:
|
||||
- if (op)
|
||||
- lru_gen_rotate_memcg(lruvec, op);
|
||||
-
|
||||
- mem_cgroup_put(memcg);
|
||||
}
|
||||
|
||||
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -1,95 +0,0 @@
From 598ada195606eb0e577db0487dd59a2536f206ce Mon Sep 17 00:00:00 2001
|
||||
From: Andres Freund <andres@anarazel.de>
|
||||
Date: Sun, 31 Jul 2022 18:38:27 -0700
|
||||
Subject: [PATCH 1/5] tools build: Add feature test for init_disassemble_info
|
||||
API changes
|
||||
|
||||
binutils changed the signature of init_disassemble_info(), which now causes
|
||||
compilation failures for tools/{perf,bpf}, e.g. on debian unstable.
|
||||
|
||||
Relevant binutils commit:
|
||||
|
||||
https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=60a3da00bd5407f07
|
||||
|
||||
This commit adds a feature test to detect the new signature. Subsequent
|
||||
commits will use it to fix the build failures.
|
||||
|
||||
Signed-off-by: Andres Freund <andres@anarazel.de>
|
||||
Acked-by: Quentin Monnet <quentin@isovalent.com>
|
||||
Cc: Alexei Starovoitov <ast@kernel.org>
|
||||
Cc: Ben Hutchings <benh@debian.org>
|
||||
Cc: Jiri Olsa <jolsa@kernel.org>
|
||||
Cc: Quentin Monnet <quentin@isovalent.com>
|
||||
Cc: Sedat Dilek <sedat.dilek@gmail.com>
|
||||
Cc: bpf@vger.kernel.org
|
||||
Link: http://lore.kernel.org/lkml/20220622181918.ykrs5rsnmx3og4sv@alap3.anarazel.de
|
||||
Link: https://lore.kernel.org/r/20220801013834.156015-2-andres@anarazel.de
|
||||
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
|
||||
(cherry picked from commit cfd59ca91467056bb2c36907b2fa67b8e1af9952)
|
||||
---
|
||||
tools/build/Makefile.feature | 1 +
|
||||
tools/build/feature/Makefile | 4 ++++
|
||||
tools/build/feature/test-all.c | 4 ++++
|
||||
tools/build/feature/test-disassembler-init-styled.c | 13 +++++++++++++
|
||||
4 files changed, 22 insertions(+)
|
||||
create mode 100644 tools/build/feature/test-disassembler-init-styled.c
|
||||
|
||||
--- a/tools/build/Makefile.feature
|
||||
+++ b/tools/build/Makefile.feature
|
||||
@@ -69,6 +69,7 @@ FEATURE_TESTS_BASIC :=
|
||||
libaio \
|
||||
libzstd \
|
||||
disassembler-four-args \
|
||||
+ disassembler-init-styled \
|
||||
file-handle
|
||||
|
||||
# FEATURE_TESTS_BASIC + FEATURE_TESTS_EXTRA is the complete list
|
||||
--- a/tools/build/feature/Makefile
|
||||
+++ b/tools/build/feature/Makefile
|
||||
@@ -18,6 +18,7 @@ FILES=
|
||||
test-libbfd.bin \
|
||||
test-libbfd-buildid.bin \
|
||||
test-disassembler-four-args.bin \
|
||||
+ test-disassembler-init-styled.bin \
|
||||
test-reallocarray.bin \
|
||||
test-libbfd-liberty.bin \
|
||||
test-libbfd-liberty-z.bin \
|
||||
@@ -239,6 +240,9 @@ $(OUTPUT)test-libbfd-buildid.bin:
|
||||
$(OUTPUT)test-disassembler-four-args.bin:
|
||||
$(BUILD) -DPACKAGE='"perf"' -lbfd -lopcodes
|
||||
|
||||
+$(OUTPUT)test-disassembler-init-styled.bin:
|
||||
+ $(BUILD) -DPACKAGE='"perf"' -lbfd -lopcodes
|
||||
+
|
||||
$(OUTPUT)test-reallocarray.bin:
|
||||
$(BUILD)
|
||||
|
||||
--- a/tools/build/feature/test-all.c
|
||||
+++ b/tools/build/feature/test-all.c
|
||||
@@ -166,6 +166,10 @@
|
||||
# include "test-disassembler-four-args.c"
|
||||
#undef main
|
||||
|
||||
+#define main main_test_disassembler_init_styled
|
||||
+# include "test-disassembler-init-styled.c"
|
||||
+#undef main
|
||||
+
|
||||
#define main main_test_libzstd
|
||||
# include "test-libzstd.c"
|
||||
#undef main
|
||||
--- /dev/null
|
||||
+++ b/tools/build/feature/test-disassembler-init-styled.c
|
||||
@@ -0,0 +1,13 @@
|
||||
+// SPDX-License-Identifier: GPL-2.0
|
||||
+#include <stdio.h>
|
||||
+#include <dis-asm.h>
|
||||
+
|
||||
+int main(void)
|
||||
+{
|
||||
+ struct disassemble_info info;
|
||||
+
|
||||
+ init_disassemble_info(&info, stdout,
|
||||
+ NULL, NULL);
|
||||
+
|
||||
+ return 0;
|
||||
+}
@@ -1,96 +0,0 @@
From 08ec5766e5cf7b24fdebefb83b6f760bceeddf40 Mon Sep 17 00:00:00 2001
|
||||
From: Andres Freund <andres@anarazel.de>
|
||||
Date: Sun, 31 Jul 2022 18:38:29 -0700
|
||||
Subject: [PATCH 2/5] tools include: add dis-asm-compat.h to handle version
|
||||
differences
|
||||
|
||||
binutils changed the signature of init_disassemble_info(), which now causes
|
||||
compilation failures for tools/{perf,bpf}, e.g. on debian unstable.
|
||||
|
||||
Relevant binutils commit:
|
||||
|
||||
https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=60a3da00bd5407f07
|
||||
|
||||
This commit introduces a wrapper for init_disassemble_info(), to avoid
|
||||
spreading #ifdef DISASM_INIT_STYLED to a bunch of places. Subsequent
|
||||
commits will use it to fix the build failures.
|
||||
|
||||
It likely is worth adding a wrapper for disassember(), to avoid the already
|
||||
existing DISASM_FOUR_ARGS_SIGNATURE ifdefery.
|
||||
|
||||
Signed-off-by: Andres Freund <andres@anarazel.de>
|
||||
Signed-off-by: Ben Hutchings <benh@debian.org>
|
||||
Acked-by: Quentin Monnet <quentin@isovalent.com>
|
||||
Cc: Alexei Starovoitov <ast@kernel.org>
|
||||
Cc: Ben Hutchings <benh@debian.org>
|
||||
Cc: Jiri Olsa <jolsa@kernel.org>
|
||||
Cc: Quentin Monnet <quentin@isovalent.com>
|
||||
Cc: Sedat Dilek <sedat.dilek@gmail.com>
|
||||
Cc: bpf@vger.kernel.org
|
||||
Link: http://lore.kernel.org/lkml/20220622181918.ykrs5rsnmx3og4sv@alap3.anarazel.de
|
||||
Link: https://lore.kernel.org/r/20220801013834.156015-4-andres@anarazel.de
|
||||
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
|
||||
(cherry picked from commit a45b3d6926231c3d024ea0de4f7bd967f83709ee)
|
||||
---
|
||||
tools/include/tools/dis-asm-compat.h | 55 ++++++++++++++++++++++++++++
|
||||
1 file changed, 55 insertions(+)
|
||||
create mode 100644 tools/include/tools/dis-asm-compat.h
|
||||
|
||||
--- /dev/null
|
||||
+++ b/tools/include/tools/dis-asm-compat.h
|
||||
@@ -0,0 +1,55 @@
|
||||
+/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */
|
||||
+#ifndef _TOOLS_DIS_ASM_COMPAT_H
|
||||
+#define _TOOLS_DIS_ASM_COMPAT_H
|
||||
+
|
||||
+#include <stdio.h>
|
||||
+#include <dis-asm.h>
|
||||
+
|
||||
+/* define types for older binutils version, to centralize ifdef'ery a bit */
|
||||
+#ifndef DISASM_INIT_STYLED
|
||||
+enum disassembler_style {DISASSEMBLER_STYLE_NOT_EMPTY};
|
||||
+typedef int (*fprintf_styled_ftype) (void *, enum disassembler_style, const char*, ...);
|
||||
+#endif
|
||||
+
|
||||
+/*
|
||||
+ * Trivial fprintf wrapper to be used as the fprintf_styled_func argument to
|
||||
+ * init_disassemble_info_compat() when normal fprintf suffices.
|
||||
+ */
|
||||
+static inline int fprintf_styled(void *out,
|
||||
+ enum disassembler_style style,
|
||||
+ const char *fmt, ...)
|
||||
+{
|
||||
+ va_list args;
|
||||
+ int r;
|
||||
+
|
||||
+ (void)style;
|
||||
+
|
||||
+ va_start(args, fmt);
|
||||
+ r = vfprintf(out, fmt, args);
|
||||
+ va_end(args);
|
||||
+
|
||||
+ return r;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * Wrapper for init_disassemble_info() that hides version
|
||||
+ * differences. Depending on binutils version and architecture either
|
||||
+ * fprintf_func or fprintf_styled_func will be called.
|
||||
+ */
|
||||
+static inline void init_disassemble_info_compat(struct disassemble_info *info,
|
||||
+ void *stream,
|
||||
+ fprintf_ftype unstyled_func,
|
||||
+ fprintf_styled_ftype styled_func)
|
||||
+{
|
||||
+#ifdef DISASM_INIT_STYLED
|
||||
+ init_disassemble_info(info, stream,
|
||||
+ unstyled_func,
|
||||
+ styled_func);
|
||||
+#else
|
||||
+ (void)styled_func;
|
||||
+ init_disassemble_info(info, stream,
|
||||
+ unstyled_func);
|
||||
+#endif
|
||||
+}
|
||||
+
|
||||
+#endif /* _TOOLS_DIS_ASM_COMPAT_H */
|
|
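For context (not part of this diff): a caller that previously used init_disassemble_info(&info, stdout, (fprintf_ftype) fprintf) only has to switch to the compat wrapper and pass the trivial styled callback. A minimal sketch, assuming dis-asm-compat.h is on the include path (example_setup_disasm is an invented name):

    #include <stdio.h>
    #include <dis-asm.h>
    #include <tools/dis-asm-compat.h>

    static void example_setup_disasm(struct disassemble_info *info)
    {
    	/* On DISASM_INIT_STYLED builds the styled callback is passed through;
    	 * on older binutils the wrapper simply ignores it. */
    	init_disassemble_info_compat(info, stdout,
    				     (fprintf_ftype) fprintf,
    				     fprintf_styled);
    }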
@ -1,111 +0,0 @@
|
|||
From 3bc373152a3a00742750dbbe974d541af78231e6 Mon Sep 17 00:00:00 2001
|
||||
From: Andres Freund <andres@anarazel.de>
|
||||
Date: Sun, 31 Jul 2022 18:38:30 -0700
|
||||
Subject: [PATCH 3/5] tools perf: Fix compilation error with new binutils
|
||||
|
||||
binutils changed the signature of init_disassemble_info(), which now causes
|
||||
compilation failures for tools/perf/util/annotate.c, e.g. on debian
|
||||
unstable.
|
||||
|
||||
Relevant binutils commit:
|
||||
|
||||
https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=60a3da00bd5407f07
|
||||
|
||||
Wire up the feature test and switch to init_disassemble_info_compat(),
|
||||
which were introduced in prior commits, fixing the compilation failure.
|
||||
|
||||
I verified that perf can still disassemble bpf programs by using bpftrace
|
||||
under load, recording a perf trace, and then annotating the bpf "function"
|
||||
with and without the changes. With old binutils there's no change in output
|
||||
before/after this patch. When comparing the output from old binutils (2.35)
|
||||
to new binutils with the patch (upstream snapshot) there are a few output
|
||||
differences, but they are unrelated to this patch. An example hunk is:
|
||||
|
||||
1.15 : 55:mov %rbp,%rdx
|
||||
0.00 : 58:add $0xfffffffffffffff8,%rdx
|
||||
0.00 : 5c:xor %ecx,%ecx
|
||||
- 1.03 : 5e:callq 0xffffffffe12aca3c
|
||||
+ 1.03 : 5e:call 0xffffffffe12aca3c
|
||||
0.00 : 63:xor %eax,%eax
|
||||
- 2.18 : 65:leaveq
|
||||
- 2.82 : 66:retq
|
||||
+ 2.18 : 65:leave
|
||||
+ 2.82 : 66:ret
|
||||
|
||||
Signed-off-by: Andres Freund <andres@anarazel.de>
|
||||
Acked-by: Quentin Monnet <quentin@isovalent.com>
|
||||
Cc: Alexei Starovoitov <ast@kernel.org>
|
||||
Cc: Ben Hutchings <benh@debian.org>
|
||||
Cc: Jiri Olsa <jolsa@kernel.org>
|
||||
Cc: Sedat Dilek <sedat.dilek@gmail.com>
|
||||
Cc: bpf@vger.kernel.org
|
||||
Link: http://lore.kernel.org/lkml/20220622181918.ykrs5rsnmx3og4sv@alap3.anarazel.de
|
||||
Link: https://lore.kernel.org/r/20220801013834.156015-5-andres@anarazel.de
|
||||
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
|
||||
(cherry picked from commit 83aa0120487e8bc3f231e72c460add783f71f17c)
|
||||
---
|
||||
tools/perf/Makefile.config | 8 ++++++++
|
||||
tools/perf/util/annotate.c | 7 ++++---
|
||||
2 files changed, 12 insertions(+), 3 deletions(-)
|
||||
|
||||
--- a/tools/perf/Makefile.config
|
||||
+++ b/tools/perf/Makefile.config
|
||||
@@ -296,6 +296,7 @@ FEATURE_CHECK_LDFLAGS-libpython := $(PYT
|
||||
FEATURE_CHECK_LDFLAGS-libaio = -lrt
|
||||
|
||||
FEATURE_CHECK_LDFLAGS-disassembler-four-args = -lbfd -lopcodes -ldl
|
||||
+FEATURE_CHECK_LDFLAGS-disassembler-init-styled = -lbfd -lopcodes -ldl
|
||||
|
||||
CORE_CFLAGS += -fno-omit-frame-pointer
|
||||
CORE_CFLAGS += -ggdb3
|
||||
@@ -872,13 +873,16 @@ ifndef NO_LIBBFD
|
||||
ifeq ($(feature-libbfd-liberty), 1)
|
||||
EXTLIBS += -lbfd -lopcodes -liberty
|
||||
FEATURE_CHECK_LDFLAGS-disassembler-four-args += -liberty -ldl
|
||||
+ FEATURE_CHECK_LDFLAGS-disassembler-init-styled += -liberty -ldl
|
||||
else
|
||||
ifeq ($(feature-libbfd-liberty-z), 1)
|
||||
EXTLIBS += -lbfd -lopcodes -liberty -lz
|
||||
FEATURE_CHECK_LDFLAGS-disassembler-four-args += -liberty -lz -ldl
|
||||
+ FEATURE_CHECK_LDFLAGS-disassembler-init-styled += -liberty -lz -ldl
|
||||
endif
|
||||
endif
|
||||
$(call feature_check,disassembler-four-args)
|
||||
+ $(call feature_check,disassembler-init-styled)
|
||||
endif
|
||||
|
||||
ifeq ($(feature-libbfd-buildid), 1)
|
||||
@@ -992,6 +996,10 @@ ifeq ($(feature-disassembler-four-args),
|
||||
CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE
|
||||
endif
|
||||
|
||||
+ifeq ($(feature-disassembler-init-styled), 1)
|
||||
+ CFLAGS += -DDISASM_INIT_STYLED
|
||||
+endif
|
||||
+
|
||||
ifeq (${IS_64_BIT}, 1)
|
||||
ifndef NO_PERF_READ_VDSO32
|
||||
$(call feature_check,compile-32)
|
||||
--- a/tools/perf/util/annotate.c
|
||||
+++ b/tools/perf/util/annotate.c
|
||||
@@ -1694,6 +1694,7 @@ fallback:
|
||||
#include <bpf/btf.h>
|
||||
#include <bpf/libbpf.h>
|
||||
#include <linux/btf.h>
|
||||
+#include <tools/dis-asm-compat.h>
|
||||
|
||||
static int symbol__disassemble_bpf(struct symbol *sym,
|
||||
struct annotate_args *args)
|
||||
@@ -1736,9 +1737,9 @@ static int symbol__disassemble_bpf(struc
|
||||
ret = errno;
|
||||
goto out;
|
||||
}
|
||||
- init_disassemble_info(&info, s,
|
||||
- (fprintf_ftype) fprintf);
|
||||
-
|
||||
+ init_disassemble_info_compat(&info, s,
|
||||
+ (fprintf_ftype) fprintf,
|
||||
+ fprintf_styled);
|
||||
info.arch = bfd_get_arch(bfdf);
|
||||
info.mach = bfd_get_mach(bfdf);
|
||||
|
|
@ -1,102 +0,0 @@
|
|||
From 042e7f11769adac0736d77d76262912b90724d7d Mon Sep 17 00:00:00 2001
|
||||
From: Andres Freund <andres@anarazel.de>
|
||||
Date: Sun, 31 Jul 2022 18:38:31 -0700
|
||||
Subject: [PATCH 4/5] tools bpf_jit_disasm: Fix compilation error with new
|
||||
binutils
|
||||
|
||||
binutils changed the signature of init_disassemble_info(), which now causes
|
||||
compilation to fail for tools/bpf/bpf_jit_disasm.c, e.g. on debian
|
||||
unstable.
|
||||
|
||||
Relevant binutils commit:
|
||||
|
||||
https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=60a3da00bd5407f07
|
||||
|
||||
Wire up the feature test and switch to init_disassemble_info_compat(),
|
||||
which were introduced in prior commits, fixing the compilation failure.
|
||||
|
||||
I verified that bpf_jit_disasm can still disassemble bpf programs, both
|
||||
with the old and new dis-asm.h API. With old binutils there's no change in
|
||||
output before/after this patch. When comparing the output from old
|
||||
binutils (2.35) to new binutils with the patch (upstream snapshot) there
|
||||
are a few output differences, but they are unrelated to this patch. An
|
||||
example hunk is:
|
||||
|
||||
f4: mov %r14,%rsi
|
||||
f7: mov %r15,%rdx
|
||||
fa: mov $0x2a,%ecx
|
||||
- ff: callq 0xffffffffea8c4988
|
||||
+ ff: call 0xffffffffea8c4988
|
||||
104: test %rax,%rax
|
||||
107: jge 0x0000000000000110
|
||||
109: xor %eax,%eax
|
||||
- 10b: jmpq 0x0000000000000073
|
||||
+ 10b: jmp 0x0000000000000073
|
||||
110: cmp $0x16,%rax
|
||||
|
||||
However, I had to use an older kernel to generate the bpf_jit_enabled =
|
||||
2 output, as that has been broken since 5.18 / 1022a5498f6f745c ("bpf,
|
||||
x86_64: Use bpf_jit_binary_pack_alloc").
|
||||
|
||||
https://lore.kernel.org/20220703030210.pmjft7qc2eajzi6c@alap3.anarazel.de
|
||||
|
||||
Signed-off-by: Andres Freund <andres@anarazel.de>
|
||||
Acked-by: Quentin Monnet <quentin@isovalent.com>
|
||||
Cc: Alexei Starovoitov <ast@kernel.org>
|
||||
Cc: Ben Hutchings <benh@debian.org>
|
||||
Cc: Daniel Borkmann <daniel@iogearbox.net>
|
||||
Cc: Jiri Olsa <jolsa@kernel.org>
|
||||
Cc: Quentin Monnet <quentin@isovalent.com>
|
||||
Cc: Sedat Dilek <sedat.dilek@gmail.com>
|
||||
Cc: bpf@vger.kernel.org
|
||||
Link: http://lore.kernel.org/lkml/20220622181918.ykrs5rsnmx3og4sv@alap3.anarazel.de
|
||||
Link: https://lore.kernel.org/r/20220801013834.156015-6-andres@anarazel.de
|
||||
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
|
||||
(cherry picked from commit 96ed066054abf11c7d3e106e3011a51f3f1227a3)
|
||||
---
|
||||
tools/bpf/Makefile | 5 ++++-
|
||||
tools/bpf/bpf_jit_disasm.c | 5 ++++-
|
||||
2 files changed, 8 insertions(+), 2 deletions(-)
|
||||
|
||||
--- a/tools/bpf/Makefile
|
||||
+++ b/tools/bpf/Makefile
|
||||
@@ -34,7 +34,7 @@ else
|
||||
endif
|
||||
|
||||
FEATURE_USER = .bpf
|
||||
-FEATURE_TESTS = libbfd disassembler-four-args
|
||||
+FEATURE_TESTS = libbfd disassembler-four-args disassembler-init-styled
|
||||
FEATURE_DISPLAY = libbfd disassembler-four-args
|
||||
|
||||
check_feat := 1
|
||||
@@ -56,6 +56,9 @@ endif
|
||||
ifeq ($(feature-disassembler-four-args), 1)
|
||||
CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE
|
||||
endif
|
||||
+ifeq ($(feature-disassembler-init-styled), 1)
|
||||
+CFLAGS += -DDISASM_INIT_STYLED
|
||||
+endif
|
||||
|
||||
$(OUTPUT)%.yacc.c: $(srctree)/tools/bpf/%.y
|
||||
$(QUIET_BISON)$(YACC) -o $@ -d $<
|
||||
--- a/tools/bpf/bpf_jit_disasm.c
|
||||
+++ b/tools/bpf/bpf_jit_disasm.c
|
||||
@@ -28,6 +28,7 @@
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <limits.h>
|
||||
+#include <tools/dis-asm-compat.h>
|
||||
|
||||
#define CMD_ACTION_SIZE_BUFFER 10
|
||||
#define CMD_ACTION_READ_ALL 3
|
||||
@@ -64,7 +65,9 @@ static void get_asm_insns(uint8_t *image
|
||||
assert(bfdf);
|
||||
assert(bfd_check_format(bfdf, bfd_object));
|
||||
|
||||
- init_disassemble_info(&info, stdout, (fprintf_ftype) fprintf);
|
||||
+ init_disassemble_info_compat(&info, stdout,
|
||||
+ (fprintf_ftype) fprintf,
|
||||
+ fprintf_styled);
|
||||
info.arch = bfd_get_arch(bfdf);
|
||||
info.mach = bfd_get_mach(bfdf);
|
||||
info.buffer = image;
|
|
@ -1,146 +0,0 @@
|
|||
From a82db18ab34ba7f9d38319e8cc01ffe382e3e55e Mon Sep 17 00:00:00 2001
|
||||
From: Andres Freund <andres@anarazel.de>
|
||||
Date: Sun, 31 Jul 2022 18:38:33 -0700
|
||||
Subject: [PATCH 5/5] tools bpftool: Fix compilation error with new binutils
|
||||
|
||||
binutils changed the signature of init_disassemble_info(), which now causes
|
||||
compilation to fail for tools/bpf/bpftool/jit_disasm.c, e.g. on debian
|
||||
unstable.
|
||||
|
||||
Relevant binutils commit:
|
||||
|
||||
https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=60a3da00bd5407f07
|
||||
|
||||
Wire up the feature test and switch to init_disassemble_info_compat(),
|
||||
which were introduced in prior commits, fixing the compilation failure.
|
||||
|
||||
I verified that bpftool can still disassemble bpf programs, both with an
|
||||
old and new dis-asm.h API. There are no output changes for plain and json
|
||||
formats. When comparing the output from old binutils (2.35)
|
||||
to new binutils with the patch (upstream snapshot) there are a few output
|
||||
differences, but they are unrelated to this patch. An example hunk is:
|
||||
|
||||
2f: pop %r14
|
||||
31: pop %r13
|
||||
33: pop %rbx
|
||||
- 34: leaveq
|
||||
- 35: retq
|
||||
+ 34: leave
|
||||
+ 35: ret
|
||||
|
||||
Signed-off-by: Andres Freund <andres@anarazel.de>
|
||||
Acked-by: Quentin Monnet <quentin@isovalent.com>
|
||||
Cc: Alexei Starovoitov <ast@kernel.org>
|
||||
Cc: Ben Hutchings <benh@debian.org>
|
||||
Cc: Jiri Olsa <jolsa@kernel.org>
|
||||
Cc: Quentin Monnet <quentin@isovalent.com>
|
||||
Cc: Sedat Dilek <sedat.dilek@gmail.com>
|
||||
Cc: bpf@vger.kernel.org
|
||||
Link: http://lore.kernel.org/lkml/20220622181918.ykrs5rsnmx3og4sv@alap3.anarazel.de
|
||||
Link: https://lore.kernel.org/r/20220801013834.156015-8-andres@anarazel.de
|
||||
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
|
||||
(cherry picked from commit 600b7b26c07a070d0153daa76b3806c1e52c9e00)
|
||||
---
|
||||
tools/bpf/bpftool/Makefile | 5 +++-
|
||||
tools/bpf/bpftool/jit_disasm.c | 42 +++++++++++++++++++++++++++-------
|
||||
2 files changed, 38 insertions(+), 9 deletions(-)
|
||||
|
||||
--- a/tools/bpf/bpftool/Makefile
|
||||
+++ b/tools/bpf/bpftool/Makefile
|
||||
@@ -76,7 +76,7 @@ INSTALL ?= install
|
||||
RM ?= rm -f
|
||||
|
||||
FEATURE_USER = .bpftool
|
||||
-FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib libcap \
|
||||
+FEATURE_TESTS = libbfd disassembler-four-args disassembler-init-styled reallocarray zlib libcap \
|
||||
clang-bpf-co-re
|
||||
FEATURE_DISPLAY = libbfd disassembler-four-args zlib libcap \
|
||||
clang-bpf-co-re
|
||||
@@ -100,6 +100,9 @@ endif
|
||||
ifeq ($(feature-disassembler-four-args), 1)
|
||||
CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE
|
||||
endif
|
||||
+ifeq ($(feature-disassembler-init-styled), 1)
|
||||
+ CFLAGS += -DDISASM_INIT_STYLED
|
||||
+endif
|
||||
|
||||
ifeq ($(feature-reallocarray), 0)
|
||||
CFLAGS += -DCOMPAT_NEED_REALLOCARRAY
|
||||
--- a/tools/bpf/bpftool/jit_disasm.c
|
||||
+++ b/tools/bpf/bpftool/jit_disasm.c
|
||||
@@ -24,6 +24,7 @@
|
||||
#include <sys/stat.h>
|
||||
#include <limits.h>
|
||||
#include <bpf/libbpf.h>
|
||||
+#include <tools/dis-asm-compat.h>
|
||||
|
||||
#include "json_writer.h"
|
||||
#include "main.h"
|
||||
@@ -39,15 +40,12 @@ static void get_exec_path(char *tpath, s
|
||||
}
|
||||
|
||||
static int oper_count;
|
||||
-static int fprintf_json(void *out, const char *fmt, ...)
|
||||
+static int printf_json(void *out, const char *fmt, va_list ap)
|
||||
{
|
||||
- va_list ap;
|
||||
char *s;
|
||||
int err;
|
||||
|
||||
- va_start(ap, fmt);
|
||||
err = vasprintf(&s, fmt, ap);
|
||||
- va_end(ap);
|
||||
if (err < 0)
|
||||
return -1;
|
||||
|
||||
@@ -73,6 +71,32 @@ static int fprintf_json(void *out, const
|
||||
return 0;
|
||||
}
|
||||
|
||||
+static int fprintf_json(void *out, const char *fmt, ...)
|
||||
+{
|
||||
+ va_list ap;
|
||||
+ int r;
|
||||
+
|
||||
+ va_start(ap, fmt);
|
||||
+ r = printf_json(out, fmt, ap);
|
||||
+ va_end(ap);
|
||||
+
|
||||
+ return r;
|
||||
+}
|
||||
+
|
||||
+static int fprintf_json_styled(void *out,
|
||||
+ enum disassembler_style style __maybe_unused,
|
||||
+ const char *fmt, ...)
|
||||
+{
|
||||
+ va_list ap;
|
||||
+ int r;
|
||||
+
|
||||
+ va_start(ap, fmt);
|
||||
+ r = printf_json(out, fmt, ap);
|
||||
+ va_end(ap);
|
||||
+
|
||||
+ return r;
|
||||
+}
|
||||
+
|
||||
void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes,
|
||||
const char *arch, const char *disassembler_options,
|
||||
const struct btf *btf,
|
||||
@@ -99,11 +123,13 @@ void disasm_print_insn(unsigned char *im
|
||||
assert(bfd_check_format(bfdf, bfd_object));
|
||||
|
||||
if (json_output)
|
||||
- init_disassemble_info(&info, stdout,
|
||||
- (fprintf_ftype) fprintf_json);
|
||||
+ init_disassemble_info_compat(&info, stdout,
|
||||
+ (fprintf_ftype) fprintf_json,
|
||||
+ fprintf_json_styled);
|
||||
else
|
||||
- init_disassemble_info(&info, stdout,
|
||||
- (fprintf_ftype) fprintf);
|
||||
+ init_disassemble_info_compat(&info, stdout,
|
||||
+ (fprintf_ftype) fprintf,
|
||||
+ fprintf_styled);
|
||||
|
||||
/* Update architecture info for offload. */
|
||||
if (arch) {
|
|
@ -0,0 +1,90 @@
|
|||
From 86fc59ef818beb0e1945d17f8e734898baba7e4e Mon Sep 17 00:00:00 2001
|
||||
From: Colin Foster <colin.foster@in-advantage.com>
|
||||
Date: Sun, 13 Mar 2022 15:45:23 -0700
|
||||
Subject: [PATCH 1/2] regmap: add configurable downshift for addresses
|
||||
|
||||
Add an additional reg_downshift to be applied to register addresses before
|
||||
any register accesses. An example of a device that uses this is a VSC7514
|
||||
chip, which requires each register address to be downshifted by two if the
|
||||
access is performed over a SPI bus.
|
||||
|
||||
Signed-off-by: Colin Foster <colin.foster@in-advantage.com>
|
||||
Link: https://lore.kernel.org/r/20220313224524.399947-2-colin.foster@in-advantage.com
|
||||
Signed-off-by: Mark Brown <broonie@kernel.org>
|
||||
---
|
||||
drivers/base/regmap/internal.h | 1 +
|
||||
drivers/base/regmap/regmap.c | 5 +++++
|
||||
include/linux/regmap.h | 3 +++
|
||||
3 files changed, 9 insertions(+)
|
||||
|
||||
--- a/drivers/base/regmap/internal.h
|
||||
+++ b/drivers/base/regmap/internal.h
|
||||
@@ -31,6 +31,7 @@ struct regmap_format {
|
||||
size_t buf_size;
|
||||
size_t reg_bytes;
|
||||
size_t pad_bytes;
|
||||
+ size_t reg_downshift;
|
||||
size_t val_bytes;
|
||||
void (*format_write)(struct regmap *map,
|
||||
unsigned int reg, unsigned int val);
|
||||
--- a/drivers/base/regmap/regmap.c
|
||||
+++ b/drivers/base/regmap/regmap.c
|
||||
@@ -823,6 +823,7 @@ struct regmap *__regmap_init(struct devi
|
||||
|
||||
map->format.reg_bytes = DIV_ROUND_UP(config->reg_bits, 8);
|
||||
map->format.pad_bytes = config->pad_bits / 8;
|
||||
+ map->format.reg_downshift = config->reg_downshift;
|
||||
map->format.val_bytes = DIV_ROUND_UP(config->val_bits, 8);
|
||||
map->format.buf_size = DIV_ROUND_UP(config->reg_bits +
|
||||
config->val_bits + config->pad_bits, 8);
|
||||
@@ -1735,6 +1736,7 @@ static int _regmap_raw_write_impl(struct
|
||||
return ret;
|
||||
}
|
||||
|
||||
+ reg >>= map->format.reg_downshift;
|
||||
map->format.format_reg(map->work_buf, reg, map->reg_shift);
|
||||
regmap_set_work_buf_flag_mask(map, map->format.reg_bytes,
|
||||
map->write_flag_mask);
|
||||
@@ -1905,6 +1907,7 @@ static int _regmap_bus_formatted_write(v
|
||||
return ret;
|
||||
}
|
||||
|
||||
+ reg >>= map->format.reg_downshift;
|
||||
map->format.format_write(map, reg, val);
|
||||
|
||||
trace_regmap_hw_write_start(map, reg, 1);
|
||||
@@ -2346,6 +2349,7 @@ static int _regmap_raw_multi_reg_write(s
|
||||
unsigned int reg = regs[i].reg;
|
||||
unsigned int val = regs[i].def;
|
||||
trace_regmap_hw_write_start(map, reg, 1);
|
||||
+ reg >>= map->format.reg_downshift;
|
||||
map->format.format_reg(u8, reg, map->reg_shift);
|
||||
u8 += reg_bytes + pad_bytes;
|
||||
map->format.format_val(u8, val, 0);
|
||||
@@ -2673,6 +2677,7 @@ static int _regmap_raw_read(struct regma
|
||||
return ret;
|
||||
}
|
||||
|
||||
+ reg >>= map->format.reg_downshift;
|
||||
map->format.format_reg(map->work_buf, reg, map->reg_shift);
|
||||
regmap_set_work_buf_flag_mask(map, map->format.reg_bytes,
|
||||
map->read_flag_mask);
|
||||
--- a/include/linux/regmap.h
|
||||
+++ b/include/linux/regmap.h
|
||||
@@ -237,6 +237,8 @@ typedef void (*regmap_unlock)(void *);
|
||||
* @reg_stride: The register address stride. Valid register addresses are a
|
||||
* multiple of this value. If set to 0, a value of 1 will be
|
||||
* used.
|
||||
+ * @reg_downshift: The number of bits to downshift the register before
|
||||
+ * performing any operations.
|
||||
* @pad_bits: Number of bits of padding between register and value.
|
||||
* @val_bits: Number of bits in a register value, mandatory.
|
||||
*
|
||||
@@ -360,6 +362,7 @@ struct regmap_config {
|
||||
|
||||
int reg_bits;
|
||||
int reg_stride;
|
||||
+ int reg_downshift;
|
||||
int pad_bits;
|
||||
int val_bits;
|
||||
|
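Illustration only, not taken from the patch: a hypothetical SPI-attached device whose documented register addresses need to be shifted down by two before they go on the wire would simply set the new field in its regmap_config (the example_* name and values are invented, devm_regmap_init_spi() is the usual SPI regmap constructor):

    static const struct regmap_config example_spi_regmap_config = {
    	.reg_bits      = 24,
    	.val_bits      = 32,
    	/* every address is shifted right by two before it is formatted
    	 * for the bus, as implemented in the hunks above */
    	.reg_downshift = 2,
    };

    /* in probe(): map = devm_regmap_init_spi(spi, &example_spi_regmap_config); */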
|
@ -0,0 +1,95 @@
|
|||
From 0074f3f2b1e43d3cedd97e47fb6980db6d2ba79e Mon Sep 17 00:00:00 2001
|
||||
From: Colin Foster <colin.foster@in-advantage.com>
|
||||
Date: Sun, 13 Mar 2022 15:45:24 -0700
|
||||
Subject: [PATCH 2/2] regmap: allow a defined reg_base to be added to every
|
||||
address
|
||||
|
||||
There's an inconsistency that arises when a register set can be accessed
|
||||
internally via MMIO, or externally via SPI. The VSC7514 chip allows both
|
||||
modes of operation. When internally accessed, the system utilizes __iomem,
|
||||
devm_ioremap_resource, and devm_regmap_init_mmio.
|
||||
|
||||
For SPI it isn't possible to utilize memory-mapped IO. To properly operate,
|
||||
the resource base must be added to the register before every operation.
|
||||
|
||||
Signed-off-by: Colin Foster <colin.foster@in-advantage.com>
|
||||
Link: https://lore.kernel.org/r/20220313224524.399947-3-colin.foster@in-advantage.com
|
||||
Signed-off-by: Mark Brown <broonie@kernel.org>
|
||||
---
|
||||
drivers/base/regmap/internal.h | 1 +
|
||||
drivers/base/regmap/regmap.c | 6 ++++++
|
||||
include/linux/regmap.h | 3 +++
|
||||
3 files changed, 10 insertions(+)
|
||||
|
||||
--- a/drivers/base/regmap/internal.h
|
||||
+++ b/drivers/base/regmap/internal.h
|
||||
@@ -63,6 +63,7 @@ struct regmap {
|
||||
regmap_unlock unlock;
|
||||
void *lock_arg; /* This is passed to lock/unlock functions */
|
||||
gfp_t alloc_flags;
|
||||
+ unsigned int reg_base;
|
||||
|
||||
struct device *dev; /* Device we do I/O on */
|
||||
void *work_buf; /* Scratch buffer used to format I/O */
|
||||
--- a/drivers/base/regmap/regmap.c
|
||||
+++ b/drivers/base/regmap/regmap.c
|
||||
@@ -821,6 +821,8 @@ struct regmap *__regmap_init(struct devi
|
||||
else
|
||||
map->alloc_flags = GFP_KERNEL;
|
||||
|
||||
+ map->reg_base = config->reg_base;
|
||||
+
|
||||
map->format.reg_bytes = DIV_ROUND_UP(config->reg_bits, 8);
|
||||
map->format.pad_bytes = config->pad_bits / 8;
|
||||
map->format.reg_downshift = config->reg_downshift;
|
||||
@@ -1736,6 +1738,7 @@ static int _regmap_raw_write_impl(struct
|
||||
return ret;
|
||||
}
|
||||
|
||||
+ reg += map->reg_base;
|
||||
reg >>= map->format.reg_downshift;
|
||||
map->format.format_reg(map->work_buf, reg, map->reg_shift);
|
||||
regmap_set_work_buf_flag_mask(map, map->format.reg_bytes,
|
||||
@@ -1907,6 +1910,7 @@ static int _regmap_bus_formatted_write(v
|
||||
return ret;
|
||||
}
|
||||
|
||||
+ reg += map->reg_base;
|
||||
reg >>= map->format.reg_downshift;
|
||||
map->format.format_write(map, reg, val);
|
||||
|
||||
@@ -2349,6 +2353,7 @@ static int _regmap_raw_multi_reg_write(s
|
||||
unsigned int reg = regs[i].reg;
|
||||
unsigned int val = regs[i].def;
|
||||
trace_regmap_hw_write_start(map, reg, 1);
|
||||
+ reg += map->reg_base;
|
||||
reg >>= map->format.reg_downshift;
|
||||
map->format.format_reg(u8, reg, map->reg_shift);
|
||||
u8 += reg_bytes + pad_bytes;
|
||||
@@ -2677,6 +2682,7 @@ static int _regmap_raw_read(struct regma
|
||||
return ret;
|
||||
}
|
||||
|
||||
+ reg += map->reg_base;
|
||||
reg >>= map->format.reg_downshift;
|
||||
map->format.format_reg(map->work_buf, reg, map->reg_shift);
|
||||
regmap_set_work_buf_flag_mask(map, map->format.reg_bytes,
|
||||
--- a/include/linux/regmap.h
|
||||
+++ b/include/linux/regmap.h
|
||||
@@ -239,6 +239,8 @@ typedef void (*regmap_unlock)(void *);
|
||||
* used.
|
||||
* @reg_downshift: The number of bits to downshift the register before
|
||||
* performing any operations.
|
||||
+ * @reg_base: Value to be added to every register address before performing any
|
||||
+ * operation.
|
||||
* @pad_bits: Number of bits of padding between register and value.
|
||||
* @val_bits: Number of bits in a register value, mandatory.
|
||||
*
|
||||
@@ -363,6 +365,7 @@ struct regmap_config {
|
||||
int reg_bits;
|
||||
int reg_stride;
|
||||
int reg_downshift;
|
||||
+ unsigned int reg_base;
|
||||
int pad_bits;
|
||||
int val_bits;
|
||||
|
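Combined with reg_downshift from the previous patch, the address that reaches the bus is (reg + reg_base) >> reg_downshift, because reg_base is added before the shift in every patched path. A hedged sketch with made-up values:

    static const struct regmap_config example_external_config = {
    	.reg_bits      = 32,
    	.val_bits      = 32,
    	.reg_base      = 0x71000000,	/* added to every address first */
    	.reg_downshift = 2,		/* then shifted before hitting the bus */
    };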
|
@ -0,0 +1,57 @@
|
|||
From 697c3892d825fb78f42ec8e53bed065dd728db3e Mon Sep 17 00:00:00 2001
|
||||
From: Daniel Golle <daniel@makrotopia.org>
|
||||
Date: Mon, 30 Jan 2023 02:04:57 +0000
|
||||
Subject: [PATCH] regmap: apply reg_base and reg_downshift for single register
|
||||
ops
|
||||
|
||||
reg_base and reg_downshift currently don't have any effect if used with
|
||||
a regmap_bus or regmap_config which only offers single register
|
||||
operations (ie. reg_read, reg_write and optionally reg_update_bits).
|
||||
|
||||
Fix that and take them into account also for regmap_bus with only
|
||||
reg_read and reg_write operations by applying reg_base and
|
||||
reg_downshift in _regmap_bus_reg_write, _regmap_bus_reg_read.
|
||||
|
||||
Also apply reg_base and reg_downshift in _regmap_update_bits, but only
|
||||
in case the operation is carried out with a reg_update_bits call
|
||||
defined in either regmap_bus or regmap_config.
|
||||
|
||||
Fixes: 0074f3f2b1e43d ("regmap: allow a defined reg_base to be added to every address")
|
||||
Fixes: 86fc59ef818beb ("regmap: add configurable downshift for addresses")
|
||||
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
|
||||
Tested-by: Colin Foster <colin.foster@in-advantage.com>
|
||||
Link: https://lore.kernel.org/r/Y9clyVS3tQEHlUhA@makrotopia.org
|
||||
Signed-off-by: Mark Brown <broonie@kernel.org>
|
||||
---
|
||||
drivers/base/regmap/regmap.c | 6 ++++++
|
||||
1 file changed, 6 insertions(+)
|
||||
|
||||
--- a/drivers/base/regmap/regmap.c
|
||||
+++ b/drivers/base/regmap/regmap.c
|
||||
@@ -1929,6 +1929,8 @@ static int _regmap_bus_reg_write(void *c
|
||||
{
|
||||
struct regmap *map = context;
|
||||
|
||||
+ reg += map->reg_base;
|
||||
+ reg >>= map->format.reg_downshift;
|
||||
return map->bus->reg_write(map->bus_context, reg, val);
|
||||
}
|
||||
|
||||
@@ -2703,6 +2705,8 @@ static int _regmap_bus_reg_read(void *co
|
||||
{
|
||||
struct regmap *map = context;
|
||||
|
||||
+ reg += map->reg_base;
|
||||
+ reg >>= map->format.reg_downshift;
|
||||
return map->bus->reg_read(map->bus_context, reg, val);
|
||||
}
|
||||
|
||||
@@ -3078,6 +3082,8 @@ static int _regmap_update_bits(struct re
|
||||
*change = false;
|
||||
|
||||
if (regmap_volatile(map, reg) && map->reg_update_bits) {
|
||||
+ reg += map->reg_base;
|
||||
+ reg >>= map->format.reg_downshift;
|
||||
ret = map->reg_update_bits(map->bus_context, reg, mask, val);
|
||||
if (ret == 0 && change)
|
||||
*change = true;
|
|
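The paths fixed here are the ones taken by a regmap_bus that only implements per-register callbacks. A hypothetical sketch of such a bus (example_* names invented); after this fix its callbacks receive reg with reg_base and reg_downshift already applied, consistent with the formatted-I/O paths patched earlier:

    static int example_bus_reg_read(void *context, unsigned int reg, unsigned int *val)
    {
    	/* context is the transport handle; reg is already translated */
    	*val = 0;
    	return 0;
    }

    static int example_bus_reg_write(void *context, unsigned int reg, unsigned int val)
    {
    	/* issue the write for the translated address */
    	return 0;
    }

    static const struct regmap_bus example_single_reg_bus = {
    	.reg_read  = example_bus_reg_read,
    	.reg_write = example_bus_reg_write,
    };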
@ -0,0 +1,35 @@
|
|||
From ebed787a0becb9354f0a23620a5130cccd6c730c Mon Sep 17 00:00:00 2001
|
||||
From: Daniel Golle <daniel@makrotopia.org>
|
||||
Date: Thu, 19 Jan 2023 03:45:43 +0000
|
||||
Subject: [PATCH] mtd: spinand: macronix: use scratch buffer for DMA operation
|
||||
|
||||
The mx35lf1ge4ab_get_eccsr() function uses an SPI DMA operation to
|
||||
read the eccsr, hence the buffer should not be on stack. Since commit
|
||||
380583227c0c7f ("spi: spi-mem: Add extra sanity checks on the op param")
|
||||
the kernel emits a warning and blocks such operations.
|
||||
|
||||
Use the scratch buffer to get eccsr instead of trying to directly read
|
||||
into a stack-allocated variable.
|
||||
|
||||
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
|
||||
Reviewed-by: Dhruva Gole <d-gole@ti.com>
|
||||
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
|
||||
Link: https://lore.kernel.org/linux-mtd/Y8i85zM0u4XdM46z@makrotopia.org
|
||||
---
|
||||
drivers/mtd/nand/spi/macronix.c | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/drivers/mtd/nand/spi/macronix.c
|
||||
+++ b/drivers/mtd/nand/spi/macronix.c
|
||||
@@ -83,9 +83,10 @@ static int mx35lf1ge4ab_ecc_get_status(s
|
||||
* in order to avoid forcing the wear-leveling layer to move
|
||||
* data around if it's not necessary.
|
||||
*/
|
||||
- if (mx35lf1ge4ab_get_eccsr(spinand, &eccsr))
|
||||
+ if (mx35lf1ge4ab_get_eccsr(spinand, spinand->scratchbuf))
|
||||
return nanddev_get_ecc_conf(nand)->strength;
|
||||
|
||||
+ eccsr = *spinand->scratchbuf;
|
||||
if (WARN_ON(eccsr > nanddev_get_ecc_conf(nand)->strength ||
|
||||
!eccsr))
|
||||
return nanddev_get_ecc_conf(nand)->strength;
|
|
@ -20,7 +20,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
#include <linux/mfd/syscon.h>
|
||||
#include <linux/regmap.h>
|
||||
#include <linux/clk.h>
|
||||
@@ -839,7 +840,7 @@ static int mtk_init_fq_dma(struct mtk_et
|
||||
@@ -840,7 +841,7 @@ static int mtk_init_fq_dma(struct mtk_et
|
||||
dma_addr_t dma_addr;
|
||||
int i;
|
||||
|
||||
|
@ -29,7 +29,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
cnt * sizeof(struct mtk_tx_dma),
|
||||
ð->phy_scratch_ring,
|
||||
GFP_ATOMIC);
|
||||
@@ -851,10 +852,10 @@ static int mtk_init_fq_dma(struct mtk_et
|
||||
@@ -852,10 +853,10 @@ static int mtk_init_fq_dma(struct mtk_et
|
||||
if (unlikely(!eth->scratch_head))
|
||||
return -ENOMEM;
|
||||
|
||||
|
@ -42,7 +42,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
return -ENOMEM;
|
||||
|
||||
phy_ring_tail = eth->phy_scratch_ring +
|
||||
@@ -908,26 +909,26 @@ static void mtk_tx_unmap(struct mtk_eth
|
||||
@@ -909,26 +910,26 @@ static void mtk_tx_unmap(struct mtk_eth
|
||||
{
|
||||
if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) {
|
||||
if (tx_buf->flags & MTK_TX_FLAGS_SINGLE0) {
|
||||
|
@ -73,7 +73,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
dma_unmap_addr(tx_buf, dma_addr1),
|
||||
dma_unmap_len(tx_buf, dma_len1),
|
||||
DMA_TO_DEVICE);
|
||||
@@ -1005,9 +1006,9 @@ static int mtk_tx_map(struct sk_buff *sk
|
||||
@@ -1006,9 +1007,9 @@ static int mtk_tx_map(struct sk_buff *sk
|
||||
if (skb_vlan_tag_present(skb))
|
||||
txd4 |= TX_DMA_INS_VLAN | skb_vlan_tag_get(skb);
|
||||
|
||||
|
@ -85,7 +85,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
return -ENOMEM;
|
||||
|
||||
WRITE_ONCE(itxd->txd1, mapped_addr);
|
||||
@@ -1046,10 +1047,10 @@ static int mtk_tx_map(struct sk_buff *sk
|
||||
@@ -1047,10 +1048,10 @@ static int mtk_tx_map(struct sk_buff *sk
|
||||
|
||||
|
||||
frag_map_size = min(frag_size, MTK_TX_DMA_BUF_LEN);
|
||||
|
@ -98,7 +98,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
goto err_dma;
|
||||
|
||||
if (i == nr_frags - 1 &&
|
||||
@@ -1330,18 +1331,18 @@ static int mtk_poll_rx(struct napi_struc
|
||||
@@ -1331,18 +1332,18 @@ static int mtk_poll_rx(struct napi_struc
|
||||
netdev->stats.rx_dropped++;
|
||||
goto release_desc;
|
||||
}
|
||||
|
@ -120,7 +120,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
ring->buf_size, DMA_FROM_DEVICE);
|
||||
|
||||
/* receive data */
|
||||
@@ -1614,7 +1615,7 @@ static int mtk_tx_alloc(struct mtk_eth *
|
||||
@@ -1615,7 +1616,7 @@ static int mtk_tx_alloc(struct mtk_eth *
|
||||
if (!ring->buf)
|
||||
goto no_tx_mem;
|
||||
|
||||
|
@ -129,7 +129,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
&ring->phys, GFP_ATOMIC);
|
||||
if (!ring->dma)
|
||||
goto no_tx_mem;
|
||||
@@ -1632,7 +1633,7 @@ static int mtk_tx_alloc(struct mtk_eth *
|
||||
@@ -1633,7 +1634,7 @@ static int mtk_tx_alloc(struct mtk_eth *
|
||||
* descriptors in ring->dma_pdma.
|
||||
*/
|
||||
if (!MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) {
|
||||
|
@ -138,7 +138,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
&ring->phys_pdma,
|
||||
GFP_ATOMIC);
|
||||
if (!ring->dma_pdma)
|
||||
@@ -1691,7 +1692,7 @@ static void mtk_tx_clean(struct mtk_eth
|
||||
@@ -1692,7 +1693,7 @@ static void mtk_tx_clean(struct mtk_eth
|
||||
}
|
||||
|
||||
if (ring->dma) {
|
||||
|
@ -147,7 +147,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
MTK_DMA_SIZE * sizeof(*ring->dma),
|
||||
ring->dma,
|
||||
ring->phys);
|
||||
@@ -1699,7 +1700,7 @@ static void mtk_tx_clean(struct mtk_eth
|
||||
@@ -1700,7 +1701,7 @@ static void mtk_tx_clean(struct mtk_eth
|
||||
}
|
||||
|
||||
if (ring->dma_pdma) {
|
||||
|
@ -156,7 +156,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
MTK_DMA_SIZE * sizeof(*ring->dma_pdma),
|
||||
ring->dma_pdma,
|
||||
ring->phys_pdma);
|
||||
@@ -1747,18 +1748,18 @@ static int mtk_rx_alloc(struct mtk_eth *
|
||||
@@ -1748,18 +1749,18 @@ static int mtk_rx_alloc(struct mtk_eth *
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
|
@ -178,7 +178,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
return -ENOMEM;
|
||||
ring->dma[i].rxd1 = (unsigned int)dma_addr;
|
||||
|
||||
@@ -1794,7 +1795,7 @@ static void mtk_rx_clean(struct mtk_eth
|
||||
@@ -1795,7 +1796,7 @@ static void mtk_rx_clean(struct mtk_eth
|
||||
continue;
|
||||
if (!ring->dma[i].rxd1)
|
||||
continue;
|
||||
|
@ -187,7 +187,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
ring->dma[i].rxd1,
|
||||
ring->buf_size,
|
||||
DMA_FROM_DEVICE);
|
||||
@@ -1805,7 +1806,7 @@ static void mtk_rx_clean(struct mtk_eth
|
||||
@@ -1806,7 +1807,7 @@ static void mtk_rx_clean(struct mtk_eth
|
||||
}
|
||||
|
||||
if (ring->dma) {
|
||||
|
@ -196,7 +196,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
ring->dma_size * sizeof(*ring->dma),
|
||||
ring->dma,
|
||||
ring->phys);
|
||||
@@ -2161,7 +2162,7 @@ static void mtk_dma_free(struct mtk_eth
|
||||
@@ -2162,7 +2163,7 @@ static void mtk_dma_free(struct mtk_eth
|
||||
if (eth->netdev[i])
|
||||
netdev_reset_queue(eth->netdev[i]);
|
||||
if (eth->scratch_ring) {
|
||||
|
@ -205,7 +205,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
MTK_DMA_SIZE * sizeof(struct mtk_tx_dma),
|
||||
eth->scratch_ring,
|
||||
eth->phy_scratch_ring);
|
||||
@@ -2513,6 +2514,8 @@ static void mtk_dim_tx(struct work_struc
|
||||
@@ -2514,6 +2515,8 @@ static void mtk_dim_tx(struct work_struc
|
||||
|
||||
static int mtk_hw_init(struct mtk_eth *eth)
|
||||
{
|
||||
|
@ -214,7 +214,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
int i, val, ret;
|
||||
|
||||
if (test_and_set_bit(MTK_HW_INIT, ð->state))
|
||||
@@ -2525,6 +2528,10 @@ static int mtk_hw_init(struct mtk_eth *e
|
||||
@@ -2526,6 +2529,10 @@ static int mtk_hw_init(struct mtk_eth *e
|
||||
if (ret)
|
||||
goto err_disable_pm;
|
||||
|
||||
|
@ -225,7 +225,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
|
||||
ret = device_reset(eth->dev);
|
||||
if (ret) {
|
||||
@@ -3078,6 +3085,35 @@ free_netdev:
|
||||
@@ -3079,6 +3086,35 @@ free_netdev:
|
||||
return err;
|
||||
}
|
||||
|
||||
|
@ -261,7 +261,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
static int mtk_probe(struct platform_device *pdev)
|
||||
{
|
||||
struct device_node *mac_np;
|
||||
@@ -3091,6 +3127,7 @@ static int mtk_probe(struct platform_dev
|
||||
@@ -3092,6 +3128,7 @@ static int mtk_probe(struct platform_dev
|
||||
eth->soc = of_device_get_match_data(&pdev->dev);
|
||||
|
||||
eth->dev = &pdev->dev;
|
||||
|
@ -269,7 +269,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
eth->base = devm_platform_ioremap_resource(pdev, 0);
|
||||
if (IS_ERR(eth->base))
|
||||
return PTR_ERR(eth->base);
|
||||
@@ -3139,6 +3176,16 @@ static int mtk_probe(struct platform_dev
|
||||
@@ -3140,6 +3177,16 @@ static int mtk_probe(struct platform_dev
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -288,7 +288,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
GFP_KERNEL);
|
||||
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
|
||||
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
|
||||
@@ -462,6 +462,12 @@
|
||||
@@ -463,6 +463,12 @@
|
||||
#define RSTCTRL_FE BIT(6)
|
||||
#define RSTCTRL_PPE BIT(31)
|
||||
|
||||
|
@ -301,7 +301,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
/* SGMII subsystem config registers */
|
||||
/* Register to auto-negotiation restart */
|
||||
#define SGMSYS_PCS_CONTROL_1 0x0
|
||||
@@ -879,6 +885,7 @@ struct mtk_sgmii {
|
||||
@@ -880,6 +886,7 @@ struct mtk_sgmii {
|
||||
/* struct mtk_eth - This is the main datasructure for holding the state
|
||||
* of the driver
|
||||
* @dev: The device pointer
|
||||
|
@ -309,7 +309,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
* @base: The mapped register i/o base
|
||||
* @page_lock: Make sure that register operations are atomic
|
||||
* @tx_irq__lock: Make sure that IRQ register operations are atomic
|
||||
@@ -922,6 +929,7 @@ struct mtk_sgmii {
|
||||
@@ -923,6 +930,7 @@ struct mtk_sgmii {
|
||||
|
||||
struct mtk_eth {
|
||||
struct device *dev;
|
||||
|
@ -317,7 +317,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
void __iomem *base;
|
||||
spinlock_t page_lock;
|
||||
spinlock_t tx_irq_lock;
|
||||
@@ -1020,6 +1028,7 @@ int mtk_gmac_rgmii_path_setup(struct mtk
|
||||
@@ -1021,6 +1029,7 @@ int mtk_gmac_rgmii_path_setup(struct mtk
|
||||
int mtk_eth_offload_init(struct mtk_eth *eth);
|
||||
int mtk_eth_setup_tc(struct net_device *dev, enum tc_setup_type type,
|
||||
void *type_data);
|
||||
|
|
|
@ -56,7 +56,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
|
||||
static int mtk_msg_level = -1;
|
||||
module_param_named(msg_level, mtk_msg_level, int, 0);
|
||||
@@ -3208,6 +3209,22 @@ static int mtk_probe(struct platform_dev
|
||||
@@ -3209,6 +3210,22 @@ static int mtk_probe(struct platform_dev
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -233,7 +233,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
static inline void
|
||||
--- a/include/linux/netdevice.h
|
||||
+++ b/include/linux/netdevice.h
|
||||
@@ -870,6 +870,7 @@ enum net_device_path_type {
|
||||
@@ -872,6 +872,7 @@ enum net_device_path_type {
|
||||
DEV_PATH_BRIDGE,
|
||||
DEV_PATH_PPPOE,
|
||||
DEV_PATH_DSA,
|
||||
|
@ -241,7 +241,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
};
|
||||
|
||||
struct net_device_path {
|
||||
@@ -895,6 +896,12 @@ struct net_device_path {
|
||||
@@ -897,6 +898,12 @@ struct net_device_path {
|
||||
int port;
|
||||
u16 proto;
|
||||
} dsa;
|
||||
|
|
|
@ -10,7 +10,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
|
||||
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
|
||||
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
|
||||
@@ -2334,7 +2334,7 @@ static int mtk_open(struct net_device *d
|
||||
@@ -2335,7 +2335,7 @@ static int mtk_open(struct net_device *d
|
||||
return err;
|
||||
}
|
||||
|
||||
|
@ -19,7 +19,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
gdm_config = MTK_GDMA_TO_PPE;
|
||||
|
||||
mtk_gdm_config(eth, gdm_config);
|
||||
@@ -2408,7 +2408,7 @@ static int mtk_stop(struct net_device *d
|
||||
@@ -2409,7 +2409,7 @@ static int mtk_stop(struct net_device *d
|
||||
mtk_dma_free(eth);
|
||||
|
||||
if (eth->soc->offload_version)
|
||||
|
@ -28,7 +28,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
|
||||
return 0;
|
||||
}
|
||||
@@ -3300,10 +3300,11 @@ static int mtk_probe(struct platform_dev
|
||||
@@ -3301,10 +3301,11 @@ static int mtk_probe(struct platform_dev
|
||||
}
|
||||
|
||||
if (eth->soc->offload_version) {
|
||||
|
@ -45,7 +45,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
if (err)
|
||||
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
|
||||
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
|
||||
@@ -982,7 +982,7 @@ struct mtk_eth {
|
||||
@@ -983,7 +983,7 @@ struct mtk_eth {
|
||||
u32 rx_dma_l4_valid;
|
||||
int ip_align;
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
#include <net/dsa.h>
|
||||
|
||||
#include "mtk_eth_soc.h"
|
||||
@@ -1292,7 +1293,7 @@ static int mtk_poll_rx(struct napi_struc
|
||||
@@ -1293,7 +1294,7 @@ static int mtk_poll_rx(struct napi_struc
|
||||
struct net_device *netdev;
|
||||
unsigned int pktlen;
|
||||
dma_addr_t dma_addr;
|
||||
|
@ -42,7 +42,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
int mac;
|
||||
|
||||
ring = mtk_get_rx_ring(eth);
|
||||
@@ -1371,6 +1372,11 @@ static int mtk_poll_rx(struct napi_struc
|
||||
@@ -1372,6 +1373,11 @@ static int mtk_poll_rx(struct napi_struc
|
||||
skb_set_hash(skb, hash, PKT_HASH_TYPE_L4);
|
||||
}
|
||||
|
||||
|
@ -54,7 +54,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
|
|||
if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX &&
|
||||
(trxd.rxd2 & RX_DMA_VTAG))
|
||||
__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
|
||||
@@ -3300,7 +3306,7 @@ static int mtk_probe(struct platform_dev
|
||||
@@ -3301,7 +3307,7 @@ static int mtk_probe(struct platform_dev
|
||||
}
|
||||
|
||||
if (eth->soc->offload_version) {
|
||||
|
|
|
@ -24,7 +24,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
mediatek,hifsys = <&hifsys>;
|
||||
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
|
||||
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
|
||||
@@ -3187,7 +3187,7 @@ static int mtk_probe(struct platform_dev
|
||||
@@ -3188,7 +3188,7 @@ static int mtk_probe(struct platform_dev
|
||||
struct regmap *cci;
|
||||
|
||||
cci = syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
|
||||
|
|
|
@ -13,7 +13,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
|
||||
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
|
||||
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
|
||||
@@ -845,7 +845,7 @@ static int mtk_init_fq_dma(struct mtk_et
|
||||
@@ -846,7 +846,7 @@ static int mtk_init_fq_dma(struct mtk_et
|
||||
eth->scratch_ring = dma_alloc_coherent(eth->dma_dev,
|
||||
cnt * sizeof(struct mtk_tx_dma),
|
||||
ð->phy_scratch_ring,
|
||||
|
@ -22,7 +22,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
if (unlikely(!eth->scratch_ring))
|
||||
return -ENOMEM;
|
||||
|
||||
@@ -1623,7 +1623,7 @@ static int mtk_tx_alloc(struct mtk_eth *
|
||||
@@ -1624,7 +1624,7 @@ static int mtk_tx_alloc(struct mtk_eth *
|
||||
goto no_tx_mem;
|
||||
|
||||
ring->dma = dma_alloc_coherent(eth->dma_dev, MTK_DMA_SIZE * sz,
|
||||
|
@ -31,7 +31,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
if (!ring->dma)
|
||||
goto no_tx_mem;
|
||||
|
||||
@@ -1641,8 +1641,7 @@ static int mtk_tx_alloc(struct mtk_eth *
|
||||
@@ -1642,8 +1642,7 @@ static int mtk_tx_alloc(struct mtk_eth *
|
||||
*/
|
||||
if (!MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) {
|
||||
ring->dma_pdma = dma_alloc_coherent(eth->dma_dev, MTK_DMA_SIZE * sz,
|
||||
|
@ -41,7 +41,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
if (!ring->dma_pdma)
|
||||
goto no_tx_mem;
|
||||
|
||||
@@ -1757,7 +1756,7 @@ static int mtk_rx_alloc(struct mtk_eth *
|
||||
@@ -1758,7 +1757,7 @@ static int mtk_rx_alloc(struct mtk_eth *
|
||||
|
||||
ring->dma = dma_alloc_coherent(eth->dma_dev,
|
||||
rx_dma_size * sizeof(*ring->dma),
|
||||
|
|
|
@ -14,7 +14,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
|
||||
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
|
||||
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
|
||||
@@ -971,18 +971,51 @@ static void setup_tx_buf(struct mtk_eth
|
||||
@@ -972,18 +972,51 @@ static void setup_tx_buf(struct mtk_eth
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -69,7 +69,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
int k = 0;
|
||||
|
||||
itxd = ring->next_free;
|
||||
@@ -990,49 +1023,32 @@ static int mtk_tx_map(struct sk_buff *sk
|
||||
@@ -991,49 +1024,32 @@ static int mtk_tx_map(struct sk_buff *sk
|
||||
if (itxd == ring->last_free)
|
||||
return -ENOMEM;
|
||||
|
||||
|
@ -126,7 +126,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
bool new_desc = true;
|
||||
|
||||
if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA) ||
|
||||
@@ -1047,23 +1063,17 @@ static int mtk_tx_map(struct sk_buff *sk
|
||||
@@ -1048,23 +1064,17 @@ static int mtk_tx_map(struct sk_buff *sk
|
||||
new_desc = false;
|
||||
}
|
||||
|
||||
|
@ -159,7 +159,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
|
||||
tx_buf = mtk_desc_to_tx_buf(ring, txd);
|
||||
if (new_desc)
|
||||
@@ -1073,20 +1083,17 @@ static int mtk_tx_map(struct sk_buff *sk
|
||||
@@ -1074,20 +1084,17 @@ static int mtk_tx_map(struct sk_buff *sk
|
||||
tx_buf->flags |= (!mac->id) ? MTK_TX_FLAGS_FPORT0 :
|
||||
MTK_TX_FLAGS_FPORT1;
|
||||
|
||||
|
@ -186,7 +186,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
txd_pdma->txd2 |= TX_DMA_LS0;
|
||||
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
|
||||
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
|
||||
@@ -842,6 +842,17 @@ enum mkt_eth_capabilities {
|
||||
@@ -843,6 +843,17 @@ enum mkt_eth_capabilities {
|
||||
MTK_MUX_U3_GMAC2_TO_QPHY | \
|
||||
MTK_MUX_GMAC12_TO_GEPHY_SGMII | MTK_QDMA)
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
|
||||
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
|
||||
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
|
||||
@@ -837,20 +837,20 @@ static void *mtk_max_lro_buf_alloc(gfp_t
|
||||
@@ -838,20 +838,20 @@ static void *mtk_max_lro_buf_alloc(gfp_t
|
||||
/* the qdma core needs scratch memory to be setup */
|
||||
static int mtk_init_fq_dma(struct mtk_eth *eth)
|
||||
{
|
||||
|
@ -38,7 +38,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
if (unlikely(!eth->scratch_head))
|
||||
return -ENOMEM;
|
||||
|
||||
@@ -860,16 +860,19 @@ static int mtk_init_fq_dma(struct mtk_et
|
||||
@@ -861,16 +861,19 @@ static int mtk_init_fq_dma(struct mtk_et
|
||||
if (unlikely(dma_mapping_error(eth->dma_dev, dma_addr)))
|
||||
return -ENOMEM;
|
||||
|
||||
|
@ -65,7 +65,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
}
|
||||
|
||||
mtk_w32(eth, eth->phy_scratch_ring, MTK_QDMA_FQ_HEAD);
|
||||
@@ -2169,6 +2172,7 @@ static int mtk_dma_init(struct mtk_eth *
|
||||
@@ -2170,6 +2173,7 @@ static int mtk_dma_init(struct mtk_eth *
|
||||
|
||||
static void mtk_dma_free(struct mtk_eth *eth)
|
||||
{
|
||||
|
@ -73,7 +73,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
int i;
|
||||
|
||||
for (i = 0; i < MTK_MAC_COUNT; i++)
|
||||
@@ -2176,9 +2180,8 @@ static void mtk_dma_free(struct mtk_eth
|
||||
@@ -2177,9 +2181,8 @@ static void mtk_dma_free(struct mtk_eth
|
||||
netdev_reset_queue(eth->netdev[i]);
|
||||
if (eth->scratch_ring) {
|
||||
dma_free_coherent(eth->dma_dev,
|
||||
|
@ -85,7 +85,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
eth->scratch_ring = NULL;
|
||||
eth->phy_scratch_ring = 0;
|
||||
}
|
||||
@@ -3390,6 +3393,9 @@ static const struct mtk_soc_data mt2701_
|
||||
@@ -3391,6 +3394,9 @@ static const struct mtk_soc_data mt2701_
|
||||
.hw_features = MTK_HW_FEATURES,
|
||||
.required_clks = MT7623_CLKS_BITMAP,
|
||||
.required_pctl = true,
|
||||
|
@ -95,7 +95,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
};
|
||||
|
||||
static const struct mtk_soc_data mt7621_data = {
|
||||
@@ -3398,6 +3404,9 @@ static const struct mtk_soc_data mt7621_
|
||||
@@ -3399,6 +3405,9 @@ static const struct mtk_soc_data mt7621_
|
||||
.required_clks = MT7621_CLKS_BITMAP,
|
||||
.required_pctl = false,
|
||||
.offload_version = 2,
|
||||
|
@ -105,7 +105,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
};
|
||||
|
||||
static const struct mtk_soc_data mt7622_data = {
|
||||
@@ -3407,6 +3416,9 @@ static const struct mtk_soc_data mt7622_
|
||||
@@ -3408,6 +3417,9 @@ static const struct mtk_soc_data mt7622_
|
||||
.required_clks = MT7622_CLKS_BITMAP,
|
||||
.required_pctl = false,
|
||||
.offload_version = 2,
|
||||
|
@ -115,7 +115,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
};
|
||||
|
||||
static const struct mtk_soc_data mt7623_data = {
|
||||
@@ -3415,6 +3427,9 @@ static const struct mtk_soc_data mt7623_
|
||||
@@ -3416,6 +3428,9 @@ static const struct mtk_soc_data mt7623_
|
||||
.required_clks = MT7623_CLKS_BITMAP,
|
||||
.required_pctl = true,
|
||||
.offload_version = 2,
|
||||
|
@ -125,7 +125,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
};
|
||||
|
||||
static const struct mtk_soc_data mt7629_data = {
|
||||
@@ -3423,6 +3438,9 @@ static const struct mtk_soc_data mt7629_
|
||||
@@ -3424,6 +3439,9 @@ static const struct mtk_soc_data mt7629_
|
||||
.hw_features = MTK_HW_FEATURES,
|
||||
.required_clks = MT7629_CLKS_BITMAP,
|
||||
.required_pctl = false,
|
||||
|
@ -135,7 +135,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
};
|
||||
|
||||
static const struct mtk_soc_data rt5350_data = {
|
||||
@@ -3430,6 +3448,9 @@ static const struct mtk_soc_data rt5350_
|
||||
@@ -3431,6 +3449,9 @@ static const struct mtk_soc_data rt5350_
|
||||
.hw_features = MTK_HW_FEATURES_MT7628,
|
||||
.required_clks = MT7628_CLKS_BITMAP,
|
||||
.required_pctl = false,
|
||||
|
@ -147,7 +147,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
const struct of_device_id of_mtk_match[] = {
|
||||
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
|
||||
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
|
||||
@@ -863,6 +863,7 @@ struct mtk_tx_dma_desc_info {
|
||||
@@ -864,6 +864,7 @@ struct mtk_tx_dma_desc_info {
|
||||
* the target SoC
|
||||
* @required_pctl A bool value to show whether the SoC requires
|
||||
* the extra setup for those pins used by GMAC.
|
||||
|
@ -155,7 +155,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net>
|
|||
*/
|
||||
struct mtk_soc_data {
|
||||
u32 ana_rgc3;
|
||||
@@ -871,6 +872,9 @@ struct mtk_soc_data {
|
||||
@@ -872,6 +873,9 @@ struct mtk_soc_data {
|
||||
bool required_pctl;
|
||||
u8 offload_version;
|
||||
netdev_features_t hw_features;
|
||||
|
|