Merge "Add kubernetes 1.30.6 patches"

This commit is contained in:
Zuul 2025-02-25 17:35:48 +00:00 committed by Gerrit Code Review
commit 8944bca456
11 changed files with 2528 additions and 0 deletions

@@ -0,0 +1,200 @@
From 63e8d996af35f797806db199728554ff13bc3fd9 Mon Sep 17 00:00:00 2001
From: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
Date: Wed, 8 Jan 2025 07:28:12 -0500
Subject: [PATCH] kubeadm: create platform pods with zero CPU resources

This specifies zero CPU resources when creating the manifests
for the static platform pods, as a workaround for the lack of
separate resource tracking for platform resources.

This specifies zero CPU and memory resources for the coredns
deployment. manifests.go is the main source file for this; it is
unclear whether the coredns.yaml files are used, but they are updated
to be consistent.

This specifies a CPU limit of 1 for the kube-apiserver pod so that it
is treated as burstable QoS. This gives a boost of cgroup CPUShares,
since the burstable cgroup parent has significantly more CPUShares
than best-effort on typical systems. This improves kube-apiserver
API responsiveness.

This increases the kube-apiserver readiness probe periodSeconds to 10
based on the WRS/SS joint recommendation for minimum probe settings.
This reduces the likelihood of kube-apiserver probe failures and
subsequent pod restarts under severe load. It also reduces CPU demand.
Signed-off-by: Daniel Safta <daniel.safta@windriver.com>
Signed-off-by: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
Signed-off-by: Boovan Rajendran <boovan.rajendran@windriver.com>
Signed-off-by: Jim Gauld <James.Gauld@windriver.com>
Signed-off-by: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
---
cluster/addons/dns/coredns/coredns.yaml.base | 4 ++--
cluster/addons/dns/coredns/coredns.yaml.in | 4 ++--
cluster/addons/dns/coredns/coredns.yaml.sed | 4 ++--
cmd/kubeadm/app/phases/addons/dns/dns_test.go | 8 ++++----
cmd/kubeadm/app/phases/addons/dns/manifests.go | 4 ++--
.../app/phases/controlplane/manifests.go | 10 ++++++----
cmd/kubeadm/app/util/staticpod/utils.go | 17 ++++++++++++++++-
7 files changed, 34 insertions(+), 17 deletions(-)
diff --git a/cluster/addons/dns/coredns/coredns.yaml.base b/cluster/addons/dns/coredns/coredns.yaml.base
index 3d438dce445..41088f063eb 100644
--- a/cluster/addons/dns/coredns/coredns.yaml.base
+++ b/cluster/addons/dns/coredns/coredns.yaml.base
@@ -139,8 +139,8 @@ spec:
limits:
memory: __DNS__MEMORY__LIMIT__
requests:
- cpu: 100m
- memory: 70Mi
+ cpu: 0
+ memory: 0
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
- name: config-volume
diff --git a/cluster/addons/dns/coredns/coredns.yaml.in b/cluster/addons/dns/coredns/coredns.yaml.in
index 419acb0e966..906d6d28890 100644
--- a/cluster/addons/dns/coredns/coredns.yaml.in
+++ b/cluster/addons/dns/coredns/coredns.yaml.in
@@ -139,8 +139,8 @@ spec:
limits:
memory: 'dns_memory_limit'
requests:
- cpu: 100m
- memory: 70Mi
+ cpu: 0
+ memory: 0
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
- name: config-volume
diff --git a/cluster/addons/dns/coredns/coredns.yaml.sed b/cluster/addons/dns/coredns/coredns.yaml.sed
index a35df71454f..af0fae57dbd 100644
--- a/cluster/addons/dns/coredns/coredns.yaml.sed
+++ b/cluster/addons/dns/coredns/coredns.yaml.sed
@@ -139,8 +139,8 @@ spec:
limits:
memory: $DNS_MEMORY_LIMIT
requests:
- cpu: 100m
- memory: 70Mi
+ cpu: 0
+ memory: 0
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
- name: config-volume
diff --git a/cmd/kubeadm/app/phases/addons/dns/dns_test.go b/cmd/kubeadm/app/phases/addons/dns/dns_test.go
index 0b6c12eb41b..4f2022d7105 100644
--- a/cmd/kubeadm/app/phases/addons/dns/dns_test.go
+++ b/cmd/kubeadm/app/phases/addons/dns/dns_test.go
@@ -705,8 +705,8 @@ spec:
limits:
memory: 170Mi
requests:
- cpu: 100m
- memory: 70Mi
+ cpu: 0
+ memory: 0
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
- name: config-volume
@@ -970,8 +970,8 @@ spec:
limits:
memory: 170Mi
requests:
- cpu: 100m
- memory: 70Mi
+ cpu: 0
+ memory: 0
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
- name: config-volume
diff --git a/cmd/kubeadm/app/phases/addons/dns/manifests.go b/cmd/kubeadm/app/phases/addons/dns/manifests.go
index 905a2e050e6..2a2212d5d37 100644
--- a/cmd/kubeadm/app/phases/addons/dns/manifests.go
+++ b/cmd/kubeadm/app/phases/addons/dns/manifests.go
@@ -104,8 +104,8 @@ spec:
limits:
memory: 170Mi
requests:
- cpu: 100m
- memory: 70Mi
+ cpu: 0
+ memory: 0
args: [ "-conf", "/etc/coredns/Corefile" ]
volumeMounts:
- name: config-volume
diff --git a/cmd/kubeadm/app/phases/controlplane/manifests.go b/cmd/kubeadm/app/phases/controlplane/manifests.go
index 11b93e083db..3458efe3a03 100644
--- a/cmd/kubeadm/app/phases/controlplane/manifests.go
+++ b/cmd/kubeadm/app/phases/controlplane/manifests.go
@@ -67,8 +67,10 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap
LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/livez", endpoint.BindPort, v1.URISchemeHTTPS),
ReadinessProbe: staticpodutil.ReadinessProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/readyz", endpoint.BindPort, v1.URISchemeHTTPS),
StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/livez", endpoint.BindPort, v1.URISchemeHTTPS, componentHealthCheckTimeout),
- Resources: staticpodutil.ComponentResources("250m"),
- Env: kubeadmutil.MergeKubeadmEnvVars(proxyEnvs, cfg.APIServer.ExtraEnvs),
+ // WRS: Increase kube-apiserver cgroup CPUShares to improve API responsiveness;
+ // achieved by setting CPU Limits to make it burstable QoS.
+ Resources: staticpodutil.ComponentLimitResources("0", "1"),
+ Env: kubeadmutil.MergeKubeadmEnvVars(proxyEnvs, cfg.APIServer.ExtraEnvs),
}, mounts.GetVolumes(kubeadmconstants.KubeAPIServer),
map[string]string{kubeadmconstants.KubeAPIServerAdvertiseAddressEndpointAnnotationKey: endpoint.String()}),
kubeadmconstants.KubeControllerManager: staticpodutil.ComponentPod(v1.Container{
@@ -79,7 +81,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap
VolumeMounts: staticpodutil.VolumeMountMapToSlice(mounts.GetVolumeMounts(kubeadmconstants.KubeControllerManager)),
LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetControllerManagerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeControllerManagerPort, v1.URISchemeHTTPS),
StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetControllerManagerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeControllerManagerPort, v1.URISchemeHTTPS, componentHealthCheckTimeout),
- Resources: staticpodutil.ComponentResources("200m"),
+ Resources: staticpodutil.ComponentResources("0"),
Env: kubeadmutil.MergeKubeadmEnvVars(proxyEnvs, cfg.ControllerManager.ExtraEnvs),
}, mounts.GetVolumes(kubeadmconstants.KubeControllerManager), nil),
kubeadmconstants.KubeScheduler: staticpodutil.ComponentPod(v1.Container{
@@ -90,7 +92,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap
VolumeMounts: staticpodutil.VolumeMountMapToSlice(mounts.GetVolumeMounts(kubeadmconstants.KubeScheduler)),
LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetSchedulerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeSchedulerPort, v1.URISchemeHTTPS),
StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetSchedulerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeSchedulerPort, v1.URISchemeHTTPS, componentHealthCheckTimeout),
- Resources: staticpodutil.ComponentResources("100m"),
+ Resources: staticpodutil.ComponentResources("0"),
Env: kubeadmutil.MergeKubeadmEnvVars(proxyEnvs, cfg.Scheduler.ExtraEnvs),
}, mounts.GetVolumes(kubeadmconstants.KubeScheduler), nil),
}
diff --git a/cmd/kubeadm/app/util/staticpod/utils.go b/cmd/kubeadm/app/util/staticpod/utils.go
index fcd28eff4d0..c87d6c954eb 100644
--- a/cmd/kubeadm/app/util/staticpod/utils.go
+++ b/cmd/kubeadm/app/util/staticpod/utils.go
@@ -99,6 +99,18 @@ func ComponentResources(cpu string) v1.ResourceRequirements {
}
}
+// ComponentLimitResources returns the v1.ResourceRequirements object needed for allocating a specified amount of the CPU with Limits
+func ComponentLimitResources(cpu string, lcpu string) v1.ResourceRequirements {
+ return v1.ResourceRequirements{
+ Requests: v1.ResourceList{
+ v1.ResourceCPU: resource.MustParse(cpu),
+ },
+ Limits: v1.ResourceList{
+ v1.ResourceCPU: resource.MustParse(lcpu),
+ },
+ }
+}
+
// NewVolume creates a v1.Volume with a hostPath mount to the specified location
func NewVolume(name, path string, pathType *v1.HostPathType) v1.Volume {
return v1.Volume{
@@ -255,7 +267,10 @@ func LivenessProbe(host, path string, port int32, scheme v1.URIScheme) *v1.Probe
func ReadinessProbe(host, path string, port int32, scheme v1.URIScheme) *v1.Probe {
// sets initialDelaySeconds as '0' because we don't want to delay user infrastructure checks
// looking for "ready" status on kubeadm static Pods
- return createHTTPProbe(host, path, port, scheme, 0, 15, 3, 1)
+ // WRS/SS joint recommendation: All pods probes should have following minimum probe
+ // settings unless required by the service (initialDelaySecond 0, periodSeconds 10,
+ // timeoutSeconds 5, successThreshold 1, failureThreshold 3)
+ return createHTTPProbe(host, path, port, scheme, 0, 15, 3, 10)
}
// StartupProbe creates a Probe object with a HTTPGet handler
--
2.25.1
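For illustration (not part of the patch above), a minimal, self-contained Go
sketch that mirrors the new ComponentLimitResources helper and shows why the
("0", "1") arguments used for kube-apiserver yield Burstable QoS; the local
function name is an assumption of the sketch, the real helper lives in
cmd/kubeadm/app/util/staticpod/utils.go as shown in the diff.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// componentLimitResources mirrors staticpodutil.ComponentLimitResources from
// the patch above: a CPU request plus a CPU limit, with no memory values.
func componentLimitResources(cpu, lcpu string) v1.ResourceRequirements {
	return v1.ResourceRequirements{
		Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse(cpu)},
		Limits:   v1.ResourceList{v1.ResourceCPU: resource.MustParse(lcpu)},
	}
}

func main() {
	// kube-apiserver now uses ("0", "1"): request 0 CPU, limit 1 CPU.
	// Because requests != limits (and no memory is set), the pod is classified
	// as Burstable QoS, which places it under the burstable cgroup parent with
	// higher CPUShares than best-effort.
	res := componentLimitResources("0", "1")
	req := res.Requests[v1.ResourceCPU]
	lim := res.Limits[v1.ResourceCPU]
	fmt.Printf("cpu request=%s, cpu limit=%s\n", req.String(), lim.String())
}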

@@ -0,0 +1,88 @@
From 6725ab07375fc9fc8035b8919b6f2b7f601168c5 Mon Sep 17 00:00:00 2001
From: Ferdinando Terada <Ferdinando.GodoyTerada@windriver.com>
Date: Mon, 23 Dec 2024 17:53:09 -0300
Subject: [PATCH] Adjust timeout for coredns readinessProbe

The timeout value for the CoreDNS readinessProbe was increased. This
adjustment was necessary to avoid issues during stress testing,
ensuring that the component can handle high-load situations and does
not fail its readiness checks prematurely.
---
cluster/addons/dns/coredns/coredns.yaml.base | 1 +
cluster/addons/dns/coredns/coredns.yaml.in | 1 +
cluster/addons/dns/coredns/coredns.yaml.sed | 1 +
cmd/kubeadm/app/phases/addons/dns/dns_test.go | 2 ++
cmd/kubeadm/app/phases/addons/dns/manifests.go | 1 +
5 files changed, 6 insertions(+)
diff --git a/cluster/addons/dns/coredns/coredns.yaml.base b/cluster/addons/dns/coredns/coredns.yaml.base
index 41088f063eb..4bcb2b2e4fe 100644
--- a/cluster/addons/dns/coredns/coredns.yaml.base
+++ b/cluster/addons/dns/coredns/coredns.yaml.base
@@ -170,6 +170,7 @@ spec:
path: /ready
port: 8181
scheme: HTTP
+ timeoutSeconds: 5
securityContext:
allowPrivilegeEscalation: false
capabilities:
diff --git a/cluster/addons/dns/coredns/coredns.yaml.in b/cluster/addons/dns/coredns/coredns.yaml.in
index 906d6d28890..744f8cb730a 100644
--- a/cluster/addons/dns/coredns/coredns.yaml.in
+++ b/cluster/addons/dns/coredns/coredns.yaml.in
@@ -170,6 +170,7 @@ spec:
path: /ready
port: 8181
scheme: HTTP
+ timeoutSeconds: 5
securityContext:
allowPrivilegeEscalation: false
capabilities:
diff --git a/cluster/addons/dns/coredns/coredns.yaml.sed b/cluster/addons/dns/coredns/coredns.yaml.sed
index af0fae57dbd..844f21f2abf 100644
--- a/cluster/addons/dns/coredns/coredns.yaml.sed
+++ b/cluster/addons/dns/coredns/coredns.yaml.sed
@@ -170,6 +170,7 @@ spec:
path: /ready
port: 8181
scheme: HTTP
+ timeoutSeconds: 5
securityContext:
allowPrivilegeEscalation: false
capabilities:
diff --git a/cmd/kubeadm/app/phases/addons/dns/dns_test.go b/cmd/kubeadm/app/phases/addons/dns/dns_test.go
index 4f2022d7105..1f571f48bb8 100644
--- a/cmd/kubeadm/app/phases/addons/dns/dns_test.go
+++ b/cmd/kubeadm/app/phases/addons/dns/dns_test.go
@@ -736,6 +736,7 @@ spec:
path: /ready
port: 8181
scheme: HTTP
+ timeoutSeconds: 5
securityContext:
allowPrivilegeEscalation: false
capabilities:
@@ -1001,6 +1002,7 @@ spec:
path: /ready
port: 8181
scheme: HTTP
+ timeoutSeconds: 5
securityContext:
allowPrivilegeEscalation: false
capabilities:
diff --git a/cmd/kubeadm/app/phases/addons/dns/manifests.go b/cmd/kubeadm/app/phases/addons/dns/manifests.go
index 2a2212d5d37..c0be57357e4 100644
--- a/cmd/kubeadm/app/phases/addons/dns/manifests.go
+++ b/cmd/kubeadm/app/phases/addons/dns/manifests.go
@@ -135,6 +135,7 @@ spec:
path: /ready
port: 8181
scheme: HTTP
+ timeoutSeconds: 5
securityContext:
allowPrivilegeEscalation: false
capabilities:
--
2.25.1
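For illustration only, the effective CoreDNS readiness probe after this
change, expressed as a small Go sketch with client-go types (not code from
the patch); before the change the Kubernetes default timeoutSeconds of 1
applied.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

func main() {
	// CoreDNS readiness probe as rendered by the patched manifests.
	probe := v1.Probe{
		ProbeHandler: v1.ProbeHandler{
			HTTPGet: &v1.HTTPGetAction{
				Path:   "/ready",
				Port:   intstr.FromInt32(8181),
				Scheme: v1.URISchemeHTTP,
			},
		},
		// Added by this patch; previously the default of 1 second applied.
		TimeoutSeconds: 5,
	}
	fmt.Printf("readinessProbe timeout: %ds\n", probe.TimeoutSeconds)
}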

@@ -0,0 +1,33 @@
From 08425e565a3dff5d8b5b7cb2e9991280398cf21f Mon Sep 17 00:00:00 2001
From: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
Date: Fri, 17 Jan 2025 05:09:55 -0500
Subject: [PATCH] kubeadm: reduce UpgradeManifestsTimeout

This modifies the kubeadm UpgradeManifestsTimeout from its default of
5 minutes to 3 minutes, to reduce unnecessary delay in retries during
kubeadm-upgrade-apply failures.

The typical control-plane upgrade of static pods takes 75 to 85
seconds, so 3 minutes gives an adequate buffer to complete the
operation.
Signed-off-by: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
---
cmd/kubeadm/app/constants/constants.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cmd/kubeadm/app/constants/constants.go b/cmd/kubeadm/app/constants/constants.go
index 0a3e0a15fce..02357d557da 100644
--- a/cmd/kubeadm/app/constants/constants.go
+++ b/cmd/kubeadm/app/constants/constants.go
@@ -235,7 +235,7 @@ const (
KubeletHealthCheckTimeout = 4 * time.Minute
// UpgradeManifestsTimeout specifies the default timeout for upgradring static Pod manifests
- UpgradeManifestsTimeout = 5 * time.Minute
+ UpgradeManifestsTimeout = 3 * time.Minute
// PullImageRetry specifies how many times ContainerRuntime retries when pulling image failed
PullImageRetry = 5
--
2.25.1

@@ -0,0 +1,30 @@
From 45806d8b57d2bfdc24165264ea2f6aabadcd2e1a Mon Sep 17 00:00:00 2001
From: Boovan Rajendran <boovan.rajendran@windriver.com>
Date: Mon, 4 Sep 2023 08:05:29 -0400
Subject: [PATCH] kubelet CFS quota throttling for non-integer CPU limit
Signed-off-by: Boovan Rajendran <boovan.rajendran@windriver.com>
---
pkg/kubelet/cm/internal_container_lifecycle_linux.go | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/pkg/kubelet/cm/internal_container_lifecycle_linux.go b/pkg/kubelet/cm/internal_container_lifecycle_linux.go
index 376efa9e0ef..a7e56ac6911 100644
--- a/pkg/kubelet/cm/internal_container_lifecycle_linux.go
+++ b/pkg/kubelet/cm/internal_container_lifecycle_linux.go
@@ -40,7 +40,11 @@ func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, contain
// Disable cgroup CFS throttle at the container level.
// /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/<container>/cpu.cfs_quota_us
// /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/<container>/cpu.cfs_period_us
- if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed {
+ // We can only set CpuQuota to -1 if we're allocating the entire CPU.
+ // For fractional CPUs the CpuQuota is needed to enforce the limit.
+ cpuQuantity := container.Resources.Requests[v1.ResourceCPU]
+ fractionalCpuQuantity := cpuQuantity.MilliValue()%1000
+ if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed && fractionalCpuQuantity == 0 {
containerConfig.Linux.Resources.CpuPeriod = int64(100000)
containerConfig.Linux.Resources.CpuQuota = int64(-1)
}
--
2.25.1
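A minimal runnable sketch of the fractional-CPU test used in the hunk above,
assuming only the resource.Quantity API from k8s.io/apimachinery: an integer
CPU request has a milli-value that is a multiple of 1000, and only in that
case is the CFS quota left disabled.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	for _, req := range []string{"2", "1500m", "500m"} {
		q := resource.MustParse(req)
		// Same test as the patch: a non-zero remainder means a fractional CPU,
		// so the CFS quota must be kept to enforce the limit; a zero remainder
		// (whole CPUs) allows CpuQuota to stay at -1 (no throttling).
		fractional := q.MilliValue()%1000 != 0
		fmt.Printf("request=%-6s milli=%-5d fractional=%v\n", req, q.MilliValue(), fractional)
	}
}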

@@ -0,0 +1,256 @@
From 521099d83512e74d11829b464d899c05b595d482 Mon Sep 17 00:00:00 2001
From: Boovan Rajendran <boovan.rajendran@windriver.com>
Date: Mon, 26 Feb 2024 06:27:48 -0500
Subject: [PATCH] kubelet cpumanager disable CFS quota throttling

This disables the CFS CPU quota to avoid performance degradation due
to the Linux kernel CFS quota implementation. Note that the 4.18
kernel attempts to solve the CFS throttling problem, but there are
reports that it is not completely effective.

This disables CFS quota throttling for Guaranteed pods for both the
parent and container cgroups by writing -1 to cgroup cpu.cfs_quota_us.
Disabling it yields a dramatic latency improvement for HTTP response
times.

This patch was refactored in 1.22.5 due to the new
internal_container_lifecycle framework. We leverage the same mechanism
used to set Linux resources, i.e. the cpu manager specifying the
container CPU set during container creation.
Co-authored-by: Jim Gauld <james.gauld@windriver.com>
Signed-off-by: Sachin Gopala Krishna <saching.krishna@windriver.com>
Signed-off-by: Boovan Rajendran <boovan.rajendran@windriver.com>
---
pkg/kubelet/cm/cpumanager/cpu_manager.go | 7 +++
pkg/kubelet/cm/cpumanager/fake_cpu_manager.go | 10 ++++-
pkg/kubelet/cm/helpers_linux.go | 10 +++++
pkg/kubelet/cm/helpers_linux_test.go | 43 ++++++++++---------
.../cm/internal_container_lifecycle_linux.go | 9 ++++
5 files changed, 57 insertions(+), 22 deletions(-)
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go
index 8b5049d7d74..2d1b6eefebe 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go
@@ -73,6 +73,9 @@ type Manager interface {
// State returns a read-only interface to the internal CPU manager state.
State() state.Reader
+ // GetCPUPolicy returns the assigned CPU manager policy
+ GetCPUPolicy() string
+
// GetTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
@@ -243,6 +246,10 @@ func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesRe
return nil
}
+func (m *manager) GetCPUPolicy() string {
+ return m.policy.Name()
+}
+
func (m *manager) Allocate(p *v1.Pod, c *v1.Container) error {
// The pod is during the admission phase. We need to save the pod to avoid it
// being cleaned before the admission ended
diff --git a/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go b/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go
index 4a03f3dd23f..b36fb0da3b4 100644
--- a/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/fake_cpu_manager.go
@@ -28,7 +28,8 @@ import (
)
type fakeManager struct {
- state state.State
+ policy Policy
+ state state.State
}
func (m *fakeManager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error {
@@ -70,6 +71,10 @@ func (m *fakeManager) State() state.Reader {
return m.state
}
+func (m *fakeManager) GetCPUPolicy() string {
+ return m.policy.Name()
+}
+
func (m *fakeManager) GetExclusiveCPUs(podUID, containerName string) cpuset.CPUSet {
klog.InfoS("GetExclusiveCPUs", "podUID", podUID, "containerName", containerName)
return cpuset.CPUSet{}
@@ -88,6 +93,7 @@ func (m *fakeManager) GetCPUAffinity(podUID, containerName string) cpuset.CPUSet
// NewFakeManager creates empty/fake cpu manager
func NewFakeManager() Manager {
return &fakeManager{
- state: state.NewMemoryState(),
+ policy: &nonePolicy{},
+ state: state.NewMemoryState(),
}
}
diff --git a/pkg/kubelet/cm/helpers_linux.go b/pkg/kubelet/cm/helpers_linux.go
index 8a144e7a73c..008b955ee98 100644
--- a/pkg/kubelet/cm/helpers_linux.go
+++ b/pkg/kubelet/cm/helpers_linux.go
@@ -170,6 +170,16 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64,
// build the result
result := &ResourceConfig{}
if qosClass == v1.PodQOSGuaranteed {
+ // Disable CFS CPU quota to avoid performance degradation due to
+ // Linux kernel CFS throttle implementation.
+ // NOTE: 4.18 kernel attempts to solve CFS throttling problem,
+ // but there are reports that it is not completely effective.
+ // This will configure cgroup CFS parameters at pod level:
+ // /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/cpu.cfs_quota_us
+ // /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/cpu.cfs_period_us
+ cpuQuota = int64(-1)
+ cpuPeriod = uint64(100000)
+
result.CPUShares = &cpuShares
result.CPUQuota = &cpuQuota
result.CPUPeriod = &cpuPeriod
diff --git a/pkg/kubelet/cm/helpers_linux_test.go b/pkg/kubelet/cm/helpers_linux_test.go
index fba41fd49be..60609394659 100644
--- a/pkg/kubelet/cm/helpers_linux_test.go
+++ b/pkg/kubelet/cm/helpers_linux_test.go
@@ -64,8 +64,9 @@ func TestResourceConfigForPod(t *testing.T) {
burstablePartialShares := MilliCPUToShares(200)
burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod))
guaranteedShares := MilliCPUToShares(100)
- guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod))
- guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod))
+ guaranteedQuotaPeriod := uint64(100000)
+ guaranteedQuota := int64(-1)
+ guaranteedTunedQuota := int64(-1)
memoryQuantity = resource.MustParse("100Mi")
cpuNoLimit := int64(-1)
guaranteedMemory := memoryQuantity.Value()
@@ -204,8 +205,8 @@ func TestResourceConfigForPod(t *testing.T) {
},
},
enforceCPULimits: true,
- quotaPeriod: defaultQuotaPeriod,
- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedQuota, CPUPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedQuota, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-no-cpu-enforcement": {
pod: &v1.Pod{
@@ -218,8 +219,8 @@ func TestResourceConfigForPod(t *testing.T) {
},
},
enforceCPULimits: false,
- quotaPeriod: defaultQuotaPeriod,
- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-with-tuned-quota": {
pod: &v1.Pod{
@@ -232,8 +233,8 @@ func TestResourceConfigForPod(t *testing.T) {
},
},
enforceCPULimits: true,
- quotaPeriod: tunedQuotaPeriod,
- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedTunedQuota, CPUPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedTunedQuota, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-no-cpu-enforcement-with-tuned-quota": {
pod: &v1.Pod{
@@ -246,8 +247,8 @@ func TestResourceConfigForPod(t *testing.T) {
},
},
enforceCPULimits: false,
- quotaPeriod: tunedQuotaPeriod,
- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"burstable-partial-limits-with-init-containers": {
pod: &v1.Pod{
@@ -309,8 +310,10 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
burstablePartialShares := MilliCPUToShares(200)
burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod))
guaranteedShares := MilliCPUToShares(100)
- guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod))
- guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod))
+ guaranteedQuotaPeriod := uint64(100000)
+ guaranteedQuota := int64(-1)
+ guaranteedTunedQuota := int64(-1)
+
memoryQuantity = resource.MustParse("100Mi")
cpuNoLimit := int64(-1)
guaranteedMemory := memoryQuantity.Value()
@@ -449,8 +452,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
},
},
enforceCPULimits: true,
- quotaPeriod: defaultQuotaPeriod,
- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedQuota, CPUPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedQuota, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-no-cpu-enforcement": {
pod: &v1.Pod{
@@ -463,8 +466,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
},
},
enforceCPULimits: false,
- quotaPeriod: defaultQuotaPeriod,
- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-with-tuned-quota": {
pod: &v1.Pod{
@@ -477,8 +480,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
},
},
enforceCPULimits: true,
- quotaPeriod: tunedQuotaPeriod,
- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedTunedQuota, CPUPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &guaranteedTunedQuota, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
"guaranteed-no-cpu-enforcement-with-tuned-quota": {
pod: &v1.Pod{
@@ -491,8 +494,8 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
},
},
enforceCPULimits: false,
- quotaPeriod: tunedQuotaPeriod,
- expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory},
+ quotaPeriod: guaranteedQuotaPeriod,
+ expected: &ResourceConfig{CPUShares: &guaranteedShares, CPUQuota: &cpuNoLimit, CPUPeriod: &guaranteedQuotaPeriod, Memory: &guaranteedMemory},
},
}
diff --git a/pkg/kubelet/cm/internal_container_lifecycle_linux.go b/pkg/kubelet/cm/internal_container_lifecycle_linux.go
index 0c3bb2e4999..376efa9e0ef 100644
--- a/pkg/kubelet/cm/internal_container_lifecycle_linux.go
+++ b/pkg/kubelet/cm/internal_container_lifecycle_linux.go
@@ -26,6 +26,7 @@ import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
+ v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
)
func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, container *v1.Container, containerConfig *runtimeapi.ContainerConfig) error {
@@ -36,6 +37,14 @@ func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, contain
}
}
+ // Disable cgroup CFS throttle at the container level.
+ // /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/<container>/cpu.cfs_quota_us
+ // /sys/fs/cgroup/cpu/k8s-infra/kubepods/<pod>/<container>/cpu.cfs_period_us
+ if i.cpuManager.GetCPUPolicy() == "static" && v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed {
+ containerConfig.Linux.Resources.CpuPeriod = int64(100000)
+ containerConfig.Linux.Resources.CpuQuota = int64(-1)
+ }
+
if i.memoryManager != nil {
numaNodes := i.memoryManager.GetMemoryNUMANodes(pod, container)
if numaNodes.Len() > 0 {
--
2.25.1
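To make the effect concrete, a standalone sketch (not patch code) of the
pod-level values the modified ResourceConfigForPod now produces for a
Guaranteed pod; the local struct merely mirrors the relevant kubelet
ResourceConfig fields, and the shares value assumes a 2-CPU request.

package main

import "fmt"

// resourceConfig mirrors the fields of kubelet's cm.ResourceConfig that the
// patch above touches; declared locally so the sketch is self-contained.
type resourceConfig struct {
	CPUShares *uint64
	CPUQuota  *int64
	CPUPeriod *uint64
}

func main() {
	// For a Guaranteed pod requesting 2 CPUs, CPUShares is still derived from
	// the request (2000m * 1024 / 1000 = 2048), but the quota is forced to -1
	// and the period to 100ms, so the pod-level cgroup is never CFS-throttled:
	//   .../kubepods/<pod>/cpu.cfs_quota_us  = -1
	//   .../kubepods/<pod>/cpu.cfs_period_us = 100000
	shares := uint64(2048)
	quota := int64(-1)
	period := uint64(100000)
	cfg := resourceConfig{CPUShares: &shares, CPUQuota: &quota, CPUPeriod: &period}
	fmt.Printf("shares=%d quota=%d period=%d\n", *cfg.CPUShares, *cfg.CPUQuota, *cfg.CPUPeriod)
}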

@@ -0,0 +1,789 @@
From af61145f1ccd068da8937e6b20107f25023cdcb0 Mon Sep 17 00:00:00 2001
From: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
Date: Tue, 11 Feb 2025 05:27:58 -0500
Subject: [PATCH] kubelet cpumanager introduce concept of isolated CPUs

This introduces the concept of "isolated CPUs", which are CPUs that
have been isolated at the kernel level via the "isolcpus" kernel boot
parameter.

When starting the kubelet process, two separate sets of reserved CPUs
may be specified. With this change CPUs reserved via
'--system-reserved=cpu' will be used for infrastructure pods, while the
isolated CPUs should be reserved via '--kube-reserved=cpu' to cause
kubelet to skip over them for "normal" CPU resource tracking. The
kubelet code will double-check that the specified isolated CPUs match
what the kernel exposes in "/sys/devices/system/cpu/isolated".

A plugin (outside the scope of this commit) will expose the isolated
CPUs to kubelet via the device plugin API.

If a pod specifies some number of "isolcpus" resources, the device
manager will allocate them. In this code we check whether such
resources have been allocated, and if so we set the container cpuset to
the isolated CPUs. This does mean that it really only makes sense to
specify "isolcpus" resources for best-effort or burstable pods, not for
guaranteed ones, since that would throw off the accounting code. To
ensure the accounting still works as designed, if "isolcpus" resources
are specified for guaranteed pods, the affinity will be set to the
non-isolated CPUs.

This patch was refactored in 1.21.3 due to the upstream API change
"node: podresources: make GetDevices() consistent"
(commit ad68f9588c72d6477b5a290c548a9031063ac659).

The routine podIsolCPUs() was refactored in 1.21.3 since the API
p.deviceManager.GetDevices() returns multiple devices, with one device
per cpu. The resultant cpuset needs to be the aggregate.

The routine NewStaticPolicy was refactored in 1.22.5, adding a new
argument to its signature: cpuPolicyOptions map[string]string. This
change implies shifting the new arguments (deviceManager,
excludeReserved) one position to the right.
Co-authored-by: Jim Gauld <james.gauld@windriver.com>
Co-authored-by: Chris Friesen <chris.friesen@windriver.com>
Signed-off-by: Gleb Aronsky <gleb.aronsky@windriver.com>
Signed-off-by: Ramesh Kumar Sivanandam <rameshkumar.sivanandam@windriver.com>
Signed-off-by: Sachin Gopala Krishna <saching.krishna@windriver.com>
Signed-off-by: Boovan Rajendran <boovan.rajendran@windriver.com>
Signed-off-by: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
Signed-off-by: Boovan Rajendran <boovan.rajendran@windriver.com>
Signed-off-by: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
---
pkg/kubelet/cm/container_manager_linux.go | 1 +
pkg/kubelet/cm/cpumanager/cpu_manager.go | 35 +++++-
pkg/kubelet/cm/cpumanager/cpu_manager_test.go | 31 +++--
pkg/kubelet/cm/cpumanager/policy_static.go | 90 +++++++++++++-
.../cm/cpumanager/policy_static_test.go | 57 +++++++--
pkg/kubelet/cm/devicemanager/manager_stub.go | 110 ++++++++++++++++++
6 files changed, 298 insertions(+), 26 deletions(-)
create mode 100644 pkg/kubelet/cm/devicemanager/manager_stub.go
diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go
index 4ed1aa7b591..7eb76b4c20f 100644
--- a/pkg/kubelet/cm/container_manager_linux.go
+++ b/pkg/kubelet/cm/container_manager_linux.go
@@ -324,6 +324,7 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
cm.GetNodeAllocatableReservation(),
nodeConfig.KubeletRootDir,
cm.topologyManager,
+ cm.deviceManager,
)
if err != nil {
klog.ErrorS(err, "Failed to initialize cpu manager")
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go
index e0a359932b7..acd51fdef8d 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go
@@ -20,6 +20,8 @@ import (
"context"
"fmt"
"math"
+ "os"
+ "strings"
"sync"
"time"
@@ -32,6 +34,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
+ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
@@ -51,6 +54,25 @@ type policyName string
// cpuManagerStateFileName is the file name where cpu manager stores its state
const cpuManagerStateFileName = "cpu_manager_state"
+// get the system-level isolated CPUs
+func getIsolcpus() cpuset.CPUSet {
+ dat, err := os.ReadFile("/sys/devices/system/cpu/isolated")
+ if err != nil {
+ klog.Errorf("[cpumanager] unable to read sysfs isolcpus subdir")
+ return cpuset.New()
+ }
+
+ // The isolated cpus string ends in a newline
+ cpustring := strings.TrimSuffix(string(dat), "\n")
+ cset, err := cpuset.Parse(cpustring)
+ if err != nil {
+ klog.Errorf("[cpumanager] unable to parse sysfs isolcpus string to cpuset")
+ return cpuset.New()
+ }
+
+ return cset
+}
+
// Manager interface provides methods for Kubelet to manage pod cpus.
type Manager interface {
// Start is called during Kubelet initialization.
@@ -154,7 +176,8 @@ func (s *sourcesReadyStub) AddSource(source string) {}
func (s *sourcesReadyStub) AllReady() bool { return true }
// NewManager creates new cpu manager based on provided policy
-func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconcilePeriod time.Duration, machineInfo *cadvisorapi.MachineInfo, specificCPUs cpuset.CPUSet, nodeAllocatableReservation v1.ResourceList, stateFileDirectory string, affinity topologymanager.Store) (Manager, error) {
+func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconcilePeriod time.Duration, machineInfo *cadvisorapi.MachineInfo, specificCPUs cpuset.CPUSet, nodeAllocatableReservation v1.ResourceList, stateFileDirectory string, affinity topologymanager.Store, deviceManager devicemanager.Manager) (Manager, error) {
+
var topo *topology.CPUTopology
var policy Policy
var err error
@@ -195,7 +218,15 @@ func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconc
// NOTE: Set excludeReserved unconditionally to exclude reserved CPUs from default cpuset.
// This variable is primarily to make testing easier.
excludeReserved := true
- policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, cpuPolicyOptions, excludeReserved)
+
+ // isolCPUs is the set of kernel-isolated CPUs. They should be a subset of specificCPUs or
+ // of the CPUs that NewStaticPolicy() will pick if numReservedCPUs is set. It's only in the
+ // argument list here for ease of testing, it's really internal to the policy.
+ isolCPUs := getIsolcpus()
+ policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, isolCPUs, affinity, cpuPolicyOptions, deviceManager, excludeReserved)
+ if err != nil {
+ return nil, fmt.Errorf("new static policy error: %v", err)
+ }
if err != nil {
return nil, fmt.Errorf("new static policy error: %w", err)
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
index e5f5d07a2ad..8abf173ed5c 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
@@ -36,6 +36,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
+ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/utils/cpuset"
)
@@ -263,6 +264,7 @@ func makeMultiContainerPodWithOptions(initCPUs, appCPUs []*containerOptions) *v1
}
func TestCPUManagerAdd(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
testExcl := false
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
@@ -278,9 +280,11 @@ func TestCPUManagerAdd(t *testing.T) {
},
0,
cpuset.New(),
+ cpuset.New(),
topologymanager.NewFakeManager(),
nil,
- testExcl)
+ testDM,
+ testExcl)
testCases := []struct {
description string
updateErr error
@@ -530,8 +534,9 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) {
}
testExcl := false
+ testDM, _ := devicemanager.NewManagerStub()
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testExcl)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, testExcl)
mockState := &mockState{
assignments: testCase.stAssignments,
@@ -686,7 +691,9 @@ func TestCPUManagerGenerate(t *testing.T) {
}
defer os.RemoveAll(sDir)
- mgr, err := NewManager(testCase.cpuPolicyName, nil, 5*time.Second, machineInfo, cpuset.New(), testCase.nodeAllocatableReservation, sDir, topologymanager.NewFakeManager())
+ testDM, err := devicemanager.NewManagerStub()
+ mgr, err := NewManager(testCase.cpuPolicyName, nil, 5*time.Second, machineInfo, cpuset.New(), testCase.nodeAllocatableReservation, sDir, topologymanager.NewFakeManager(), testDM)
+
if testCase.expectedError != nil {
if !strings.Contains(err.Error(), testCase.expectedError.Error()) {
t.Errorf("Unexpected error message. Have: %s wants %s", err.Error(), testCase.expectedError.Error())
@@ -757,6 +764,7 @@ func TestCPUManagerRemove(t *testing.T) {
func TestReconcileState(t *testing.T) {
testExcl := false
+ testDM, _ := devicemanager.NewManagerStub()
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
NumCPUs: 8,
@@ -775,9 +783,11 @@ func TestReconcileState(t *testing.T) {
},
0,
cpuset.New(),
+ cpuset.New(),
topologymanager.NewFakeManager(),
nil,
- testExcl)
+ testDM,
+ testExcl)
testCases := []struct {
description string
@@ -1282,6 +1292,7 @@ func TestReconcileState(t *testing.T) {
// the following tests are with --reserved-cpus configured
func TestCPUManagerAddWithResvList(t *testing.T) {
testExcl := false
+ testDM, _ := devicemanager.NewManagerStub()
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
NumCPUs: 4,
@@ -1296,9 +1307,11 @@ func TestCPUManagerAddWithResvList(t *testing.T) {
},
1,
cpuset.New(0),
+ cpuset.New(),
topologymanager.NewFakeManager(),
nil,
- testExcl)
+ testDM,
+ testExcl)
testCases := []struct {
description string
updateErr error
@@ -1410,7 +1423,8 @@ func TestCPUManagerHandlePolicyOptions(t *testing.T) {
}
defer os.RemoveAll(sDir)
- _, err = NewManager(testCase.cpuPolicyName, testCase.cpuPolicyOptions, 5*time.Second, machineInfo, cpuset.New(), nodeAllocatableReservation, sDir, topologymanager.NewFakeManager())
+ testDM, err := devicemanager.NewManagerStub()
+ _, err = NewManager(testCase.cpuPolicyName, testCase.cpuPolicyOptions, 5*time.Second, machineInfo, cpuset.New(), nodeAllocatableReservation, sDir, topologymanager.NewFakeManager(), testDM)
if err == nil {
t.Errorf("Expected error, but NewManager succeeded")
}
@@ -1424,6 +1438,7 @@ func TestCPUManagerHandlePolicyOptions(t *testing.T) {
func TestCPUManagerGetAllocatableCPUs(t *testing.T) {
testExcl := false
+ testDm, _ := devicemanager.NewManagerStub()
nonePolicy, _ := NewNonePolicy(nil)
staticPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
@@ -1439,9 +1454,11 @@ func TestCPUManagerGetAllocatableCPUs(t *testing.T) {
},
1,
cpuset.New(0),
+ cpuset.New(),
topologymanager.NewFakeManager(),
nil,
- testExcl)
+ testDm,
+ testExcl)
testCases := []struct {
description string
diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go
index fff571491f0..22f2962de83 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static.go
@@ -19,6 +19,7 @@ package cpumanager
import (
"context"
"fmt"
+ "strconv"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -33,6 +34,7 @@ import (
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
+ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
"k8s.io/kubernetes/pkg/kubelet/metrics"
@@ -128,6 +130,10 @@ type staticPolicy struct {
topology *topology.CPUTopology
// set of CPUs that is not available for exclusive assignment
reservedCPUs cpuset.CPUSet
+ // subset of reserved CPUs with isolcpus attribute
+ isolcpus cpuset.CPUSet
+ // parent containerManager, used to get device list
+ deviceManager devicemanager.Manager
// If true, default CPUSet should exclude reserved CPUs
excludeReserved bool
// Superset of reservedCPUs. It includes not just the reservedCPUs themselves,
@@ -150,7 +156,8 @@ var _ Policy = &staticPolicy{}
// NewStaticPolicy returns a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
-func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string, excludeReserved bool) (Policy, error) {
+func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, isolCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string, deviceManager devicemanager.Manager, excludeReserved bool) (Policy, error) {
+
opts, err := NewStaticPolicyOptions(cpuPolicyOptions)
if err != nil {
return nil, err
@@ -165,6 +172,8 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv
policy := &staticPolicy{
topology: topology,
affinity: affinity,
+ isolcpus: isolCPUs,
+ deviceManager: deviceManager,
excludeReserved: excludeReserved,
cpusToReuse: make(map[string]cpuset.CPUSet),
options: opts,
@@ -201,6 +210,12 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv
policy.reservedCPUs = reserved
policy.reservedPhysicalCPUs = reservedPhysicalCPUs
+ if !isolCPUs.IsSubsetOf(reserved) {
+ klog.Errorf("[cpumanager] isolCPUs %v is not a subset of reserved %v", isolCPUs, reserved)
+ reserved = reserved.Union(isolCPUs)
+ klog.Warningf("[cpumanager] mismatch isolCPUs %v, force reserved %v", isolCPUs, reserved)
+ }
+
return policy, nil
}
@@ -234,8 +249,9 @@ func (p *staticPolicy) validateState(s state.State) error {
} else {
s.SetDefaultCPUSet(allCPUs)
}
- klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, default:%v\n",
- allCPUs, p.reservedCPUs, s.GetDefaultCPUSet())
+ klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, isolcpus:%v, default:%v\n",
+ allCPUs, p.reservedCPUs, p.isolcpus, s.GetDefaultCPUSet())
+
return nil
}
@@ -340,16 +356,46 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
return nil
}
- cpuset := p.reservedCPUs
+ cpuset := p.reservedCPUs.Clone().Difference(p.isolcpus)
if cpuset.IsEmpty() {
// If this happens then someone messed up.
- return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reservedCPUs)
+ return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v, isolcpus:%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reservedCPUs, p.isolcpus)
+
}
s.SetCPUSet(string(pod.UID), container.Name, cpuset)
klog.Infof("[cpumanager] static policy: reserved: AddContainer (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset)
return nil
}
+ // This corrects kubelet cpumanager static cpuset tracking for isolcpus,
+ // since version 1.26.1 . This ensures that pods specified with
+ // isolcpus + guaranteed QoS + integer cpu requests, are affined to
+ // exclusive cpuset and tracked as non-isolated cpus.
+ cpuQuantity := container.Resources.Requests[v1.ResourceCPU]
+ fractionalCpuQuantity := cpuQuantity.MilliValue() % 1000
+ if isolcpus := p.podIsolCPUs(pod, container); isolcpus.Size() > 0 &&
+ v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed && fractionalCpuQuantity == 0 {
+ // container has requested isolated CPUs
+ if set, ok := s.GetCPUSet(string(pod.UID), container.Name); ok {
+ if set.Equals(isolcpus) {
+ klog.Infof("[cpumanager] isolcpus container already present in state, skipping (namespace: %s, pod UID: %s, pod: %s, container: %s)",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name)
+ return nil
+ } else {
+ klog.Infof("[cpumanager] isolcpus container state has cpus %v, should be %v (namespace: %s, pod UID: %s, pod: %s, container: %s)",
+ isolcpus, set, pod.Namespace, string(pod.UID), pod.Name, container.Name)
+ }
+ }
+ // Note that we do not do anything about init containers here.
+ // It looks like devices are allocated per-pod based on effective requests/limits
+ // and extra devices from initContainers are not freed up when the regular containers start.
+ // TODO: confirm this is still true for 1.20
+ s.SetCPUSet(string(pod.UID), container.Name, isolcpus)
+ klog.Infof("[cpumanager] isolcpus: AddContainer (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name, isolcpus)
+ return nil
+ }
+
numCPUs := p.guaranteedCPUs(pod, container)
if numCPUs == 0 {
// container belongs in the shared pool (nothing to do; use default cpuset)
@@ -415,7 +461,9 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
}
s.SetCPUSet(string(pod.UID), container.Name, cpuset)
p.updateCPUsToReuse(pod, container, cpuset)
-
+ klog.Infof("[cpumanager] guaranteed: AddContainer "+
+ "(namespace: %s, pod UID: %s, pod: %s, container: %s); numCPUS=%d, cpuset=%v",
+ pod.Namespace, string(pod.UID), pod.Name, container.Name, numCPUs, cpuset)
return nil
}
@@ -794,6 +842,36 @@ func isKubeInfra(pod *v1.Pod) bool {
}
+// get the isolated CPUs (if any) from the devices associated with a specific container
+func (p *staticPolicy) podIsolCPUs(pod *v1.Pod, container *v1.Container) cpuset.CPUSet {
+ // NOTE: This is required for TestStaticPolicyAdd() since makePod() does
+ // not create UID. We also need a way to properly stub devicemanager.
+ if len(string(pod.UID)) == 0 {
+ return cpuset.New()
+ }
+ resContDevices := p.deviceManager.GetDevices(string(pod.UID), container.Name)
+ cpuSet := cpuset.New()
+ for resourceName, resourceDevs := range resContDevices {
+ // this resource name needs to match the isolcpus device plugin
+ if resourceName == "windriver.com/isolcpus" {
+ for devID, _ := range resourceDevs {
+ cpuStrList := []string{devID}
+ if len(cpuStrList) > 0 {
+ // loop over the list of strings, convert each one to int, add to cpuset
+ for _, cpuStr := range cpuStrList {
+ cpu, err := strconv.Atoi(cpuStr)
+ if err != nil {
+ panic(err)
+ }
+ cpuSet = cpuSet.Union(cpuset.New(cpu))
+ }
+ }
+ }
+ }
+ }
+ return cpuSet
+}
+
// isHintSocketAligned function return true if numa nodes in hint are socket aligned.
func (p *staticPolicy) isHintSocketAligned(hint topologymanager.TopologyHint, minAffinitySize int) bool {
numaNodesBitMask := hint.NUMANodeAffinity.GetBits()
diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go
index 83614619550..ee70a833d33 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go
@@ -30,6 +30,7 @@ import (
pkgfeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
+ "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
"k8s.io/utils/cpuset"
@@ -73,8 +74,9 @@ func (spt staticPolicyTest) PseudoClone() staticPolicyTest {
}
func TestStaticPolicyName(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
testExcl := false
- policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.New(), topologymanager.NewFakeManager(), nil, testExcl)
+ policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, testExcl)
policyName := policy.Name()
if policyName != "static" {
@@ -84,6 +86,7 @@ func TestStaticPolicyName(t *testing.T) {
}
func TestStaticPolicyStart(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
testCases := []staticPolicyTest{
{
description: "non-corrupted state",
@@ -159,7 +162,7 @@ func TestStaticPolicyStart(t *testing.T) {
}
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
- p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved)
+ p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, testCase.excludeReserved)
policy := p.(*staticPolicy)
st := &mockState{
@@ -207,7 +210,6 @@ func TestStaticPolicyAdd(t *testing.T) {
largeTopoCPUSet := cpuset.New(largeTopoCPUids...)
largeTopoSock0CPUSet := cpuset.New(largeTopoSock0CPUids...)
largeTopoSock1CPUSet := cpuset.New(largeTopoSock1CPUids...)
-
// these are the cases which must behave the same regardless the policy options.
// So we will permutate the options to ensure this holds true.
@@ -630,7 +632,8 @@ func runStaticPolicyTestCase(t *testing.T, testCase staticPolicyTest) {
cpus = testCase.reservedCPUs.Clone()
}
testExcl := false
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpus, tm, testCase.options, testExcl)
+ testDM, _ := devicemanager.NewManagerStub()
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpus, cpus, tm, testCase.options, testDM, testExcl)
st := &mockState{
assignments: testCase.stAssignments,
@@ -677,6 +680,8 @@ func runStaticPolicyTestCaseWithFeatureGate(t *testing.T, testCase staticPolicyT
}
func TestStaticPolicyReuseCPUs(t *testing.T) {
+ excludeReserved := false
+ testDM, _ := devicemanager.NewManagerStub()
testCases := []struct {
staticPolicyTest
expCSetAfterAlloc cpuset.CPUSet
@@ -701,7 +706,7 @@ func TestStaticPolicyReuseCPUs(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -733,6 +738,8 @@ func TestStaticPolicyReuseCPUs(t *testing.T) {
}
func TestStaticPolicyDoNotReuseCPUs(t *testing.T) {
+ excludeReserved := false
+ testDM, _ := devicemanager.NewManagerStub()
testCases := []struct {
staticPolicyTest
expCSetAfterAlloc cpuset.CPUSet
@@ -754,7 +761,7 @@ func TestStaticPolicyDoNotReuseCPUs(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -779,6 +786,7 @@ func TestStaticPolicyDoNotReuseCPUs(t *testing.T) {
func TestStaticPolicyRemove(t *testing.T) {
excludeReserved := false
+ testDM, _ := devicemanager.NewManagerStub()
testCases := []staticPolicyTest{
{
description: "SingleSocketHT, DeAllocOneContainer",
@@ -837,7 +845,7 @@ func TestStaticPolicyRemove(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, excludeReserved)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -860,6 +868,7 @@ func TestStaticPolicyRemove(t *testing.T) {
func TestTopologyAwareAllocateCPUs(t *testing.T) {
excludeReserved := false
+ testDM, _ := devicemanager.NewManagerStub()
testCases := []struct {
description string
topo *topology.CPUTopology
@@ -928,7 +937,8 @@ func TestTopologyAwareAllocateCPUs(t *testing.T) {
},
}
for _, tc := range testCases {
- p, _ := NewStaticPolicy(tc.topo, 0, cpuset.New(), topologymanager.NewFakeManager(), nil, excludeReserved)
+ p, _ := NewStaticPolicy(tc.topo, 0, cpuset.New(), cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, excludeReserved)
+
policy := p.(*staticPolicy)
st := &mockState{
assignments: tc.stAssignments,
@@ -961,6 +971,7 @@ type staticPolicyTestWithResvList struct {
topo *topology.CPUTopology
numReservedCPUs int
reserved cpuset.CPUSet
+ isolcpus cpuset.CPUSet
stAssignments state.ContainerCPUAssignments
stDefaultCPUSet cpuset.CPUSet
pod *v1.Pod
@@ -972,6 +983,8 @@ type staticPolicyTestWithResvList struct {
}
func TestStaticPolicyStartWithResvList(t *testing.T) {
+ testDM, _ := devicemanager.NewManagerStub()
+ testExcl := false
testCases := []staticPolicyTestWithResvList{
{
description: "empty cpuset",
@@ -1001,10 +1014,9 @@ func TestStaticPolicyStartWithResvList(t *testing.T) {
expNewErr: fmt.Errorf("[cpumanager] unable to reserve the required amount of CPUs (size of 0-1 did not equal 1)"),
},
}
- testExcl := false
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
- p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl)
+ p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, cpuset.New(), topologymanager.NewFakeManager(), nil, testDM, testExcl)
if !reflect.DeepEqual(err, testCase.expNewErr) {
t.Errorf("StaticPolicy Start() error (%v). expected error: %v but got: %v",
@@ -1053,6 +1065,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
topo: topoSingleSocketHT,
numReservedCPUs: 1,
reserved: cpuset.New(0),
+ isolcpus: cpuset.New(),
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.New(1, 2, 3, 4, 5, 6, 7),
pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"),
@@ -1066,6 +1079,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
topo: topoSingleSocketHT,
numReservedCPUs: 2,
reserved: cpuset.New(0, 1),
+ isolcpus: cpuset.New(),
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.New(2, 3, 4, 5, 6, 7),
pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"),
@@ -1079,6 +1093,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
topo: topoSingleSocketHT,
numReservedCPUs: 2,
reserved: cpuset.New(0, 1),
+ isolcpus: cpuset.New(),
stAssignments: state.ContainerCPUAssignments{
"fakePod": map[string]cpuset.CPUSet{
"fakeContainer100": cpuset.New(2, 3, 6, 7),
@@ -1096,6 +1111,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
topo: topoSingleSocketHT,
numReservedCPUs: 2,
reserved: cpuset.New(0, 1),
+ isolcpus: cpuset.New(),
stAssignments: state.ContainerCPUAssignments{
"fakePod": map[string]cpuset.CPUSet{
"fakeContainer100": cpuset.New(2, 3, 6, 7),
@@ -1108,11 +1124,30 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
expCPUAlloc: true,
expCSet: cpuset.New(0, 1),
},
+ {
+ description: "InfraPod, SingleSocketHT, Isolcpus, ExpectAllocReserved",
+ topo: topoSingleSocketHT,
+ numReservedCPUs: 2,
+ reserved: cpuset.New(0, 1),
+ isolcpus: cpuset.New(1),
+ stAssignments: state.ContainerCPUAssignments{
+ "fakePod": map[string]cpuset.CPUSet{
+ "fakeContainer100": cpuset.New(2, 3, 6, 7),
+ },
+ },
+ stDefaultCPUSet: cpuset.New(4, 5),
+ pod: infraPod,
+ isKubeInfraPodfunc: fakeIsKubeInfraTrue,
+ expErr: nil,
+ expCPUAlloc: true,
+ expCSet: cpuset.New(0),
+ },
}
testExcl := true
+ testDM, _ := devicemanager.NewManagerStub()
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, testCase.isolcpus, topologymanager.NewFakeManager(), nil, testDM, testExcl)
st := &mockState{
assignments: testCase.stAssignments,
diff --git a/pkg/kubelet/cm/devicemanager/manager_stub.go b/pkg/kubelet/cm/devicemanager/manager_stub.go
new file mode 100644
index 00000000000..f0e725c6c9f
--- /dev/null
+++ b/pkg/kubelet/cm/devicemanager/manager_stub.go
@@ -0,0 +1,110 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package devicemanager
+
+import (
+ v1 "k8s.io/api/core/v1"
+ "k8s.io/apimachinery/pkg/util/sets"
+ "k8s.io/kubernetes/pkg/kubelet/cm/containermap"
+ "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
+ "k8s.io/kubernetes/pkg/kubelet/config"
+ "k8s.io/kubernetes/pkg/kubelet/lifecycle"
+ "k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
+ schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
+)
+
+// ManagerStub provides a simple stub implementation for the Device Manager.
+type ManagerStub struct {
+ // containerMap provides a mapping from (pod, container) -> containerID
+ // for all containers in a pod. Used to detect pods running across a restart
+ containerMap containermap.ContainerMap
+
+ // containerRunningSet identifies which container among those present in `containerMap`
+ // was reported running by the container runtime when `containerMap` was computed.
+ // Used to detect pods running across a restart
+ containerRunningSet sets.Set[string]
+}
+
+// NewManagerStub creates a ManagerStub.
+func NewManagerStub() (*ManagerStub, error) {
+ return &ManagerStub{}, nil
+}
+
+// Start simply returns nil.
+func (h *ManagerStub) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, initialContainers containermap.ContainerMap, initialContainerRunningSet sets.Set[string]) error {
+ return nil
+}
+
+// Stop simply returns nil.
+func (h *ManagerStub) Stop() error {
+ return nil
+}
+
+// Allocate simply returns nil.
+func (h *ManagerStub) Allocate(pod *v1.Pod, container *v1.Container) error {
+ return nil
+}
+
+// UpdatePluginResources simply returns nil.
+func (h *ManagerStub) UpdatePluginResources(node *schedulerframework.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
+ return nil
+}
+
+// GetDeviceRunContainerOptions simply returns nil.
+func (h *ManagerStub) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) (*DeviceRunContainerOptions, error) {
+ return nil, nil
+}
+
+// GetCapacity simply returns nil capacity and empty removed resource list.
+func (h *ManagerStub) GetCapacity() (v1.ResourceList, v1.ResourceList, []string) {
+ return nil, nil, []string{}
+}
+
+// GetWatcherHandler returns plugin watcher interface
+func (h *ManagerStub) GetWatcherHandler() cache.PluginHandler {
+ return nil
+}
+
+// GetTopologyHints returns an empty TopologyHint map
+func (h *ManagerStub) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
+ return map[string][]topologymanager.TopologyHint{}
+}
+
+// GetPodTopologyHints returns an empty TopologyHint map
+func (h *ManagerStub) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint {
+ return map[string][]topologymanager.TopologyHint{}
+}
+
+// GetDevices returns nil
+func (h *ManagerStub) GetDevices(_, _ string) ResourceDeviceInstances {
+ return nil
+}
+
+// GetAllocatableDevices returns nothing
+func (h *ManagerStub) GetAllocatableDevices() ResourceDeviceInstances {
+ return nil
+}
+
+// ShouldResetExtendedResourceCapacity returns false
+func (h *ManagerStub) ShouldResetExtendedResourceCapacity() bool {
+ return false
+}
+
+// UpdateAllocatedDevices returns nothing
+func (h *ManagerStub) UpdateAllocatedDevices() {
+ return
+}
--
2.25.1
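
The manager_stub.go file above is a no-op implementation of the device manager's Manager interface, added so cpumanager unit tests can construct the static policy without the real device plugin machinery. A minimal sketch of how such a stub might be exercised from a test in the same package is shown below; the test name, pod, and container literals are illustrative and not part of the patch.

package devicemanager

import (
	"testing"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Illustrative only: confirms the stub behaves as an inert Manager.
func TestManagerStubIsInert(t *testing.T) {
	stub, err := NewManagerStub()
	if err != nil {
		t.Fatalf("NewManagerStub() returned error: %v", err)
	}

	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "demo", UID: "demo-uid"}}
	container := &v1.Container{Name: "c0"}

	// Allocate is a no-op and must not fail.
	if err := stub.Allocate(pod, container); err != nil {
		t.Errorf("Allocate() = %v, want nil", err)
	}

	// Topology hints are always empty, so the stub never constrains placement.
	if hints := stub.GetTopologyHints(pod, container); len(hints) != 0 {
		t.Errorf("GetTopologyHints() = %v, want empty map", hints)
	}
}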

View File

@ -0,0 +1,366 @@
From bd2514a9b62c61f2e98f199de98dca76348a8891 Mon Sep 17 00:00:00 2001
From: Boovan Rajendran <boovan.rajendran@windriver.com>
Date: Wed, 6 Mar 2024 03:50:23 -0500
Subject: [PATCH] kubelet cpumanager keep normal containers off reserved CPUs
When starting the kubelet process, two separate sets of reserved CPUs
may be specified. With this change, CPUs reserved via
'--system-reserved=cpu' or '--kube-reserved=cpu' will be ignored by
Kubernetes itself. This explicitly excludes the reserved CPUs from the
default CPUSet so that pods cannot run there.
Co-authored-by: Jim Gauld <james.gauld@windriver.com>
Signed-off-by: Sachin Gopala Krishna <saching.krishna@windriver.com>
Signed-off-by: Ramesh Kumar Sivanandam <rameshkumar.sivanandam@windriver.com>
Signed-off-by: Boovan Rajendran <boovan.rajendran@windriver.com>
---
pkg/kubelet/cm/cpumanager/cpu_manager.go | 6 ++-
pkg/kubelet/cm/cpumanager/cpu_manager_test.go | 19 ++++++---
pkg/kubelet/cm/cpumanager/policy_static.go | 30 +++++++++++---
.../cm/cpumanager/policy_static_test.go | 40 ++++++++++++++-----
4 files changed, 72 insertions(+), 23 deletions(-)
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go
index 2d1b6eefebe..e0a359932b7 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go
@@ -192,7 +192,11 @@ func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconc
// exclusively allocated.
reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000
numReservedCPUs := int(math.Ceil(reservedCPUsFloat))
- policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, cpuPolicyOptions)
+ // NOTE: Set excludeReserved unconditionally to exclude reserved CPUs from default cpuset.
+ // This variable is primarily to make testing easier.
+ excludeReserved := true
+ policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, cpuPolicyOptions, excludeReserved)
+
if err != nil {
return nil, fmt.Errorf("new static policy error: %w", err)
}
diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
index f6563bbca23..e5f5d07a2ad 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_manager_test.go
@@ -263,6 +263,7 @@ func makeMultiContainerPodWithOptions(initCPUs, appCPUs []*containerOptions) *v1
}
func TestCPUManagerAdd(t *testing.T) {
+ testExcl := false
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
NumCPUs: 4,
@@ -278,7 +279,8 @@ func TestCPUManagerAdd(t *testing.T) {
0,
cpuset.New(),
topologymanager.NewFakeManager(),
- nil)
+ nil,
+ testExcl)
testCases := []struct {
description string
updateErr error
@@ -527,8 +529,9 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) {
},
}
+ testExcl := false
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testExcl)
mockState := &mockState{
assignments: testCase.stAssignments,
@@ -753,6 +756,7 @@ func TestCPUManagerRemove(t *testing.T) {
}
func TestReconcileState(t *testing.T) {
+ testExcl := false
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
NumCPUs: 8,
@@ -772,7 +776,8 @@ func TestReconcileState(t *testing.T) {
0,
cpuset.New(),
topologymanager.NewFakeManager(),
- nil)
+ nil,
+ testExcl)
testCases := []struct {
description string
@@ -1276,6 +1281,7 @@ func TestReconcileState(t *testing.T) {
// above test cases are without kubelet --reserved-cpus cmd option
// the following tests are with --reserved-cpus configured
func TestCPUManagerAddWithResvList(t *testing.T) {
+ testExcl := false
testPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
NumCPUs: 4,
@@ -1291,7 +1297,8 @@ func TestCPUManagerAddWithResvList(t *testing.T) {
1,
cpuset.New(0),
topologymanager.NewFakeManager(),
- nil)
+ nil,
+ testExcl)
testCases := []struct {
description string
updateErr error
@@ -1416,6 +1423,7 @@ func TestCPUManagerHandlePolicyOptions(t *testing.T) {
}
func TestCPUManagerGetAllocatableCPUs(t *testing.T) {
+ testExcl := false
nonePolicy, _ := NewNonePolicy(nil)
staticPolicy, _ := NewStaticPolicy(
&topology.CPUTopology{
@@ -1432,7 +1440,8 @@ func TestCPUManagerGetAllocatableCPUs(t *testing.T) {
1,
cpuset.New(0),
topologymanager.NewFakeManager(),
- nil)
+ nil,
+ testExcl)
testCases := []struct {
description string
diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go
index d22a6a64d5e..9690eedab58 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static.go
@@ -106,6 +106,8 @@ type staticPolicy struct {
topology *topology.CPUTopology
// set of CPUs that is not available for exclusive assignment
reservedCPUs cpuset.CPUSet
+ // If true, default CPUSet should exclude reserved CPUs
+ excludeReserved bool
// Superset of reservedCPUs. It includes not just the reservedCPUs themselves,
// but also any siblings of those reservedCPUs on the same physical die.
// NOTE: If the reserved set includes full physical CPUs from the beginning
@@ -126,7 +128,7 @@ var _ Policy = &staticPolicy{}
// NewStaticPolicy returns a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
-func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string) (Policy, error) {
+func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string, excludeReserved bool) (Policy, error) {
opts, err := NewStaticPolicyOptions(cpuPolicyOptions)
if err != nil {
return nil, err
@@ -141,6 +143,7 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv
policy := &staticPolicy{
topology: topology,
affinity: affinity,
+ excludeReserved: excludeReserved,
cpusToReuse: make(map[string]cpuset.CPUSet),
options: opts,
}
@@ -202,7 +205,15 @@ func (p *staticPolicy) validateState(s state.State) error {
}
// state is empty initialize
allCPUs := p.topology.CPUDetails.CPUs()
- s.SetDefaultCPUSet(allCPUs)
+ if p.excludeReserved {
+ // Exclude reserved CPUs from the default CPUSet to keep containers off them
+ // unless explicitly affined.
+ s.SetDefaultCPUSet(allCPUs.Difference(p.reservedCPUs))
+ } else {
+ s.SetDefaultCPUSet(allCPUs)
+ }
+ klog.Infof("[cpumanager] static policy: CPUSet: allCPUs:%v, reserved:%v, default:%v\n",
+ allCPUs, p.reservedCPUs, s.GetDefaultCPUSet())
return nil
}
@@ -210,11 +221,12 @@ func (p *staticPolicy) validateState(s state.State) error {
// 1. Check if the reserved cpuset is not part of default cpuset because:
// - kube/system reserved have changed (increased) - may lead to some containers not being able to start
// - user tampered with file
- if !p.reservedCPUs.Intersection(tmpDefaultCPUset).Equals(p.reservedCPUs) {
- return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
- p.reservedCPUs.String(), tmpDefaultCPUset.String())
+ if !p.excludeReserved {
+ if !p.reservedCPUs.Intersection(tmpDefaultCPUset).Equals(p.reservedCPUs) {
+ return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
+ p.reservedCPUs.String(), tmpDefaultCPUset.String())
+ }
}
-
// 2. Check if state for static policy is consistent
for pod := range tmpAssignments {
for container, cset := range tmpAssignments[pod] {
@@ -241,6 +253,9 @@ func (p *staticPolicy) validateState(s state.State) error {
}
}
totalKnownCPUs = totalKnownCPUs.Union(tmpCPUSets...)
+ if p.excludeReserved {
+ totalKnownCPUs = totalKnownCPUs.Union(p.reservedCPUs)
+ }
if !totalKnownCPUs.Equals(p.topology.CPUDetails.CPUs()) {
return fmt.Errorf("current set of available CPUs \"%s\" doesn't match with CPUs in state \"%s\"",
p.topology.CPUDetails.CPUs().String(), totalKnownCPUs.String())
@@ -381,6 +396,9 @@ func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerNa
cpusInUse := getAssignedCPUsOfSiblings(s, podUID, containerName)
if toRelease, ok := s.GetCPUSet(podUID, containerName); ok {
s.Delete(podUID, containerName)
+ if p.excludeReserved {
+ toRelease = toRelease.Difference(p.reservedCPUs)
+ }
// Mutate the shared pool, adding released cpus.
toRelease = toRelease.Difference(cpusInUse)
s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease))
diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go
index 8cc3fc2be74..b060201e156 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go
@@ -36,6 +36,7 @@ type staticPolicyTest struct {
description string
topo *topology.CPUTopology
numReservedCPUs int
+ excludeReserved bool
reservedCPUs *cpuset.CPUSet
podUID string
options map[string]string
@@ -69,7 +70,8 @@ func (spt staticPolicyTest) PseudoClone() staticPolicyTest {
}
func TestStaticPolicyName(t *testing.T) {
- policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.New(), topologymanager.NewFakeManager(), nil)
+ testExcl := false
+ policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.New(), topologymanager.NewFakeManager(), nil, testExcl)
policyName := policy.Name()
if policyName != "static" {
@@ -99,6 +101,15 @@ func TestStaticPolicyStart(t *testing.T) {
stDefaultCPUSet: cpuset.New(),
expCSet: cpuset.New(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
},
+ {
+ description: "empty cpuset exclude reserved",
+ topo: topoDualSocketHT,
+ numReservedCPUs: 2,
+ excludeReserved: true,
+ stAssignments: state.ContainerCPUAssignments{},
+ stDefaultCPUSet: cpuset.New(),
+ expCSet: cpuset.New(1, 2, 3, 4, 5, 7, 8, 9, 10, 11),
+ },
{
description: "reserved cores 0 & 6 are not present in available cpuset",
topo: topoDualSocketHT,
@@ -145,7 +156,8 @@ func TestStaticPolicyStart(t *testing.T) {
}
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
- p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil)
+ p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved)
+
policy := p.(*staticPolicy)
st := &mockState{
assignments: testCase.stAssignments,
@@ -614,7 +626,8 @@ func runStaticPolicyTestCase(t *testing.T, testCase staticPolicyTest) {
if testCase.reservedCPUs != nil {
cpus = testCase.reservedCPUs.Clone()
}
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpus, tm, testCase.options)
+ testExcl := false
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpus, tm, testCase.options, testExcl)
st := &mockState{
assignments: testCase.stAssignments,
@@ -685,7 +698,7 @@ func TestStaticPolicyReuseCPUs(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -738,7 +751,7 @@ func TestStaticPolicyDoNotReuseCPUs(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, testCase.excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -762,6 +775,7 @@ func TestStaticPolicyDoNotReuseCPUs(t *testing.T) {
}
func TestStaticPolicyRemove(t *testing.T) {
+ excludeReserved := false
testCases := []staticPolicyTest{
{
description: "SingleSocketHT, DeAllocOneContainer",
@@ -820,7 +834,7 @@ func TestStaticPolicyRemove(t *testing.T) {
}
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.New(), topologymanager.NewFakeManager(), nil, excludeReserved)
st := &mockState{
assignments: testCase.stAssignments,
@@ -842,6 +856,7 @@ func TestStaticPolicyRemove(t *testing.T) {
}
func TestTopologyAwareAllocateCPUs(t *testing.T) {
+ excludeReserved := false
testCases := []struct {
description string
topo *topology.CPUTopology
@@ -910,7 +925,7 @@ func TestTopologyAwareAllocateCPUs(t *testing.T) {
},
}
for _, tc := range testCases {
- p, _ := NewStaticPolicy(tc.topo, 0, cpuset.New(), topologymanager.NewFakeManager(), nil)
+ p, _ := NewStaticPolicy(tc.topo, 0, cpuset.New(), topologymanager.NewFakeManager(), nil, excludeReserved)
policy := p.(*staticPolicy)
st := &mockState{
assignments: tc.stAssignments,
@@ -982,9 +997,11 @@ func TestStaticPolicyStartWithResvList(t *testing.T) {
expNewErr: fmt.Errorf("[cpumanager] unable to reserve the required amount of CPUs (size of 0-1 did not equal 1)"),
},
}
+ testExcl := false
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {
- p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil)
+ p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl)
+
if !reflect.DeepEqual(err, testCase.expNewErr) {
t.Errorf("StaticPolicy Start() error (%v). expected error: %v but got: %v",
testCase.description, testCase.expNewErr, err)
@@ -1024,7 +1041,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
numReservedCPUs: 1,
reserved: cpuset.New(0),
stAssignments: state.ContainerCPUAssignments{},
- stDefaultCPUSet: cpuset.New(0, 1, 2, 3, 4, 5, 6, 7),
+ stDefaultCPUSet: cpuset.New(1, 2, 3, 4, 5, 6, 7),
pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"),
expErr: fmt.Errorf("not enough cpus available to satisfy request: requested=8, available=7"),
expCPUAlloc: false,
@@ -1036,7 +1053,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
numReservedCPUs: 2,
reserved: cpuset.New(0, 1),
stAssignments: state.ContainerCPUAssignments{},
- stDefaultCPUSet: cpuset.New(0, 1, 2, 3, 4, 5, 6, 7),
+ stDefaultCPUSet: cpuset.New(2, 3, 4, 5, 6, 7),
pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"),
expErr: nil,
expCPUAlloc: true,
@@ -1060,8 +1077,9 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
},
}
+ testExcl := true
for _, testCase := range testCases {
- policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil)
+ policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager(), nil, testExcl)
st := &mockState{
assignments: testCase.stAssignments,
--
2.25.1
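
For reference, a minimal standalone sketch of the cpuset arithmetic this patch performs when excludeReserved is true. The CPU numbers are illustrative and assume an 8-CPU node with CPUs 0-1 reserved; they are not taken from the patch.

package main

import (
	"fmt"

	"k8s.io/utils/cpuset"
)

func main() {
	// Illustrative topology: 8 CPUs, with 0-1 set aside via
	// --system-reserved=cpu or --kube-reserved=cpu.
	allCPUs := cpuset.New(0, 1, 2, 3, 4, 5, 6, 7)
	reserved := cpuset.New(0, 1)

	// With excludeReserved=true the static policy seeds the default
	// (shared) cpuset with everything except the reserved CPUs, so
	// normal containers can never land on them.
	defaultSet := allCPUs.Difference(reserved)
	fmt.Printf("allCPUs=%v reserved=%v default=%v\n", allCPUs, reserved, defaultSet)
}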

View File

@ -0,0 +1,556 @@
From 4a28be032c61624f9b16d69ae3a7f699b6a62f51 Mon Sep 17 00:00:00 2001
From: Boovan Rajendran <boovan.rajendran@windriver.com>
Date: Tue, 5 Sep 2023 06:27:39 -0400
Subject: [PATCH] Namespace-Label-Based Pod Identification and CPU Management
for Infra Pods
This change assigns system infrastructure pods to the "reserved"
cpuset to isolate them from the shared pool of CPUs. Kubernetes
infrastructure pods are identified by the 'kube-system' namespace;
platform pods are identified by the 'kube-system' namespace or by the
'app.starlingx.io/component=platform' label.
The platform and infrastructure pods are given the reserved CPU
affinity cpuset when the CPU manager is configured with the 'static'
policy.
This implementation assumes that the "reserved" cpuset is large
enough to handle all infrastructure and platform pods' CPU
allocations, and it prevents the platform pods from running on
application/isolated CPUs regardless of their QoS class.
Co-authored-by: Jim Gauld <james.gauld@windriver.com>
Signed-off-by: Gleb Aronsky <gleb.aronsky@windriver.com>
Signed-off-by: Thiago Miranda <ThiagoOliveira.Miranda@windriver.com>
Signed-off-by: Kaustubh Dhokte <kaustubh.dhokte@windriver.com>
Signed-off-by: Ramesh Kumar Sivanandam <rameshkumar.sivanandam@windriver.com>
Signed-off-by: Sachin Gopala Krishna <saching.krishna@windriver.com>
Signed-off-by: Boovan Rajendran <boovan.rajendran@windriver.com>
Signed-off-by: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
---
pkg/kubelet/cm/cpumanager/policy_static.go | 121 ++++++++-
.../cm/cpumanager/policy_static_test.go | 233 +++++++++++++++---
.../cm/cpumanager/topology_hints_test.go | 4 +
3 files changed, 316 insertions(+), 42 deletions(-)
diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go
index 9690eedab58..fff571491f0 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static.go
@@ -17,11 +17,17 @@ limitations under the License.
package cpumanager
import (
+ "context"
"fmt"
v1 "k8s.io/api/core/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
+ k8sclient "k8s.io/client-go/kubernetes"
+ restclient "k8s.io/client-go/rest"
+ "k8s.io/client-go/tools/clientcmd"
"k8s.io/klog/v2"
+ "k8s.io/kubernetes/cmd/kubeadm/app/constants"
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
"k8s.io/kubernetes/pkg/features"
@@ -44,6 +50,22 @@ const (
ErrorSMTAlignment = "SMTAlignmentError"
)
+// Declared as variables so that they can be more easily
+// overridden during testing
+type getPodNamespace func(string) (*v1.Namespace, error)
+type buildFromConfigFlag func(masterUrl string, kubeconfigPath string) (*restclient.Config, error)
+type isKubeInfraFunc func(pod *v1.Pod) bool
+
+var varGetNamespaceObject getPodNamespace
+var varBuildConfigFromFlags buildFromConfigFlag
+var varIsKubeInfra isKubeInfraFunc
+
+func init() {
+ varIsKubeInfra = isKubeInfra
+ varGetNamespaceObject = getPodNamespaceObject
+ varBuildConfigFromFlags = clientcmd.BuildConfigFromFlags
+}
+
// SMTAlignmentError represents an error due to SMT alignment
type SMTAlignmentError struct {
RequestedCPUs int
@@ -141,11 +163,11 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv
klog.InfoS("Static policy created with configuration", "options", opts)
policy := &staticPolicy{
- topology: topology,
- affinity: affinity,
+ topology: topology,
+ affinity: affinity,
excludeReserved: excludeReserved,
- cpusToReuse: make(map[string]cpuset.CPUSet),
- options: opts,
+ cpusToReuse: make(map[string]cpuset.CPUSet),
+ options: opts,
}
allCPUs := topology.CPUDetails.CPUs()
@@ -223,8 +245,8 @@ func (p *staticPolicy) validateState(s state.State) error {
// - user tampered with file
if !p.excludeReserved {
if !p.reservedCPUs.Intersection(tmpDefaultCPUset).Equals(p.reservedCPUs) {
- return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
- p.reservedCPUs.String(), tmpDefaultCPUset.String())
+ return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
+ p.reservedCPUs.String(), tmpDefaultCPUset.String())
}
}
// 2. Check if state for static policy is consistent
@@ -309,6 +331,25 @@ func (p *staticPolicy) updateCPUsToReuse(pod *v1.Pod, container *v1.Container, c
}
func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) {
+ // Process infra pods before guaranteed pods
+ if varIsKubeInfra(pod) {
+ // Container belongs in reserved pool.
+ // We don't want to fall through to the p.guaranteedCPUs() clause below so return either nil or error.
+ if _, ok := s.GetCPUSet(string(pod.UID), container.Name); ok {
+ klog.Infof("[cpumanager] static policy: reserved container already present in state, skipping (namespace: %s, pod UID: %s, pod: %s, container: %s)", pod.Namespace, string(pod.UID), pod.Name, container.Name)
+ return nil
+ }
+
+ cpuset := p.reservedCPUs
+ if cpuset.IsEmpty() {
+ // If this happens then someone messed up.
+ return fmt.Errorf("[cpumanager] static policy: reserved container unable to allocate cpus (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v, reserved:%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset, p.reservedCPUs)
+ }
+ s.SetCPUSet(string(pod.UID), container.Name, cpuset)
+ klog.Infof("[cpumanager] static policy: reserved: AddContainer (namespace: %s, pod UID: %s, pod: %s, container: %s); cpuset=%v", pod.Namespace, string(pod.UID), pod.Name, container.Name, cpuset)
+ return nil
+ }
+
numCPUs := p.guaranteedCPUs(pod, container)
if numCPUs == 0 {
// container belongs in the shared pool (nothing to do; use default cpuset)
@@ -460,6 +501,10 @@ func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int
if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() {
return 0
}
+ // Infrastructure pods use reserved CPUs even if they're in the Guaranteed QoS class
+ if varIsKubeInfra(pod) {
+ return 0
+ }
// Safe downcast to do for all systems with < 2.1 billion CPUs.
// Per the language spec, `int` is guaranteed to be at least 32 bits wide.
// https://golang.org/ref/spec#Numeric_types
@@ -685,6 +730,70 @@ func (p *staticPolicy) generateCPUTopologyHints(availableCPUs cpuset.CPUSet, reu
return hints
}
+func getPodNamespaceObject(podNamespaceName string) (*v1.Namespace, error) {
+
+ kubeConfigPath := constants.GetKubeletKubeConfigPath()
+ cfg, err := varBuildConfigFromFlags("", kubeConfigPath)
+ if err != nil {
+ klog.Error("Failed to build client config from ", kubeConfigPath, err.Error())
+ return nil, err
+ }
+
+ clientset, err := k8sclient.NewForConfig(cfg)
+ if err != nil {
+ klog.Error("Failed to get clientset for KUBECONFIG ", kubeConfigPath, err.Error())
+ return nil, err
+ }
+
+ namespaceObj, err := clientset.CoreV1().Namespaces().Get(context.TODO(), podNamespaceName, metav1.GetOptions{})
+ if err != nil {
+ klog.Error("Error getting namespace object:", err.Error())
+ return nil, err
+ }
+
+ return namespaceObj, nil
+
+}
+
+// check if a given pod is labelled as platform pod or
+// is in a namespace labelled as a platform namespace
+func isKubeInfra(pod *v1.Pod) bool {
+
+ podName := pod.GetName()
+ podNamespaceName := pod.GetNamespace()
+
+ if podNamespaceName == "kube-system" {
+ klog.Infof("Pod %s has %s namespace. Treating as platform pod.", podName, podNamespaceName)
+ return true
+ }
+
+ klog.InfoS("Checking pod ", podName, " for label 'app.starlingx.io/component=platform'.")
+ podLabels := pod.GetLabels()
+ val, ok := podLabels["app.starlingx.io/component"]
+ if ok && val == "platform" {
+ klog.InfoS("Pod ", podName, " has 'app.starlingx.io/component=platform' label. Treating as platform pod.")
+ return true
+ }
+
+ klog.V(4).InfoS("Pod ", pod.GetName(), " does not have 'app.starlingx.io/component=platform' label. Checking its namespace information...")
+
+ namespaceObj, err := varGetNamespaceObject(podNamespaceName)
+ if err != nil {
+ return false
+ }
+
+ namespaceLabels := namespaceObj.GetLabels()
+ val, ok = namespaceLabels["app.starlingx.io/component"]
+ if ok && val == "platform" {
+ klog.InfoS("For pod: ", podName, ", its Namespace ", podNamespaceName, " has 'app.starlingx.io/component=platform' label. Treating as platform pod.")
+ return true
+ }
+
+ klog.InfoS("Neither pod ", podName, " nor its namespace ", podNamespaceName, " has 'app.starlingx.io/component=platform' label. Not treating as platform pod.")
+ return false
+
+}
+
// isHintSocketAligned function return true if numa nodes in hint are socket aligned.
func (p *staticPolicy) isHintSocketAligned(hint topologymanager.TopologyHint, minAffinitySize int) bool {
numaNodesBitMask := hint.NUMANodeAffinity.GetBits()
diff --git a/pkg/kubelet/cm/cpumanager/policy_static_test.go b/pkg/kubelet/cm/cpumanager/policy_static_test.go
index b060201e156..83614619550 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static_test.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static_test.go
@@ -17,12 +17,15 @@ limitations under the License.
package cpumanager
import (
+ "errors"
"fmt"
"reflect"
"testing"
v1 "k8s.io/api/core/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
+ restclient "k8s.io/client-go/rest"
featuregatetesting "k8s.io/component-base/featuregate/testing"
pkgfeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
@@ -109,7 +112,7 @@ func TestStaticPolicyStart(t *testing.T) {
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.New(),
expCSet: cpuset.New(1, 2, 3, 4, 5, 7, 8, 9, 10, 11),
- },
+ },
{
description: "reserved cores 0 & 6 are not present in available cpuset",
topo: topoDualSocketHT,
@@ -954,17 +957,18 @@ func TestTopologyAwareAllocateCPUs(t *testing.T) {
// above test cases are without kubelet --reserved-cpus cmd option
// the following tests are with --reserved-cpus configured
type staticPolicyTestWithResvList struct {
- description string
- topo *topology.CPUTopology
- numReservedCPUs int
- reserved cpuset.CPUSet
- stAssignments state.ContainerCPUAssignments
- stDefaultCPUSet cpuset.CPUSet
- pod *v1.Pod
- expErr error
- expNewErr error
- expCPUAlloc bool
- expCSet cpuset.CPUSet
+ description string
+ topo *topology.CPUTopology
+ numReservedCPUs int
+ reserved cpuset.CPUSet
+ stAssignments state.ContainerCPUAssignments
+ stDefaultCPUSet cpuset.CPUSet
+ pod *v1.Pod
+ isKubeInfraPodfunc isKubeInfraFunc
+ expErr error
+ expNewErr error
+ expCPUAlloc bool
+ expCSet cpuset.CPUSet
}
func TestStaticPolicyStartWithResvList(t *testing.T) {
@@ -1032,35 +1036,63 @@ func TestStaticPolicyStartWithResvList(t *testing.T) {
}
}
-func TestStaticPolicyAddWithResvList(t *testing.T) {
+func fakeIsKubeInfraTrue(pod *v1.Pod) bool {
+ return true
+}
+func fakeIsKubeInfraFalse(pod *v1.Pod) bool {
+ return false
+}
+
+func TestStaticPolicyAddWithResvList(t *testing.T) {
+ infraPod := makePod("fakePod", "fakeContainer2", "200m", "200m")
+ infraPod.Namespace = "kube-system"
testCases := []staticPolicyTestWithResvList{
{
- description: "GuPodSingleCore, SingleSocketHT, ExpectError",
- topo: topoSingleSocketHT,
- numReservedCPUs: 1,
- reserved: cpuset.New(0),
- stAssignments: state.ContainerCPUAssignments{},
- stDefaultCPUSet: cpuset.New(1, 2, 3, 4, 5, 6, 7),
- pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"),
- expErr: fmt.Errorf("not enough cpus available to satisfy request: requested=8, available=7"),
- expCPUAlloc: false,
- expCSet: cpuset.New(),
+ description: "GuPodSingleCore, SingleSocketHT, ExpectError",
+ topo: topoSingleSocketHT,
+ numReservedCPUs: 1,
+ reserved: cpuset.New(0),
+ stAssignments: state.ContainerCPUAssignments{},
+ stDefaultCPUSet: cpuset.New(1, 2, 3, 4, 5, 6, 7),
+ pod: makePod("fakePod", "fakeContainer2", "8000m", "8000m"),
+ isKubeInfraPodfunc: fakeIsKubeInfraFalse,
+ expErr: fmt.Errorf("not enough cpus available to satisfy request: requested=8, available=7"),
+ expCPUAlloc: false,
+ expCSet: cpuset.New(),
},
{
- description: "GuPodSingleCore, SingleSocketHT, ExpectAllocOneCPU",
+ description: "GuPodSingleCore, SingleSocketHT, ExpectAllocOneCPU",
+ topo: topoSingleSocketHT,
+ numReservedCPUs: 2,
+ reserved: cpuset.New(0, 1),
+ stAssignments: state.ContainerCPUAssignments{},
+ stDefaultCPUSet: cpuset.New(2, 3, 4, 5, 6, 7),
+ pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"),
+ isKubeInfraPodfunc: fakeIsKubeInfraFalse,
+ expErr: nil,
+ expCPUAlloc: true,
+ expCSet: cpuset.New(4), // expect sibling of partial core
+ },
+ {
+ description: "GuPodMultipleCores, SingleSocketHT, ExpectAllocOneCore",
topo: topoSingleSocketHT,
numReservedCPUs: 2,
reserved: cpuset.New(0, 1),
- stAssignments: state.ContainerCPUAssignments{},
- stDefaultCPUSet: cpuset.New(2, 3, 4, 5, 6, 7),
- pod: makePod("fakePod", "fakeContainer2", "1000m", "1000m"),
- expErr: nil,
- expCPUAlloc: true,
- expCSet: cpuset.New(4), // expect sibling of partial core
+ stAssignments: state.ContainerCPUAssignments{
+ "fakePod": map[string]cpuset.CPUSet{
+ "fakeContainer100": cpuset.New(2, 3, 6, 7),
+ },
+ },
+ stDefaultCPUSet: cpuset.New(0, 1, 4, 5),
+ pod: makePod("fakePod", "fakeContainer3", "2000m", "2000m"),
+ isKubeInfraPodfunc: fakeIsKubeInfraFalse,
+ expErr: nil,
+ expCPUAlloc: true,
+ expCSet: cpuset.New(4, 5),
},
{
- description: "GuPodMultipleCores, SingleSocketHT, ExpectAllocOneCore",
+ description: "InfraPod, SingleSocketHT, ExpectAllocReserved",
topo: topoSingleSocketHT,
numReservedCPUs: 2,
reserved: cpuset.New(0, 1),
@@ -1069,11 +1101,12 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
"fakeContainer100": cpuset.New(2, 3, 6, 7),
},
},
- stDefaultCPUSet: cpuset.New(0, 1, 4, 5),
- pod: makePod("fakePod", "fakeContainer3", "2000m", "2000m"),
- expErr: nil,
- expCPUAlloc: true,
- expCSet: cpuset.New(4, 5),
+ stDefaultCPUSet: cpuset.New(4, 5),
+ pod: infraPod,
+ isKubeInfraPodfunc: fakeIsKubeInfraTrue,
+ expErr: nil,
+ expCPUAlloc: true,
+ expCSet: cpuset.New(0, 1),
},
}
@@ -1086,6 +1119,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
defaultCPUSet: testCase.stDefaultCPUSet,
}
+ varIsKubeInfra = testCase.isKubeInfraPodfunc
container := &testCase.pod.Spec.Containers[0]
err := policy.Allocate(st, testCase.pod, container)
if !reflect.DeepEqual(err, testCase.expErr) {
@@ -1206,6 +1240,133 @@ func TestStaticPolicyOptions(t *testing.T) {
}
}
+func makePodWithLabels(podLabels map[string]string) *v1.Pod {
+ return &v1.Pod{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "test-pod",
+ Namespace: "test-namespace",
+ Labels: podLabels,
+ },
+ }
+}
+
+func fakeBuildConfigFromFlags(masterUrl string, kubeconfigPath string) (*restclient.Config, error) {
+
+ return &restclient.Config{}, nil
+}
+
+func fakeBuildConfigFromFlagsError(masterUrl string, kubeconfigPath string) (*restclient.Config, error) {
+
+ errString := fmt.Sprintf("%s file not found", kubeconfigPath)
+ return nil, errors.New(errString)
+
+}
+
+func getFakeInfraPodNamespace(_ string) (*v1.Namespace, error) {
+
+ return &v1.Namespace{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "test-namespace",
+ Labels: map[string]string{
+ "app.starlingx.io/component": "platform",
+ },
+ }}, nil
+}
+
+func getFakeNonInfraPodNamespace(_ string) (*v1.Namespace, error) {
+
+ return &v1.Namespace{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "test-namespace",
+ Labels: map[string]string{
+ "fake": "label",
+ }}}, nil
+
+}
+
+type kubeInfraPodTestCase struct {
+ description string
+ pod *v1.Pod
+ namespaceFunc getPodNamespace
+ expectedValue bool
+}
+
+func TestKubeInfraPod(t *testing.T) {
+ testCases := []kubeInfraPodTestCase{
+ {
+ description: "Pod with platform label and namespace with platform label",
+ pod: makePodWithLabels(map[string]string{
+ "app.starlingx.io/component": "platform",
+ }),
+ namespaceFunc: getFakeInfraPodNamespace,
+ expectedValue: true,
+ },
+ {
+ description: "Pod with platform label and namespace without platform label",
+ pod: makePodWithLabels(map[string]string{
+ "app.starlingx.io/component": "platform",
+ }),
+ namespaceFunc: getFakeNonInfraPodNamespace,
+ expectedValue: true,
+ },
+ {
+ description: "Pod without platform label and namespace with platform label",
+ pod: makePodWithLabels(map[string]string{
+ "test": "label",
+ }),
+ namespaceFunc: getFakeInfraPodNamespace,
+ expectedValue: true,
+ },
+ {
+ description: "Pod without platform label and namespace without platform label",
+ pod: makePodWithLabels(map[string]string{
+ "test": "namespace",
+ }),
+ namespaceFunc: getFakeNonInfraPodNamespace,
+ expectedValue: false,
+ },
+ }
+
+ for _, testCase := range testCases {
+ t.Run(testCase.description, func(t *testing.T) {
+
+ varGetNamespaceObject = testCase.namespaceFunc
+ varBuildConfigFromFlags = fakeBuildConfigFromFlags
+ gotValue := isKubeInfra(testCase.pod)
+
+ if gotValue != testCase.expectedValue {
+ t.Errorf("StaticPolicy isKubeInfraPod() error %v. expected value %v actual value %v",
+ testCase.description, testCase.expectedValue, gotValue)
+ } else {
+ fmt.Printf("StaticPolicy isKubeInfraPod() test successful. : %v ", testCase.description)
+ }
+
+ })
+ }
+
+ test := kubeInfraPodTestCase{
+ description: "Failure reading kubeconfig file",
+ pod: makePodWithLabels(map[string]string{
+ "test": "namespace",
+ }),
+ namespaceFunc: getFakeNonInfraPodNamespace,
+ expectedValue: false,
+ }
+
+ varGetNamespaceObject = getPodNamespaceObject
+ varBuildConfigFromFlags = fakeBuildConfigFromFlagsError
+
+ gotValue := isKubeInfra(test.pod)
+
+ if gotValue != test.expectedValue {
+ t.Errorf("StaticPolicy isKubeInfraPod() error %v. expected value %v actual value %v",
+ test.description, test.expectedValue, gotValue)
+ } else {
+ fmt.Printf("StaticPolicy isKubeInfraPod() test successful. : %v ", test.description)
+ }
+
+}
+
func newCPUSetPtr(cpus ...int) *cpuset.CPUSet {
ret := cpuset.New(cpus...)
return &ret
diff --git a/pkg/kubelet/cm/cpumanager/topology_hints_test.go b/pkg/kubelet/cm/cpumanager/topology_hints_test.go
index 53738b613c2..ad9c0f17602 100644
--- a/pkg/kubelet/cm/cpumanager/topology_hints_test.go
+++ b/pkg/kubelet/cm/cpumanager/topology_hints_test.go
@@ -197,6 +197,7 @@ func TestPodGuaranteedCPUs(t *testing.T) {
expectedCPU: 210,
},
}
+ varIsKubeInfra = fakeIsKubeInfraFalse
for _, tc := range tcases {
t.Run(tc.name, func(t *testing.T) {
requestedCPU := p.podGuaranteedCPUs(tc.pod)
@@ -241,6 +242,7 @@ func TestGetTopologyHints(t *testing.T) {
sourcesReady: &sourcesReadyStub{},
}
+ varIsKubeInfra = fakeIsKubeInfraFalse
hints := m.GetTopologyHints(&tc.pod, &tc.container)[string(v1.ResourceCPU)]
if len(tc.expectedHints) == 0 && len(hints) == 0 {
continue
@@ -294,6 +296,7 @@ func TestGetPodTopologyHints(t *testing.T) {
sourcesReady: &sourcesReadyStub{},
}
+ varIsKubeInfra = fakeIsKubeInfraFalse
podHints := m.GetPodTopologyHints(&tc.pod)[string(v1.ResourceCPU)]
if len(tc.expectedHints) == 0 && len(podHints) == 0 {
continue
@@ -477,6 +480,7 @@ func TestGetPodTopologyHintsWithPolicyOptions(t *testing.T) {
sourcesReady: &sourcesReadyStub{},
}
+ varIsKubeInfra = fakeIsKubeInfraFalse
podHints := m.GetPodTopologyHints(&testCase.pod)[string(v1.ResourceCPU)]
sort.SliceStable(podHints, func(i, j int) bool {
return podHints[i].LessThan(podHints[j])
--
2.25.1
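
For reference, a minimal sketch of the classification order used by isKubeInfra above: the 'kube-system' namespace first, then the pod label, then the namespace label. The namespace object is passed in directly instead of being fetched through client-go, and the helper name and example objects are illustrative only.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const platformLabelKey = "app.starlingx.io/component"

// isPlatformPod mirrors the decision order of isKubeInfra in the patch:
// kube-system namespace, then the pod label, then the namespace label.
func isPlatformPod(pod *v1.Pod, ns *v1.Namespace) bool {
	if pod.GetNamespace() == "kube-system" {
		return true
	}
	if pod.GetLabels()[platformLabelKey] == "platform" {
		return true
	}
	return ns != nil && ns.GetLabels()[platformLabelKey] == "platform"
}

func main() {
	ns := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{
		Name:   "monitoring",
		Labels: map[string]string{platformLabelKey: "platform"},
	}}
	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{
		Name:      "metrics-agent",
		Namespace: "monitoring",
	}}
	fmt.Println(isPlatformPod(pod, ns)) // true: the namespace carries the platform label
}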

View File

@ -0,0 +1,50 @@
From 75eff86c66aa8fac96b7d102d339c1a50b0ad205 Mon Sep 17 00:00:00 2001
From: Jim Gauld <James.Gauld@windriver.com>
Date: Fri, 11 Feb 2022 11:06:35 -0500
Subject: [PATCH] kubelet: sort isolcpus allocation when SMT enabled
The existing device manager code returns CPUs as devices in unsorted
order. This numerically sorts isolcpus allocations when SMT/HT is
enabled on the host. This logs SMT pairs, singletons, and algorithm
order details to make the algorithm understandable.
Signed-off-by: Jim Gauld <James.Gauld@windriver.com>
---
pkg/kubelet/cm/devicemanager/manager.go | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
index 8a488a9292f..6a6e1195bcf 100644
--- a/pkg/kubelet/cm/devicemanager/manager.go
+++ b/pkg/kubelet/cm/devicemanager/manager.go
@@ -568,7 +568,16 @@ func order_devices_by_sibling(devices sets.Set[string], needed int) ([]string, e
return cpu_lst[0]
}
}
+	//Make post-analysis of the selection algorithm obvious by numerically sorting
+	//the available isolated cpu_ids.
+ cpu_ids := make([]int, 0, int(devices.Len()))
for cpu_id := range devices {
+ cpu_id_, _ := strconv.Atoi(cpu_id)
+ cpu_ids = append(cpu_ids, cpu_id_)
+ }
+ sort.Ints(cpu_ids)
+ for _, _cpu_id := range cpu_ids {
+ cpu_id := strconv.Itoa(_cpu_id)
// If we've already found cpu_id as a sibling, skip it.
if _, ok := _iterated_cpu[cpu_id]; ok {
continue
@@ -610,7 +619,9 @@ func order_devices_by_sibling(devices sets.Set[string], needed int) ([]string, e
}
}
}
- //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst)
+ //This algorithm will get some attention. Show minimal details.
+ klog.Infof("order_devices_by_sibling: needed=%d, smtpairs=%v, singletons=%v, order=%v",
+ needed, sibling_lst, single_lst, dev_lst)
return dev_lst, nil
}
func smt_enabled() bool {
--
2.25.1
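
For reference, a standalone sketch of the numerical sort this patch adds. The device IDs are illustrative; in the real code they come from a sets.Set[string], whose iteration order is unspecified.

package main

import (
	"fmt"
	"sort"
	"strconv"
)

func main() {
	// Isolated CPU device IDs arrive as strings in effectively random order.
	devices := []string{"11", "3", "10", "2", "26", "27"}

	// Convert to ints, sort numerically, then convert back, mirroring the
	// cpu_ids loop added to order_devices_by_sibling by the patch.
	ids := make([]int, 0, len(devices))
	for _, d := range devices {
		id, _ := strconv.Atoi(d)
		ids = append(ids, id)
	}
	sort.Ints(ids)

	ordered := make([]string, 0, len(ids))
	for _, id := range ids {
		ordered = append(ordered, strconv.Itoa(id))
	}
	fmt.Println(ordered) // [2 3 10 11 26 27]
}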

View File

@ -0,0 +1,150 @@
From 67797d74a1f57e0983ca9457ac8954406bf8b183 Mon Sep 17 00:00:00 2001
From: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
Date: Wed, 8 Jan 2025 08:27:30 -0500
Subject: [PATCH] kubernetes: make isolcpus allocation SMT-aware
Enhance isolcpus support in Kubernetes to allocate isolated SMT
siblings to the same container when SMT/HT is enabled on the host.
As it stands, the device manager code in Kubernetes is not SMT-aware
(since normally it doesn't deal with CPUs). However, StarlingX
exposes isolated CPUs as devices and if possible we want to allocate
all SMT siblings from a CPU core to the same container in order to
minimize cross- container interference due to resource contention
within the CPU core.
The solution is basically to take the list of isolated CPUs and
re-order it so that the SMT siblings are next to each other. That
way the existing resource selection code will allocate the siblings
together. As an optimization, if it is known that an odd number
of isolated CPUs is desired, a singleton SMT sibling will be
placed at the front of the list to avoid breaking up sibling pairs.
Signed-off-by: Tao Wang <tao.wang@windriver.com>
Signed-off-by: Ramesh Kumar Sivanandam <rameshkumar.sivanandam@windriver.com>
Signed-off-by: Boovan Rajendran <boovan.rajendran@windriver.com>
Signed-off-by: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
---
pkg/kubelet/cm/devicemanager/manager.go | 83 ++++++++++++++++++++++++-
1 file changed, 82 insertions(+), 1 deletion(-)
diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
index f1d04e97179..8a488a9292f 100644
--- a/pkg/kubelet/cm/devicemanager/manager.go
+++ b/pkg/kubelet/cm/devicemanager/manager.go
@@ -23,6 +23,8 @@ import (
"path/filepath"
"runtime"
"sort"
+ "strconv"
+ "strings"
"sync"
"time"
@@ -46,6 +48,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
"k8s.io/kubernetes/pkg/kubelet/types"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
+ "k8s.io/utils/cpuset"
)
const nodeWithoutTopology = -1
@@ -550,6 +553,75 @@ func (m *ManagerImpl) UpdateAllocatedDevices() {
m.allocatedDevices = m.podDevices.devices()
}
+// Given a list of isolated CPUs in 'devices', and the number of desired CPUs in 'needed',
+// return an ordered list of isolated CPUs such that the first 'needed' CPUs in the list
+// contain as many hyperthread sibling pairs as possible.
+func order_devices_by_sibling(devices sets.Set[string], needed int) ([]string, error) {
+ var dev_lst []string
+ var single_lst []string
+ sibling_lst := make([]string, 0, int(devices.Len()))
+ _iterated_cpu := make(map[string]string)
+ get_sibling := func(cpu string, cpu_lst []string) string {
+ if cpu_lst[0] == cpu {
+ return cpu_lst[1]
+ } else {
+ return cpu_lst[0]
+ }
+ }
+ for cpu_id := range devices {
+ // If we've already found cpu_id as a sibling, skip it.
+ if _, ok := _iterated_cpu[cpu_id]; ok {
+ continue
+ }
+ devPath := fmt.Sprintf("/sys/devices/system/cpu/cpu%s/topology/thread_siblings_list", cpu_id)
+ dat, err := os.ReadFile(devPath)
+ if err != nil {
+ return dev_lst, fmt.Errorf("Can't read cpu[%s] thread_siblings_list", cpu_id)
+ }
+ cpustring := strings.TrimSuffix(string(dat), "\n")
+ cpu_pair_set, err := cpuset.Parse(cpustring)
+ if err != nil {
+ return dev_lst, fmt.Errorf("Unable to parse thread_siblings_list[%s] string to cpuset", cpustring)
+ }
+ var cpu_pair_lst []string
+ for _, v := range cpu_pair_set.List() {
+ cpu_pair_lst = append(cpu_pair_lst, strconv.Itoa(v))
+ }
+ sibling_cpu_id := get_sibling(cpu_id, cpu_pair_lst)
+ if _, ok := devices[sibling_cpu_id]; ok {
+ sibling_lst = append(sibling_lst, cpu_id, sibling_cpu_id)
+ _iterated_cpu[sibling_cpu_id] = ""
+ } else {
+ single_lst = append(single_lst, cpu_id)
+ }
+ _iterated_cpu[cpu_id] = ""
+ }
+ if needed%2 == 0 {
+ dev_lst = append(sibling_lst, single_lst...)
+ } else {
+ if len(single_lst) > 1 {
+ _tmp_list := append(sibling_lst, single_lst[1:]...)
+ dev_lst = append(single_lst[0:1], _tmp_list...)
+ } else {
+ if len(single_lst) == 0 {
+ dev_lst = sibling_lst
+ } else {
+ dev_lst = append(single_lst, sibling_lst...)
+ }
+ }
+ }
+ //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst)
+ return dev_lst, nil
+}
+func smt_enabled() bool {
+ dat, _ := os.ReadFile("/sys/devices/system/cpu/smt/active")
+ state := strings.TrimSuffix(string(dat), "\n")
+ if state == "0" {
+ return false
+ }
+ return true
+}
+
// Returns list of device Ids we need to allocate with Allocate rpc call.
// Returns empty list in case we don't need to issue the Allocate rpc call.
func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.Set[string]) (sets.Set[string], error) {
@@ -630,7 +702,16 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
if m.allocatedDevices[resource] == nil {
m.allocatedDevices[resource] = sets.New[string]()
}
- for device := range devices.Difference(allocated) {
+ availableDevices := sets.List[string](devices.Difference(allocated))
+ // If we're dealing with isolcpus and SMT is enabled, reorder to group SMT siblings together.
+ if resource == "windriver.com/isolcpus" && len(devices) > 0 && smt_enabled() {
+ var err error
+ availableDevices, err = order_devices_by_sibling(devices.Difference(allocated), needed)
+ if err != nil {
+ klog.Errorf("error in order_devices_by_sibling: %v", err)
+ }
+ }
+ for _, device := range availableDevices {
m.allocatedDevices[resource].Insert(device)
allocated.Insert(device)
needed--
--
2.25.1
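
For reference, a simplified standalone sketch of the sibling-grouping reorder described above. The siblingOf map stands in for reading /sys/devices/system/cpu/cpuN/topology/thread_siblings_list, and all CPU numbers are illustrative; the real order_devices_by_sibling is additionally gated on the windriver.com/isolcpus resource name and smt_enabled().

package main

import "fmt"

// reorderBySibling groups SMT sibling pairs at the front of the returned
// slice; for an odd request it places one singleton first so no pair needs
// to be split.
func reorderBySibling(devices []string, siblingOf map[string]string, needed int) []string {
	inPool := make(map[string]bool, len(devices))
	for _, d := range devices {
		inPool[d] = true
	}
	seen := map[string]bool{}
	var pairs, singles []string
	for _, cpu := range devices {
		if seen[cpu] {
			continue
		}
		seen[cpu] = true
		sib := siblingOf[cpu]
		if inPool[sib] && !seen[sib] {
			seen[sib] = true
			pairs = append(pairs, cpu, sib)
		} else {
			singles = append(singles, cpu)
		}
	}
	if needed%2 != 0 && len(singles) > 0 {
		out := make([]string, 0, len(devices))
		out = append(out, singles[0])
		out = append(out, pairs...)
		out = append(out, singles[1:]...)
		return out
	}
	return append(pairs, singles...)
}

func main() {
	siblingOf := map[string]string{"4": "12", "12": "4", "5": "13", "13": "5", "6": "14", "14": "6"}
	// CPU 13 is not in the isolated pool, so CPU 5 ends up as a singleton.
	devices := []string{"4", "5", "6", "12", "14"}
	fmt.Println(reorderBySibling(devices, siblingOf, 3)) // [5 4 12 6 14]
	fmt.Println(reorderBySibling(devices, siblingOf, 4)) // [4 12 6 14 5]
}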

View File

@ -0,0 +1,10 @@
kubeadm-create-platform-pods-with-zero-CPU-resources.patch
kubernetes-make-isolcpus-allocation-SMT-aware.patch
kubelet-sort-isolcpus-allocation-when-SMT-enabled.patch
kubelet-cpumanager-disable-CFS-quota-throttling.patch
kubelet-cpumanager-keep-normal-containers-off-reserv.patch
kubelet-cpumanager-platform-pods-on-reserved-cpus.patch
kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch
kubelet-CFS-quota-throttling-for-non-integer-cpulimit.patch
kubeadm-reduce-UpgradeManifestTimeout.patch
kubeadm-readiness-probe-timeout-core-dns.patch