Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/GoogleCloudPlatform/container-engine-accelerators
go 1.18

require (
github.com/NVIDIA/go-nvml v0.11.6-0.0.20220823120812-7e2082095e82
github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20201006233419-a544dbcaacb0
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b
github.com/google/go-cmp v0.5.6
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ github.com/Microsoft/go-winio v0.4.14/go.mod h1:qXqCSQ3Xa7+6tgxaGTIe4Kpcdsi+P8jB
github.com/Microsoft/go-winio v0.4.15-0.20190919025122-fc70bd9a86b5/go.mod h1:tTuCMEN+UleMWgg9dVx4Hu52b1bJo+59jBh3ajtinzw=
github.com/Microsoft/hcsshim v0.0.0-20190417211021-672e52e9209d/go.mod h1:Op3hHsoHPAvb6lceZHDtd9OkTew38wNoXnJs8iY7rUg=
github.com/Microsoft/hcsshim v0.8.10-0.20200715222032-5eafd1556990/go.mod h1:ay/0dTb7NsG8QMDfsRfLHgZo/6xAJShLe1+ePPflihk=
github.com/NVIDIA/go-nvml v0.11.6-0.0.20220823120812-7e2082095e82 h1:x751Xx1tdxkiA/sdkv2J769n21UbYKzVOpe9S/h1M3k=
github.com/NVIDIA/go-nvml v0.11.6-0.0.20220823120812-7e2082095e82/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20201006233419-a544dbcaacb0 h1:IATa/osmUz3ATg6qSzinLPqlIHBrnFjzlqEiGoqc9xI=
github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20201006233419-a544dbcaacb0/go.mod h1:l0Cq257MSJMvg9URCXUjc8pgKY2SK1oSvIx6qG0bzzc=
github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ=
Expand Down
108 changes: 72 additions & 36 deletions partition_gpu/partition_gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"strings"
"syscall"

"github.com/NVIDIA/go-nvml/pkg/nvml"
"github.com/golang/glog"
)

Expand All @@ -32,43 +33,22 @@ var (
gpuConfigFile = flag.String("gpu-config", "/etc/nvidia/gpu_config.json", "File with GPU configurations for device plugin")
)

// partitionSizeToProfileID maps a GPU partition size name (for example
// "1g.5gb") to the profile ID, stored as a decimal string, that
// buildPartitionStr joins into a comma-separated partition list.
// The 40GB and 80GB A100 entries with the same slice count share an ID.
var partitionSizeToProfileID = map[string]string{
// nvidia-tesla-a100 (40GB) partition sizes.
"1g.5gb": "19",
"2g.10gb": "14",
"3g.20gb": "9",
"4g.20gb": "5",
"7g.40gb": "0",
// nvidia-a100-80gb partition sizes.
"1g.10gb": "19",
"2g.20gb": "14",
"3g.40gb": "9",
"4g.40gb": "5",
"7g.80gb": "0",
}

// partitionSizeMaxCount maps a GPU partition size name to the number of
// times buildPartitionStr repeats that size's profile ID in the partition
// list, i.e. how many partitions of that size are requested per GPU.
var partitionSizeMaxCount = map[string]int{
// nvidia-tesla-a100 (40GB) partition sizes.
"1g.5gb": 7,
"2g.10gb": 3,
"3g.20gb": 2,
"4g.20gb": 1,
"7g.40gb": 1,
// nvidia-a100-80gb partition sizes.
"1g.10gb": 7,
"2g.20gb": 3,
"3g.40gb": 2,
"4g.40gb": 1,
"7g.80gb": 1,
}

// SIGRTMIN is a hard-coded signal number.
// NOTE(review): 34 presumably corresponds to the first real-time signal as
// exposed by glibc on Linux; its use is not visible in this section — confirm.
const SIGRTMIN = 34

// GPUConfig stores the settings used to configure the GPUs on a node.
type GPUConfig struct {
// GPUPartitionSize is the requested partition size.
// NOTE(review): presumably a profile name such as "1g.5gb" that is passed
// on to createGPUPartitions — confirm against main and parseGPUConfig.
GPUPartitionSize string
}

// GPUAvailableProfiles holds the GPU instance profiles discovered for a
// single GPU, as returned by ListGpuAvailableProfiles.
type GPUAvailableProfiles struct {
// byname indexes profiles by their name with the first "MIG " occurrence
// removed (for example "1g.5gb"); buildPartitionStr looks partition sizes
// up in this map.
byname map[string]GPUProfile
}

// GPUProfile describes one GPU instance profile of a GPU.
type GPUProfile struct {
// id is the profile ID reported by NVML; buildPartitionStr emits it once
// per requested partition.
id int
// instances_total is the instance count reported by NVML for the profile;
// buildPartitionStr uses it as the number of partitions to request.
instances_total int
}

func main() {
flag.Parse()

Expand Down Expand Up @@ -135,6 +115,55 @@ func main() {

}

// _nvmlStrToString converts a fixed-size character array, as found in NVML
// response structs, into a Go string. Conversion stops at the first zero
// element; if there is none, all 96 elements are used. Each int8 is
// reinterpreted as a raw byte.
func _nvmlStrToString(rawstr [96]int8) string {
	buf := make([]byte, 0, len(rawstr))
	for i := 0; i < len(rawstr); i++ {
		if rawstr[i] == 0 {
			break
		}
		buf = append(buf, byte(rawstr[i]))
	}
	return string(buf)
}

// ListGpuAvailableProfiles queries NVML for the GPU instance profiles
// supported by the GPU at gpu_index and returns them keyed by profile name.
//
// NVML reports names such as "MIG 1g.5gb"; the first "MIG " occurrence is
// removed so that the keys match the partition size names used elsewhere in
// this package (for example "1g.5gb").
//
// Profiles for which NVML returns ERROR_NOT_SUPPORTED are skipped. Any other
// NVML failure, including a failure to initialize the library, is returned
// as an error together with an empty result; the function never terminates
// the process itself, so the caller decides how to react.
func ListGpuAvailableProfiles(gpu_index int) (GPUAvailableProfiles, error) {
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		return GPUAvailableProfiles{}, fmt.Errorf("failed to initialize nvml: %v", nvml.ErrorString(ret))
	}
	defer func() {
		// A failed shutdown does not invalidate the collected profiles, so it
		// is only logged.
		if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
			glog.Warningf("failed to shut down nvml: %v", nvml.ErrorString(ret))
		}
	}()

	device, ret := nvml.DeviceGetHandleByIndex(gpu_index)
	if ret != nvml.SUCCESS {
		return GPUAvailableProfiles{}, fmt.Errorf("error getting device info: %v", nvml.ErrorString(ret))
	}

	profiles := GPUAvailableProfiles{byname: make(map[string]GPUProfile)}

	// Probe every profile index known to the NVML bindings; the GPU reports
	// the ones it does not offer as "not supported".
	for profileID := nvml.GPU_INSTANCE_PROFILE_1_SLICE; profileID < nvml.GPU_INSTANCE_PROFILE_COUNT; profileID++ {
		profileV := nvml.DeviceGetGpuInstanceProfileInfoV(device, profileID)
		profile, ret := profileV.V2()
		if ret == nvml.ERROR_NOT_SUPPORTED {
			continue
		}
		if ret != nvml.SUCCESS {
			return GPUAvailableProfiles{}, fmt.Errorf("error getting profile info: %s", nvml.ErrorString(ret))
		}

		profileName := strings.Replace(_nvmlStrToString(profile.Name), "MIG ", "", 1)
		profiles.byname[profileName] = GPUProfile{
			id:              int(profile.Id),
			instances_total: int(profile.InstanceCount),
		}
		glog.Infof("profile: gpu: %v, name: %-12.12s, id: %3v, instances total: %2v",
			gpu_index, profileName, profile.Id, profile.InstanceCount)
	}

	return profiles, nil
}


func parseGPUConfig(gpuConfigFile string) (GPUConfig, error) {
var gpuConfig GPUConfig

Expand Down Expand Up @@ -194,7 +223,14 @@ func cleanupAllGPUPartitions() error {
}

func createGPUPartitions(partitionSize string) error {
p, err := buildPartitionStr(partitionSize)
// currently only single-gpu systems are supported
gpu_index := 0
profiles, err := ListGpuAvailableProfiles(gpu_index)
if err != nil {
return err
}

p, err := buildPartitionStr(partitionSize, profiles)
if err != nil {
return err
}
Expand All @@ -219,19 +255,19 @@ func createGPUPartitions(partitionSize string) error {

}

func buildPartitionStr(partitionSize string) (string, error) {
func buildPartitionStr(partitionSize string, profiles GPUAvailableProfiles) (string, error) {
if partitionSize == "" {
return "", nil
}

p, ok := partitionSizeToProfileID[partitionSize]
p, ok := profiles.byname[partitionSize]
if !ok {
return "", fmt.Errorf("%s is not a valid partition size", partitionSize)
}

partitionStr := p
for i := 1; i < partitionSizeMaxCount[partitionSize]; i++ {
partitionStr += fmt.Sprintf(",%s", p)
partitionStr := fmt.Sprint(p.id)
for i := 1; i < p.instances_total; i++ {
partitionStr += fmt.Sprintf(",%d", p.id)
}

return partitionStr, nil
Expand Down
27 changes: 26 additions & 1 deletion partition_gpu/partition_gpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,31 @@ package main

import "testing"

// PROFILES_A100 is a test fixture with the profiles of a 40GB A100, used in
// place of a live NVML query. The IDs and instance counts mirror the
// nvidia-tesla-a100 values that partition_gpu.go previously hard-coded.
var PROFILES_A100 = GPUAvailableProfiles{
	byname: map[string]GPUProfile{
		"1g.5gb": {
			id:              19,
			instances_total: 7,
		},
		"2g.10gb": {
			id:              14,
			instances_total: 3,
		},
		"3g.20gb": {
			id:              9,
			instances_total: 2,
		},
		"4g.20gb": {
			id:              5,
			instances_total: 1,
		},
		"7g.40gb": {
			id:              0,
			instances_total: 1,
		},
	},
}

func Test_buildPartitionStr(t *testing.T) {
tests := []struct {
name string
Expand Down Expand Up @@ -50,7 +75,7 @@ func Test_buildPartitionStr(t *testing.T) {
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := buildPartitionStr(tt.partitionSize)
got, err := buildPartitionStr(tt.partitionSize, PROFILES_A100)
if (err != nil) != tt.wantErr {
t.Errorf("buildPartitionStr() error = %v, wantErr %v", err, tt.wantErr)
return
Expand Down
24 changes: 0 additions & 24 deletions pkg/gpu/nvidia/mig/mig.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,21 +28,6 @@ import (

const nvidiaDeviceRE = `^nvidia[0-9]*$`

// gpuPartitionSizeMaxCount is the maximum number of GPU partitions that can
// be created for each partition size. DeviceManager.Start uses it both to
// validate the requested partition size and to check the number of
// partitions found per GPU.
// Source: https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#partitioning
//
// NOTE(review): unlike partitionSizeMaxCount in partition_gpu.go, this table
// has no "4g.20gb" / "4g.40gb" entries, so Start rejects those sizes.
var gpuPartitionSizeMaxCount = map[string]int{
// nvidia-tesla-a100 (40GB) partition sizes.
"1g.5gb": 7,
"2g.10gb": 3,
"3g.20gb": 2,
"7g.40gb": 1,
// nvidia-a100-80gb partition sizes.
"1g.10gb": 7,
"2g.20gb": 3,
"3g.40gb": 2,
"7g.80gb": 1,
}

// DeviceManager performs various management operations on mig devices.
type DeviceManager struct {
devDirectory string
Expand Down Expand Up @@ -83,11 +68,6 @@ func (d *DeviceManager) Start(partitionSize string) error {
return nil
}

maxPartitionCount, ok := gpuPartitionSizeMaxCount[partitionSize]
if !ok {
return fmt.Errorf("%s is not a valid GPU partition size", partitionSize)
}

d.gpuPartitionSpecs = make(map[string][]pluginapi.DeviceSpec)

nvidiaCapDir := path.Join(d.procDirectory, "driver/nvidia/capabilities")
Expand Down Expand Up @@ -192,10 +172,6 @@ func (d *DeviceManager) Start(partitionSize string) error {
}
d.gpuPartitions[gpuInstanceID] = pluginapi.Device{ID: gpuInstanceID, Health: pluginapi.Healthy}
}

if numPartitions != maxPartitionCount {
return fmt.Errorf("Number of partitions (%d) for GPU %s does not match expected partition count (%d)", numPartitions, gpuID, maxPartitionCount)
}
}

numGPUs, err := d.discoverNumGPUs()
Expand Down
Loading