Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 64 additions & 6 deletions Gopkg.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ all: presubmit

test:
@echo ">> running tests"
@$(GO) test -short -race $(pkgs)
@$(GO) test -short -race $(pkgs) -v

format:
@echo ">> formatting code"
Expand Down
66 changes: 53 additions & 13 deletions cmd/nvidia_gpu/nvidia_gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,17 @@ package main
import (
"flag"
"fmt"
"io/ioutil"
"net"
"os"
"path"
"regexp"
"strconv"
"strings"
"sync"
"time"

"github.com/golang/glog"
"github.com/mindprince/gonvml"
"golang.org/x/net/context"
"google.golang.org/grpc"

Expand Down Expand Up @@ -69,26 +71,61 @@ func NewNvidiaGPUManager() *nvidiaGPUManager {

// discoverGPUs records the NVIDIA GPU devices available on the local node in
// ngm.devices, keyed by device name and marked pluginapi.Healthy.
//
// NOTE(review): this span is a rendered diff, so lines from two versions of
// the function appear interleaved without +/- markers. The lines that use
// reg, files and f list devDirectory and keep the entries whose names match
// nvidiaDeviceRE. The lines that use gonvml initialize the library, iterate
// over gonvml.DeviceCount() devices and name each one "nvidia<minor number>"
// from dev.MinorNumber(). Confirm against the real file which set of lines is
// current before changing any code here.
//
// Any error from the directory listing or from a gonvml call is returned to
// the caller unchanged.
func (ngm *nvidiaGPUManager) discoverGPUs() error {
reg := regexp.MustCompile(nvidiaDeviceRE)
files, err := ioutil.ReadDir(devDirectory)

err := gonvml.Initialize()
if err != nil {
return err
}
deviceCount, err := gonvml.DeviceCount()
if err != nil {
return err
}
for _, f := range files {
if f.IsDir() {
continue
for i := 0; i < int(deviceCount); i++ {
dev, err := gonvml.DeviceHandleByIndex(uint(i))
if err != nil {
glog.Errorf("\tDeviceHandleByIndex() error: %v\n", err)
return err
}
if reg.MatchString(f.Name()) {
glog.Infof("Found Nvidia GPU %q\n", f.Name())
ngm.devices[f.Name()] = pluginapi.Device{f.Name(), pluginapi.Healthy}

minorNumber, err := dev.MinorNumber()
if err != nil {
glog.Errorf("\tdev.MinorNumber() error: %v\n", err)
return err
}
devName := fmt.Sprintf("nvidia%d", uint(minorNumber))
glog.Infof("Found Nvidia GPU %q\n", devName)
ngm.devices[devName] = pluginapi.Device{devName, pluginapi.Healthy}
}
return nil
}

func (ngm *nvidiaGPUManager) GetDeviceState(DeviceName string) string {
// TODO: calling Nvidia tools to figure out actual device state
return pluginapi.Healthy
// convertToMinorNumber extracts the minor number from a device name of the
// form "nvidia<N>"; for example "nvidia0" yields 0.
//
// An error is returned, together with 0, when DeviceName does not start with
// "nvidia" or when the remainder is not a base-10 unsigned integer that fits
// in a uint.
//
// TODO: This would not be necessary if we could store this value in a deviceplugin
// field like actualDevice
func convertToMinorNumber(DeviceName string) (uint, error) {
	const prefix = "nvidia"
	// TrimPrefix alone would let a bare number such as "7" through, so the
	// prefix is checked explicitly before it is stripped.
	if !strings.HasPrefix(DeviceName, prefix) {
		return 0, fmt.Errorf("device name %q does not start with %q", DeviceName, prefix)
	}
	// bitSize 0 limits the parsed value to the platform's uint width, so the
	// conversion below cannot silently truncate on 32-bit builds.
	minorNumber, err := strconv.ParseUint(DeviceName[len(prefix):], 10, 0)
	if err != nil {
		return 0, err
	}
	return uint(minorNumber), nil
}

// TODO: Consider modifying ListandWatch to concurrently check state
func (ngm *nvidiaGPUManager) GetDeviceState(DeviceName string) (string, error) {
reg := regexp.MustCompile(nvidiaDeviceRE)
if reg.MatchString(DeviceName) {
minorNumber, err := convertToMinorNumber(DeviceName)
if err != nil {
return pluginapi.Unhealthy, err
}
_, err = gonvml.DeviceHandleByIndex(minorNumber)
if err != nil {
glog.Errorf("\tCould not find GPU device %v\n", DeviceName)
return pluginapi.Unhealthy, err
}
return pluginapi.Healthy, nil
}
return pluginapi.Healthy, nil
}

// Discovers Nvidia GPU devices and sets up device access environment.
Expand Down Expand Up @@ -144,7 +181,10 @@ func (ngm *nvidiaGPUManager) ListAndWatch(emtpy *pluginapi.Empty, stream plugina
changed := true
for {
for id, dev := range ngm.devices {
state := ngm.GetDeviceState(id)
state, err := ngm.GetDeviceState(id)
if err != nil {
return err
}
if dev.Health != state {
changed = true
dev.Health = state
Expand Down
28 changes: 28 additions & 0 deletions cmd/nvidia_gpu/nvidia_gpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,3 +157,31 @@ func TestNvidiaGPUManager(t *testing.T) {
as.Nil(resp)
as.NotNil(err)
}

// TestConvertToMinorNumber checks that a well-formed device name yields its
// minor number and that malformed names yield an error with a zero result.
func TestConvertToMinorNumber(t *testing.T) {
	as := assert.New(t)

	validUint, err := convertToMinorNumber("nvidia1")
	as.Equal(uint(1), validUint)
	as.Nil(err)

	// On failure the function returns 0 alongside the error; asserting the
	// exact value is stricter than merely checking it is not 1.
	invalidUint, err := convertToMinorNumber("dev")
	as.Equal(uint(0), invalidUint)
	as.NotNil(err)

	// A name with the prefix but no digits has nothing to parse.
	emptySuffix, err := convertToMinorNumber("nvidia")
	as.Equal(uint(0), emptySuffix)
	as.NotNil(err)
}

// TestGetDeviceState checks that GetDeviceState reports a present device as
// healthy and a missing device as unhealthy with an error.
//
// NOTE(review): the healthy case appears to need a machine where "nvidia0"
// actually exists and the GPU library is usable — confirm how this is
// expected to run in CI.
func TestGetDeviceState(t *testing.T) {
	// Expects a valid GPUManager to be created.
	testGpuManager := NewNvidiaGPUManager()
	as := assert.New(t)
	as.NotNil(testGpuManager)

	// assert.Equal takes (expected, actual); keeping that order makes the
	// failure output label the two values correctly.
	healthyDevice := "nvidia0"
	devStatus, err := testGpuManager.GetDeviceState(healthyDevice)
	as.Equal(pluginapi.Healthy, devStatus)
	as.Nil(err)

	// This isn't technically an unhealthy device, but the testing system
	// will most likely not have 10 GPUs
	unhealthyDevice := "nvidia9"
	devStatus, err = testGpuManager.GetDeviceState(unhealthyDevice)
	as.Equal(pluginapi.Unhealthy, devStatus)
	as.NotNil(err)
}
Loading