diff --git a/Makefile b/Makefile
index d8dc0c3c0c..237a0cd8ea 100644
--- a/Makefile
+++ b/Makefile
@@ -239,6 +239,10 @@ ifneq (${BUILD_OIIOUTIL_ONLY},)
 MY_CMAKE_FLAGS += -DBUILD_OIIOUTIL_ONLY:BOOL=${BUILD_OIIOUTIL_ONLY}
 endif
 
+ifneq (${USE_CUDA},)
+MY_CMAKE_FLAGS += -DUSE_CUDA:BOOL=${USE_CUDA}
+endif
+
 ifdef DEBUG
 MY_CMAKE_FLAGS += -DCMAKE_BUILD_TYPE:STRING=Debug
 endif
@@ -510,6 +514,7 @@ help:
 	@echo "      USE_OPENCV=0             Skip anything that needs OpenCV"
 	@echo "      USE_PTEX=0               Skip anything that needs PTex"
 	@echo "      USE_FREETYPE=0           Skip anything that needs Freetype"
+	@echo "      USE_CUDA=1               Build NVIDIA CUDA support (if found)"
 	@echo "  OIIO build-time options:"
 	@echo "      INSTALL_PREFIX=path      Set installation prefix (default: ./${INSTALL_PREFIX_BRIEF})"
 	@echo "      NAMESPACE=name           Override namespace base name (default: OpenImageIO)"
diff --git a/site/spi/Makefile-bits-arnold b/site/spi/Makefile-bits-arnold
index 8b7fa11f41..86bad3f4ee 100644
--- a/site/spi/Makefile-bits-arnold
+++ b/site/spi/Makefile-bits-arnold
@@ -120,6 +120,11 @@ ifeq (${SP_OS}, rhel7)
 	    -DOPENEXR_CUSTOM_LIB_DIR=/usr/lib64/OpenEXR2
     endif
 
+    # CUDA customizations
+    MY_CMAKE_FLAGS += \
+	-DCUDA_TOOLKIT_ROOT_DIR=/net/soft_scratch/apps/arnold/tools/nvidia/cuda9.1 \
+	-DCUDA_HOST_COMPILER=/shots/spi/home/lib/arnold/rhel7/llvm_4.0_final/bin/clang++
+
     MY_CMAKE_FLAGS += \
 	-DOCIO_PATH=${OCIO_PATH} \
 	-DFIELD3D_HOME=${FIELD3D_HOME} \
diff --git a/src/cmake/compiler.cmake b/src/cmake/compiler.cmake
index 33c28aac59..618807693e 100644
--- a/src/cmake/compiler.cmake
+++ b/src/cmake/compiler.cmake
@@ -22,6 +22,8 @@ set (CLANG_TIDY_CHECKS "-*" CACHE STRING "clang-tidy checks to perform")
 set (CLANG_TIDY_ARGS "" CACHE STRING "clang-tidy args")
 option (CLANG_TIDY_FIX "Have clang-tidy fix source" OFF)
 set (GLIBCXX_USE_CXX11_ABI "" CACHE STRING "For gcc, use the new C++11 library ABI (0|1)")
+option (USE_CUDA "Use CUDA if found" OFF)
+set (CUDA_TARGET_ARCH "sm_30" CACHE STRING "CUDA GPU architecture (e.g. sm_35)")
 
 # Figure out which compiler we're using
 if (CMAKE_COMPILER_IS_GNUCC)
diff --git a/src/cmake/externalpackages.cmake b/src/cmake/externalpackages.cmake
index 05e9bcb0a9..711549b83a 100644
--- a/src/cmake/externalpackages.cmake
+++ b/src/cmake/externalpackages.cmake
@@ -4,6 +4,7 @@
 # When not in VERBOSE mode, try to make things as quiet as possible
 if (NOT VERBOSE)
     set (Boost_FIND_QUIETLY true)
+    set (CUDA_FIND_QUIETLY true)
     set (DCMTK_FIND_QUIETLY true)
     set (FFmpeg_FIND_QUIETLY true)
     set (Field3D_FIND_QUIETLY true)
@@ -624,4 +625,27 @@ endmacro()
 
 
 ###########################################################################
+if (USE_CUDA)
+    if (NOT CUDA_TOOLKIT_ROOT_DIR AND NOT $ENV{CUDA_TOOLKIT_ROOT_DIR} STREQUAL "")
+        set (CUDA_TOOLKIT_ROOT_DIR $ENV{CUDA_TOOLKIT_ROOT_DIR})
+    endif ()
+    if (NOT CUDA_FIND_QUIETLY)
+        message (STATUS "CUDA_TOOLKIT_ROOT_DIR = ${CUDA_TOOLKIT_ROOT_DIR}")
+    endif ()
+    set (CUDA_PROPAGATE_HOST_FLAGS ON)
+    set (CUDA_VERBOSE_BUILD ${VERBOSE})
+    find_package (CUDA 7.0 REQUIRED)
+    list (APPEND CUDA_NVCC_FLAGS ${CSTD_FLAGS} -expt-relaxed-constexpr)
+    set (CUDA_INCLUDE_DIR ${CUDA_TOOLKIT_ROOT_DIR}/include)
+    message (STATUS "CUDA version = ${CUDA_VERSION}")
+    if (NOT CUDA_FIND_QUIETLY)
+        message (STATUS "CUDA includes  = ${CUDA_INCLUDE_DIR}")
+        message (STATUS "CUDA libraries = ${CUDA_LIBRARIES}")
+        message (STATUS "CUDA host compiler = ${CUDA_HOST_COMPILER}")
+        message (STATUS "CUDA nvcc flags = ${CUDA_NVCC_FLAGS}")
+    endif ()
+endif ()
+
+# end Cuda
+###########################################################################
 
diff --git a/src/include/OpenImageIO/imagebuf.h b/src/include/OpenImageIO/imagebuf.h
index c35b6c7092..a758e35418 100644
--- a/src/include/OpenImageIO/imagebuf.h
+++ b/src/include/OpenImageIO/imagebuf.h
@@ -183,6 +183,9 @@ class OIIO_API ImageBuf {
     /// Which type of storage is being used for the pixels?
     IBStorage storage () const;
 
+    /// Is the pixel memory of this ImageBuf visible to Cuda?
+    bool cuda_storage () const;
+
     /// Is this ImageBuf object initialized?
     bool initialized () const;
 
diff --git a/src/include/OpenImageIO/platform.h b/src/include/OpenImageIO/platform.h
index e4eac911e9..20c03c6873 100644
--- a/src/include/OpenImageIO/platform.h
+++ b/src/include/OpenImageIO/platform.h
@@ -314,8 +314,10 @@
 // OIIO_HOSTDEVICE is used to supply the function decorators needed when
 // compiling for CUDA devices.
 #ifdef __CUDACC__
+#  define OIIO_HOST __host__
 #  define OIIO_HOSTDEVICE __host__ __device__
 #else
+#  define OIIO_HOST
 #  define OIIO_HOSTDEVICE
 #endif
 
diff --git a/src/libOpenImageIO/CMakeLists.txt b/src/libOpenImageIO/CMakeLists.txt
index 695b9bdcd2..7ba863f33d 100644
--- a/src/libOpenImageIO/CMakeLists.txt
+++ b/src/libOpenImageIO/CMakeLists.txt
@@ -4,6 +4,12 @@ endif ()
 file (TO_NATIVE_PATH "${PLUGIN_SEARCH_PATH}" PLUGIN_SEARCH_PATH_NATIVE)
 configure_file (imageio_pvt.h.in "${CMAKE_BINARY_DIR}/include/imageio_pvt.h" @ONLY)
 
+if (BUILDSTATIC)
+    set (OIIO_LIB_TYPE "STATIC")
+else ()
+    set (OIIO_LIB_TYPE "SHARED")
+endif ()
+
 file (GLOB libOpenImageIO_hdrs ../include/OpenImageIO/*.h)
 
 if (NOT USE_EXTERNAL_PUGIXML)
@@ -47,6 +53,7 @@ set (libOpenImageIO_srcs
                           imageoutput.cpp iptc.cpp xmp.cpp
                           color_ocio.cpp
                           maketexture.cpp
+                          imageio_cuda.cpp
                           ../libutil/argparse.cpp
                           ../libutil/benchmark.cpp
                           ../libutil/errorhandler.cpp 
@@ -73,6 +80,13 @@ set (libOpenImageIO_srcs
                           ${libOpenImageIO_hdrs}
                          )
 
+#set (cuda_using_srcs imagebufalgo_addsub.cpp)
+
+if (USE_CUDA)
+    file (GLOB gpu_source_files "*.cu")
+    message (STATUS "Extra cuda files: ${gpu_source_files}")
+    list (APPEND libOpenImageIO_srcs ${gpu_source_files})
+endif ()
 
 # If the 'EMBEDPLUGINS' option is set, we want to compile the source for
 # all the plugins into libOpenImageIO.
@@ -100,10 +114,12 @@ endif ()
 source_group ("libutil"    REGULAR_EXPRESSION ".+/libutil/.+")
 source_group ("libtexture" REGULAR_EXPRESSION ".+/libtexture/.+")
 
-if (BUILDSTATIC)
-    add_library (OpenImageIO STATIC ${libOpenImageIO_srcs})
+if (USE_CUDA)
+    add_definitions ("-DOIIO_USE_CUDA=1")
+    cuda_add_library (OpenImageIO ${OIIO_LIB_TYPE} ${libOpenImageIO_srcs}
+                      OPTIONS -arch ${CUDA_TARGET_ARCH})
 else ()
-    add_library (OpenImageIO SHARED ${libOpenImageIO_srcs})
+    add_library (OpenImageIO ${OIIO_LIB_TYPE} ${libOpenImageIO_srcs})
 endif ()
 
 # if (SANITIZE AND ${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
@@ -206,7 +222,9 @@ if (EXTRA_DSO_LINK_ARGS)
     set_target_properties (OpenImageIO PROPERTIES LINK_FLAGS ${EXTRA_DSO_LINK_ARGS})
 endif()
 
-oiio_install_targets (OpenImageIO)
+oiio_install_targets (OpenImageIO 
+#${OIIO_CUDA_LIB}
+)
 
 
 # Testing
diff --git a/src/libOpenImageIO/imagebuf.cpp b/src/libOpenImageIO/imagebuf.cpp
index 3102d1c4f6..9f6725cc93 100644
--- a/src/libOpenImageIO/imagebuf.cpp
+++ b/src/libOpenImageIO/imagebuf.cpp
@@ -144,6 +144,7 @@ class ImageBufImpl {
     void append_error (const std::string& message) const;
 
     ImageBuf::IBStorage storage () const { return m_storage; }
+    bool cuda_storage () const { return m_cuda_storage; }
 
     TypeDesc pixeltype () const {
         validate_spec ();
@@ -194,6 +195,7 @@ class ImageBufImpl {
                               m_current_subimage, m_current_miplevel);
     }
 
+    // Make sure the pixels are ready to read with an iterator.
     bool validate_pixels () const {
         if (m_pixels_valid)
             return true;
@@ -259,6 +261,7 @@ class ImageBufImpl {
     mutable spin_mutex m_valid_mutex;
     mutable bool m_spec_valid;   ///< Is the spec valid
     mutable bool m_pixels_valid; ///< Image is valid
+    mutable bool m_cuda_storage = false; ///< Is the pixel memory visible to Cuda?
     bool m_badfile;              ///< File not found
     float m_pixelaspect;         ///< Pixel aspect ratio of the image
     size_t m_pixel_bytes;
@@ -373,6 +376,7 @@ ImageBufImpl::ImageBufImpl (const ImageBufImpl &src)
         // Source had the image fully in memory (no cache)
         if (m_storage == ImageBuf::APPBUFFER) {
             // Source just wrapped the client app's pixels, we do the same
+            m_allocated_size = src.m_localpixels ? src.spec().image_bytes() : 0;
             m_localpixels = src.m_localpixels;
         } else {
             // We own our pixels -- copy from source
@@ -498,7 +502,28 @@ ImageBufImpl::new_pixels (size_t size, const void *data)
     if (m_allocated_size)
         free_pixels();
     m_allocated_size = size;
-    m_pixels.reset (size ? new char [size] : nullptr);
+    m_cuda_storage = false;
+    if (size) {
+#ifdef OIIO_USE_CUDA
+        if (OIIO::get_int_attribute("cuda") && m_spec.format == TypeFloat) {
+            char *cudaptr = (char *)OIIO::pvt::cuda_malloc (m_allocated_size);
+            if (cudaptr) {
+                OIIO::debug ("IB Cuda allocated %p\n", (void*)cudaptr);
+                m_pixels.reset (cudaptr);
+                m_cuda_storage = true;
+            }
+            else {
+                OIIO::debug ("Requested cudaMallocManaged of %s FAILED\n",
+                             m_allocated_size);
+            }
+        }
+#endif
+        if (! m_pixels) {
+            m_pixels.reset (new char [size]);
+        }
+    } else {
+        m_pixels.reset ();
+    }
     IB_local_mem_current += m_allocated_size;
     if (data && size)
         memcpy (m_pixels.get(), data, size);
@@ -515,12 +540,21 @@ void
 ImageBufImpl::free_pixels ()
 {
     IB_local_mem_current -= m_allocated_size;
+#if OIIO_USE_CUDA
+    if (m_cuda_storage) {
+        OIIO::debug ("IB Cuda free %p\n", (void*)m_pixels.get());
+        OIIO::pvt::cuda_free (m_pixels.release());
+        m_cuda_storage = false;
+    }
+#endif
     m_pixels.reset ();
     if (m_allocated_size && pvt::oiio_print_debug > 1)
         OIIO::debug ("IB freed %d MB, global IB memory now %d MB\n",
                      m_allocated_size>>20, IB_local_mem_current>>20);
     m_allocated_size = 0;
     m_storage = ImageBuf::UNINITIALIZED;
+    m_localpixels = nullptr;
+    m_pixels_valid = false;
 }
 
 
@@ -577,6 +611,14 @@ ImageBuf::storage () const
 
 
 
+bool
+ImageBuf::cuda_storage () const
+{
+    return impl()->cuda_storage ();
+}
+
+
+
 void
 ImageBufImpl::clear ()
 {
@@ -588,10 +630,8 @@ ImageBufImpl::clear ()
     m_current_miplevel = -1;
     m_spec = ImageSpec ();
     m_nativespec = ImageSpec ();
-    m_pixels.reset ();
-    m_localpixels = NULL;
+    release_pixels ();
     m_spec_valid = false;
-    m_pixels_valid = false;
     m_badfile = false;
     m_pixelaspect = 1;
     m_pixel_bytes = 0;
@@ -700,8 +740,6 @@ ImageBufImpl::realloc ()
     m_channel_bytes = m_spec.format.size();
     m_blackpixel.resize (round_to_multiple (m_pixel_bytes, OIIO_SIMD_MAX_SIZE_BYTES), 0);
     // NB make it big enough for SSE
-    if (m_allocated_size)
-        m_pixels_valid = true;
     if (m_spec.deep) {
         m_deepdata.init (m_spec);
         m_storage = ImageBuf::LOCALBUFFER;
@@ -2440,5 +2478,4 @@ ImageBuf::retile (int x, int y, int z, ImageCache::Tile* &tile,
 }
 
 
-
 OIIO_NAMESPACE_END
diff --git a/src/libOpenImageIO/imagebufalgo_addsub.cpp b/src/libOpenImageIO/imagebufalgo_addsub.cpp
index d3f1e703b0..2eccaaeea0 100644
--- a/src/libOpenImageIO/imagebufalgo_addsub.cpp
+++ b/src/libOpenImageIO/imagebufalgo_addsub.cpp
@@ -44,6 +44,7 @@
 #include <OpenImageIO/deepdata.h>
 #include <OpenImageIO/dassert.h>
 #include "imageio_pvt.h"
+#include "imagebufalgo_cuda.h"
 
 
 
@@ -121,6 +122,15 @@ ImageBufAlgo::add (ImageBuf &dst, Image_or_Const A_, Image_or_Const B_,
             return false;
         ROI origroi = roi;
         roi.chend = std::min (roi.chend, std::min (A.nchannels(), B.nchannels()));
+
+#ifdef OIIO_USE_CUDA
+        if (dst.cuda_storage() && A.cuda_storage() && B.cuda_storage() &&
+                dst.roi() == roi && A.roi() == roi && B.roi() == roi) {
+            return pvt::add_impl_cuda (dst, A, B, dst.roi());
+        }
+// make >/dev/null && OPENIMAGEIO_LOG_TIMES=2 oiiotool -cuda -frames 1-1 -pattern fill:topleft=0,0,0:topright=0.5,0,0:bottomleft=0,0.5,0:bottomright=0.5,0.5,0.5 1920x1080 3 -pattern fill:topleft=0.5,0,0:topright=0,0.5,0:bottomleft=0.5,0.5,0.5:bottomright=0,.5,.5 1920x1080 3 -add -o out.exr
+#endif
+
         bool ok;
         OIIO_DISPATCH_COMMON_TYPES3 (ok, "add", add_impl, dst.spec().format,
                                      A.spec().format, B.spec().format,
@@ -209,6 +219,14 @@ ImageBufAlgo::sub (ImageBuf &dst, Image_or_Const A_, Image_or_Const B_,
             return false;
         ROI origroi = roi;
         roi.chend = std::min (roi.chend, std::min (A.nchannels(), B.nchannels()));
+
+#ifdef OIIO_USE_CUDA
+        if (dst.cuda_storage() && A.cuda_storage() && B.cuda_storage() &&
+                dst.roi() == roi && A.roi() == roi && B.roi() == roi) {
+            return pvt::sub_impl_cuda (dst, A, B, dst.roi());
+        }
+#endif
+
         bool ok;
         OIIO_DISPATCH_COMMON_TYPES3 (ok, "sub", sub_impl, dst.spec().format,
                                      A.spec().format, B.spec().format,
diff --git a/src/libOpenImageIO/imagebufalgo_cuda.cu b/src/libOpenImageIO/imagebufalgo_cuda.cu
new file mode 100644
index 0000000000..a474230d3c
--- /dev/null
+++ b/src/libOpenImageIO/imagebufalgo_cuda.cu
@@ -0,0 +1,110 @@
+/*
+  Copyright 2018 Larry Gritz and the other authors and contributors.
+  All Rights Reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of the software's owners nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  (This is the Modified BSD License)
+*/
+
+#include <OpenImageIO/imagebuf.h>
+#include <OpenImageIO/timer.h>
+#include "imagebufalgo_cuda.h"
+
+
+OIIO_NAMESPACE_BEGIN
+namespace pvt {
+
+
+__global__
+void add_cuda (float *R, const float *A, const float *B, ROI roi)
+{
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int stride = blockDim.x * gridDim.x;
+    int n = int(roi.npixels());
+    int nc = roi.nchannels();
+    for (int p = index; p < n; p += stride) {
+        int i = p*nc;
+        for (int c = roi.chbegin; c < roi.chend; ++c)
+            R[i+c] = A[i+c] + B[i+c];
+    }
+}
+
+
+
+bool
+add_impl_cuda (ImageBuf &R, const ImageBuf &A, const ImageBuf &B,
+               ROI roi)
+{
+    Timer timer;
+    int blockSize = 1024;
+    int numBlocks = (int(roi.npixels()) + blockSize - 1) / blockSize;
+    add_cuda<<<numBlocks, blockSize>>>((float *)R.localpixels(),
+                                       (const float *)A.localpixels(),
+                                       (const float *)B.localpixels(), roi);
+    cudaDeviceSynchronize();
+    OIIO::debug ("Running cuda ImageBufAlgo::add, %d blocks of %d: %gms\n",
+                 numBlocks, blockSize, timer()*1000.0f);
+    return true;
+}
+
+
+
+
+__global__
+void sub_cuda (float *R, const float *A, const float *B, ROI roi)
+{
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int stride = blockDim.x * gridDim.x;
+    int n = int(roi.npixels());
+    int nc = roi.nchannels();
+    for (int p = index; p < n; p += stride) {
+        int i = p*nc;
+        for (int c = roi.chbegin; c < roi.chend; ++c)
+            R[i+c] = A[i+c] - B[i+c];
+    }
+}
+
+
+
+bool
+sub_impl_cuda (ImageBuf &R, const ImageBuf &A, const ImageBuf &B,
+               ROI roi)
+{
+    Timer timer;
+    int blockSize = 1024;
+    int numBlocks = (int(roi.npixels()) + blockSize - 1) / blockSize;
+    sub_cuda<<<numBlocks, blockSize>>>((float *)R.localpixels(),
+                                       (const float *)A.localpixels(),
+                                       (const float *)B.localpixels(), roi);
+    cudaDeviceSynchronize();
+    OIIO::debug ("Running cuda ImageBufAlgo::sub, %d blocks of %d: %gms\n",
+                 numBlocks, blockSize, timer()*1000.0f);
+    return true;
+}
+
+
+}  // end namespace pvt
+OIIO_NAMESPACE_END
+
diff --git a/src/libOpenImageIO/imagebufalgo_cuda.h b/src/libOpenImageIO/imagebufalgo_cuda.h
new file mode 100644
index 0000000000..e071b76bef
--- /dev/null
+++ b/src/libOpenImageIO/imagebufalgo_cuda.h
@@ -0,0 +1,45 @@
+/*
+  Copyright 2018 Larry Gritz and the other authors and contributors.
+  All Rights Reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of the software's owners nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  (This is the Modified BSD License)
+*/
+
+#include <OpenImageIO/oiioversion.h>
+
+
+
+OIIO_NAMESPACE_BEGIN
+namespace pvt {
+
+
+bool add_impl_cuda (ImageBuf &R, const ImageBuf &A, const ImageBuf &B, ROI roi);
+bool sub_impl_cuda (ImageBuf &R, const ImageBuf &A, const ImageBuf &B, ROI roi);
+
+
+}  // end namespace pvt
+OIIO_NAMESPACE_END
+
diff --git a/src/libOpenImageIO/imageio.cpp b/src/libOpenImageIO/imageio.cpp
index cbd08fbe7c..f2fdf1d393 100644
--- a/src/libOpenImageIO/imageio.cpp
+++ b/src/libOpenImageIO/imageio.cpp
@@ -80,6 +80,7 @@ int oiio_print_debug (oiio_debug_env ? atoi(oiio_debug_env) : 0);
 int oiio_print_debug (oiio_debug_env ? atoi(oiio_debug_env) : 1);
 #endif
 int oiio_log_times = Strutil::from_string<int>(Sysutil::getenv("OPENIMAGEIO_LOG_TIMES"));
+atomic_int use_cuda (Strutil::from_string<int>(Sysutil::getenv("OPENIMAGEIO_CUDA")));
 }
 
 using namespace pvt;
@@ -282,6 +283,11 @@ attribute (string_view name, TypeDesc type, const void *val)
         default_thread_pool()->resize (ot-1);
         return true;
     }
+    if (name == "cuda" && type == TypeDesc::TypeInt) {
+        use_cuda = (*(const int *)val);
+        return true;
+    }
+
     spin_lock lock (attrib_mutex);
     if (name == "read_chunk" && type == TypeInt) {
         oiio_read_chunk = *(const int *)val;
@@ -323,6 +329,11 @@ getattribute (string_view name, TypeDesc type, void *val)
         *(int *)val = oiio_threads;
         return true;
     }
+    if (name == "cuda" && type == TypeDesc::TypeInt) {
+        *(int *)val = openimageio_cuda();
+        return true;
+    }
+
     spin_lock lock (attrib_mutex);
     if (name == "read_chunk" && type == TypeInt) {
         *(int *)val = oiio_read_chunk;
diff --git a/src/libOpenImageIO/imageio_cuda.cpp b/src/libOpenImageIO/imageio_cuda.cpp
new file mode 100644
index 0000000000..413097da29
--- /dev/null
+++ b/src/libOpenImageIO/imageio_cuda.cpp
@@ -0,0 +1,174 @@
+/*
+  Copyright 2018 Larry Gritz and the other authors and contributors.
+  All Rights Reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of the software's owners nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  (This is the Modified BSD License)
+*/
+
+#include <cstdlib>
+#include <cstdio>
+#include <mutex>
+
+#ifdef OIIO_USE_CUDA
+// #include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+
+#include <OpenImageIO/imageio.h>
+#include <OpenImageIO/strutil.h>
+#include <OpenImageIO/thread.h>
+#include "imageio_pvt.h"
+
+
+OIIO_NAMESPACE_BEGIN
+
+// Global private data
+namespace pvt {
+
+spin_mutex cuda_mutex;
+bool cuda_supported = false;
+std::string cuda_device_name;
+int cuda_driver_version = 0;
+int cuda_runtime_version = 0;
+int cuda_compatibility = 0;
+size_t cuda_total_memory = 0;
+
+
+
+#ifdef OIIO_USE_CUDA
+
+// This will output the proper CUDA error strings in the event that a
+// CUDA host call returns an error
+#define checkCudaErrors(err)  __checkCudaErrors (err, __FILE__, __LINE__)
+
+inline bool __checkCudaErrors(cudaError_t err, const char *file, const int line)
+{
+    if (cudaSuccess != err) {
+        Strutil::fprintf (stderr, "Cuda error %d (%s) at %s:%d\n",
+                          (int)err, cudaGetErrorString(err), file, line);
+    }
+    return true;
+    return (err == cudaSuccess);
+}
+
+
+
+static void
+initialize_cuda ()
+{
+    // Environment OPENIMAGEIO_CUDA=0 trumps everything else, turns off
+    // Cuda functionality.
+    const char *env = getenv ("OPENIMAGEIO_CUDA");
+    if (env && strtol(env,NULL,10) == 0)
+        return;
+
+    // if (! checkCudaErrors (cuInit (0)))
+    //     return;
+
+    // Get number of devices supporting CUDA
+    int deviceCount = 0;
+    if (! checkCudaErrors (cudaGetDeviceCount(&deviceCount))) {
+        return;
+    }
+
+    OIIO::debug ("Number of Cuda devices: %d\n", deviceCount);
+#if 0
+    for (int dev = 0; dev < deviceCount; ++dev) {
+        CUdevice device;
+        cudaGetDevice (&device, dev);
+        cudaSetDevice(dev);
+        cudaDeviceProp deviceProp;
+        cudaGetDeviceProperties(&deviceProp, dev);
+        cuda_device_name = deviceProp.name;
+        cuDriverGetVersion (&cuda_driver_version);
+        cudaRuntimeGetVersion (&cuda_runtime_version);
+        cuda_compatibility = 100 * deviceProp.major + deviceProp.minor;
+        cuda_total_memory = deviceProp.totalGlobalMem;
+        OIIO::debug ("Cuda device \"%s\": driver %s, runtime %s, Cuda compat %s\n",
+                     cuda_device_name, cuda_driver_version,
+                     cuda_runtime_version, cuda_compatibility);
+        OIIO::debug (" total mem %g MB\n", cuda_total_memory/(1024.0*1024.0));
+        break;  // only inventory the first Cuda device. FIXME? 
+    }
+#endif
+    cuda_supported = true;
+}
+
+#endif /* defined(OIIO_USE_CUDA) */
+
+
+
+bool
+openimageio_cuda ()
+{
+    if (! use_cuda)
+        return false;
+#ifdef OIIO_USE_CUDA
+    static std::once_flag cuda_initialized;
+    std::call_once (cuda_initialized, initialize_cuda);
+#endif
+    return cuda_supported;
+}
+
+
+struct cuda_force_initializer {
+    cuda_force_initializer() { (void) openimageio_cuda(); }
+};
+cuda_force_initializer init;
+
+
+
+void* cuda_malloc (size_t size)
+{
+#ifdef OIIO_USE_CUDA
+    if (use_cuda) {
+        char *cudaptr = nullptr;
+        checkCudaErrors (cudaMallocManaged (&cudaptr, size));
+        cudaDeviceSynchronize();
+        return cudaptr;
+    }
+#endif
+    return malloc (size);
+}
+
+
+
+void cuda_free (void *mem)
+{
+#ifdef OIIO_USE_CUDA
+    if (use_cuda) {
+        cudaDeviceSynchronize();
+        checkCudaErrors (cudaFree (mem));
+        return;
+    }
+#endif
+    return free (mem);
+}
+
+
+} // end namespace pvt
+
+OIIO_NAMESPACE_END
diff --git a/src/libOpenImageIO/imageio_pvt.h.in b/src/libOpenImageIO/imageio_pvt.h.in
index 4d2e7f1cee..9b3a565f3d 100644
--- a/src/libOpenImageIO/imageio_pvt.h.in
+++ b/src/libOpenImageIO/imageio_pvt.h.in
@@ -60,6 +60,14 @@ extern std::string library_list;
 extern int oiio_print_debug;
 extern int oiio_log_times;
 
+extern atomic_int use_cuda;
+extern bool cuda_supported;
+extern std::string cuda_device_name;
+extern int cuda_driver_version;
+extern int cuda_runtime_version;
+extern int cuda_compatibility;
+extern size_t cuda_total_memory;
+
 
 // For internal use - use error() below for a nicer interface.
 void seterror (string_view message);
@@ -75,6 +83,13 @@ inline void error (string_view fmt, const Args&... args) {
 // imageio_mutex is held.  For internal use only.
 void catalog_all_plugins (std::string searchpath);
 
+// Is Cuda available to OpenImageIO?
+bool openimageio_cuda ();
+
+/// Allocate unified Cuda/CPU memory
+void* cuda_malloc (size_t size);
+void cuda_free (void *mem);
+
 /// Given the format, set the default quantization range.
 void get_default_quantize (TypeDesc format,
                            long long &quant_min, long long &quant_max);
@@ -136,6 +151,7 @@ public:
     }
     void stop () { m_timer.stop(); }
     void rename (string_view name) { m_name = name; }
+    Timer& timer () { return m_timer; }
 private:
     Timer m_timer;
     std::string m_name;
diff --git a/src/oiiotool/oiiotool.cpp b/src/oiiotool/oiiotool.cpp
index 3f80d55b8d..813acb7c4f 100644
--- a/src/oiiotool/oiiotool.cpp
+++ b/src/oiiotool/oiiotool.cpp
@@ -482,6 +482,27 @@ unset_autopremult (int argc, const char *argv[])
 }
 
 
+static int
+enable_cuda (int argc, const char *argv[])
+{
+    ASSERT (argc == 1);
+    OIIO::attribute ("cuda", 1);
+    int r = OIIO::get_int_attribute ("cuda");   // force initialization
+    if (ot.debug)
+        std::cout << "Enable_cuda: " << r << "\n";
+    return 0;
+}
+
+
+static int
+disable_cuda (int argc, const char *argv[])
+{
+    ASSERT (argc == 1);
+    OIIO::attribute ("cuda", 0);
+    return 0;
+}
+
+
 
 static int
 action_label (int argc, const char *argv[])
@@ -5152,6 +5173,8 @@ getargs (int argc, char *argv[])
                 "-a", &ot.allsubimages, "Do operations on all subimages/miplevels",
                 "--debug", &ot.debug, "Debug mode",
                 "--runstats", &ot.runstats, "Print runtime statistics",
+                "--cuda %@", &enable_cuda, NULL, "Use Cuda if available",
+                "--nocuda %@", &disable_cuda, NULL, "Don't use Cuda, even if available",
                 "--info %@", set_printinfo, NULL, "Print resolution and basic info on all inputs, detailed metadata if -v is also used (options: format=xml:verbose=1)",
                 "--echo %@ %s", do_echo, NULL, "Echo message to console (options: newline=0)",
                 "--metamatch %s", &ot.printinfo_metamatch,