diff --git a/src/iv/imageviewer.cpp b/src/iv/imageviewer.cpp index 12db1aa772..0e996976aa 100644 --- a/src/iv/imageviewer.cpp +++ b/src/iv/imageviewer.cpp @@ -64,7 +64,7 @@ IsSpecSrgb(const ImageSpec& spec) static const char *s_file_filters = "" "Image Files (*.bmp *.cin *.dcm *.dds *.dpx *.fits *.gif *.hdr *.ico *.iff " "*.jpg *.jpe *.jpeg *.jif *.jfif *.jfi *.jp2 *.j2k *.jxl *.exr *.png *.pbm *.pgm " - "*.ppm *.psd *.ptex *.R3D *.r3d *.rla *.sgi *.rgb *.rgba *.bw *.int *.inta *.pic *.tga " + "*.ppm *.psd *.ptex *.r3d *.nev *.rla *.sgi *.rgb *.rgba *.bw *.int *.inta *.pic *.tga " "*.tpic *.tif *.tiff *.tx *.env *.sm *.vsm *.vdb *.webp *.zfile);;" "BMP (*.bmp);;" "Cineon (*.cin);;" @@ -79,13 +79,14 @@ static const char *s_file_filters = "" "JPEG (*.jpg *.jpe *.jpeg *.jif *.jfif *.jfi);;" "JPEG-2000 (*.jp2 *.j2k);;" "JPEG XL (*.jxl);;" + "Nikon (*.nev);;" "OpenEXR (*.exr);;" "OpenVDB (*.vdb);;" "PhotoShop (*.psd);;" "Portable Network Graphics (*.png);;" "PNM / Netpbm (*.pbm *.pgm *.ppm);;" "Ptex (*.ptex);;" - "R3D (*.R3D *.r3d);;" + "R3D (*.r3d);;" "RLA (*.rla);;" "SGI (*.sgi *.rgb *.rgba *.bw *.int *.inta);;" "Softimage PIC (*.pic);;" diff --git a/src/r3d.imageio/r3dinput.cpp b/src/r3d.imageio/r3dinput.cpp index 59998f3329..d682589fc5 100644 --- a/src/r3d.imageio/r3dinput.cpp +++ b/src/r3d.imageio/r3dinput.cpp @@ -5,12 +5,19 @@ // R3D SDK can be downloaded from the following site: // https://www.red.com/download/r3d-sdk // -// The code has been tested with the version 8.5.1 installed in -// /opt/R3DSDKv8_5_1 directory and setting up the variable -// export R3DSDK_ROOT="/opt/R3DSDKv8_5_1" +// The code has been tested with the version 9.1.2 installed in +// /opt/R3DSDKv9_1_2 directory and setting up the shell variable +// export R3DSDK_ROOT="/opt/R3DSDKv9_1_2" +#define GPU +#ifdef OIIO_USE_CUDA +# define CUDA +#endif // OIIO_USE_CUDA + +#include "OpenImageIO/platform.h" #include #include +#include #include #include @@ -58,7 +65,7 @@ OIIO_PLUGIN_NAMESPACE_BEGIN -#if 0 || !defined(NDEBUG) /* allow R3D configuration debugging */ +#if 1 || !defined(NDEBUG) // allow R3D configuration debugging static bool r3d_debug = Strutil::stoi(Sysutil::getenv("OIIO_R3D_DEBUG")); # define DBG(...) \ if (r3d_debug) \ @@ -82,11 +89,17 @@ class R3dInput final : public ImageInput { const char* format_name(void) const override { return "r3d"; } int supports(string_view feature) const override { - return (feature == "ioproxy"); + if (feature == "multiimage" || feature == "appendsubimage" + || feature == "random_access" || feature == "ioproxy") + return true; + return false; } - bool open(const std::string& name, ImageSpec& spec) override; - bool open(const std::string& name, ImageSpec& spec, + bool open(const std::string& name, ImageSpec& newspec, const ImageSpec& config) override; + bool open(const std::string& name, ImageSpec& newspec) override + { + return open(name, newspec, ImageSpec()); + } bool seek_subimage(int subimage, int miplevel) override; bool read_native_scanline(int subimage, int miplevel, int y, int z, void* data) override; @@ -109,6 +122,11 @@ class R3dInput final : public ImageInput { std::unique_ptr m_config; // Saved copy of configuration spec R3DSDK::Clip* m_clip; R3DSDK::VideoDecodeJob m_job; +#ifdef GPU + bool m_gpu; + R3DSDK::DecodeStatus m_supported; + R3DSDK::AsyncDecompressJob m_async_decompress_job; +#endif // GPU unsigned char* m_image_buffer; int m_frames; int m_channels; @@ -149,7 +167,7 @@ OIIO_EXPORT const char* r3d_imageio_library_version() { // Note: SDK version can differ from the actual library loaded - return "R3D 8.5.1"; + return "R3D 9.1.2"; } OIIO_EXPORT ImageInput* @@ -158,12 +176,499 @@ r3d_input_imageio_create() return new R3dInput; } -OIIO_EXPORT const char* r3d_input_extensions[] = { "r3d", nullptr }; +OIIO_EXPORT const char* r3d_input_extensions[] = { "r3d", "nev", nullptr }; OIIO_PLUGIN_EXPORTS_END +#ifdef CUDA + +namespace { +R3DSDK::GpuDecoder* GPU_DECODER; +R3DSDK::REDCuda* RED_CUDA; + +int CUDA_DEVICE_ID = 0; +volatile bool decodeDone = false; +} // namespace + +class SimpleMemoryPool { +public: + static SimpleMemoryPool* getInstance() + { + static SimpleMemoryPool* instance = NULL; + + if (instance == NULL) { + std::unique_lock lock(guard); + if (instance == NULL) { + instance = new SimpleMemoryPool(); + } + } + return instance; + } + + static cudaError_t cudaMalloc(void** p, size_t size) + { + DBG("cudaMalloc {}\n", size); + + return getInstance()->malloc_d(p, size); + } + + static cudaError_t cudaFree(void* p) { return getInstance()->free_d(p); } + + static cudaError_t cudaMallocArray(struct cudaArray** array, + const struct cudaChannelFormatDesc* desc, + size_t width, size_t height = 0, + unsigned int flags = 0) + { + DBG("cudaMallocArray {} {} {}\n", width, height, flags); + + return getInstance()->malloc_array(array, desc, width, height, flags); + } + + static cudaError_t + cudaMalloc3DArray(struct cudaArray** array, + const struct cudaChannelFormatDesc* desc, + struct cudaExtent ext, unsigned int flags = 0) + { + return getInstance()->malloc_array_3d(array, desc, ext, flags); + } + + static cudaError_t cudaFreeArray(cudaArray* p) + { + getInstance()->free_array(p); + + return cudaSuccess; + } + + static cudaError_t cudaMallocHost(void** p, size_t size) + { + return getInstance()->malloc_h(p, size); + } + + static cudaError_t cudaHostAlloc(void** p, size_t size, unsigned int flags) + { + return getInstance()->hostAlloc_h(p, size, flags); + } + + static cudaError_t cudaFreeHost(void* p) + { + getInstance()->free_h(p); + + return cudaSuccess; + } + +private: + static std::mutex guard; + + cudaError_t malloc_d(void** p, size_t size) + { + int device = 0; + cudaGetDevice(&device); + cudaError_t result = cudaSuccess; + *p = _device.findBlock(size, device); + + if (*p == NULL) { + result = ::cudaMalloc(p, size); + if (result != cudaSuccess) { + std::cerr << "Memory allocation of " << size + << " bytes failed: " << result << "\n"; + _device.sweep(); + _array.sweep(); + result = ::cudaMalloc(p, size); + } + if (result == cudaSuccess) + _device.addBlock(*p, size, device); + } + return result; + } + + cudaError_t free_d(void* p) + { + _device.releaseBlock(p); + return cudaSuccess; + } + + cudaError_t malloc_array(struct cudaArray** array, + const struct cudaChannelFormatDesc* desc, + size_t width, size_t height = 0, + unsigned int flags = 0) + { + int device = 0; + cudaGetDevice(&device); + cudaError_t result = cudaSuccess; + *array = (cudaArray*)_array.findBlock(width, height, 0, *desc, device); + + if (*array == NULL) { + result = ::cudaMallocArray(array, desc, width, height, flags); + if (result != cudaSuccess) { + DBG("Memory allocation failed: {}\n", static_cast(result)); + _device.sweep(); + _array.sweep(); + result = ::cudaMallocArray(array, desc, width, height, flags); + } + if (result == cudaSuccess) + _array.addBlock(*array, width, height, 0, *desc, device); + } + return result; + } + + cudaError_t malloc_array_3d(struct cudaArray** array, + const struct cudaChannelFormatDesc* desc, + const struct cudaExtent& ext, + unsigned int flags = 0) + { + int device = 0; + cudaGetDevice(&device); + cudaError_t result = cudaSuccess; + *array = (cudaArray*)_array.findBlock(ext.width, ext.height, ext.depth, + *desc, device); + + if (*array == NULL) { + result = ::cudaMalloc3DArray(array, desc, ext, flags); + if (result != cudaSuccess) { + DBG("Memory allocation failed: {}\n", static_cast(result)); + _device.sweep(); + _array.sweep(); + result = ::cudaMalloc3DArray(array, desc, ext, flags); + } + if (result == cudaSuccess) + _array.addBlock(*array, ext.width, ext.height, ext.depth, *desc, + device); + } + return result; + } + + void free_array(void* p) { _array.releaseBlock(p); } + + cudaError_t malloc_h(void** p, size_t size) + { + int device = 0; + cudaGetDevice(&device); + cudaError_t result = cudaSuccess; + *p = _host.findBlock(size, device); + + if (*p == NULL) { + result = ::cudaMallocHost(p, size); + if (result != cudaSuccess) { + DBG("Memory allocation failed: {}\n", static_cast(result)); + _host.sweep(); + result = ::cudaMallocHost(p, size); + } + if (result == cudaSuccess) + _host.addBlock(*p, size, device); + } + return result; + } + + void free_h(void* p) + { + if (!_host.releaseBlock(p)) { + _hostAlloc.releaseBlock(p); + } + } + + cudaError_t hostAlloc_h(void** p, size_t size, unsigned int flags) + { + int device = 0; + cudaGetDevice(&device); + cudaError_t result = cudaSuccess; + *p = _hostAlloc.findBlock(size, device); + + if (*p == NULL) { + result = ::cudaHostAlloc(p, size, flags); + if (result != cudaSuccess) { + DBG("Memory allocation failed: {}\n", static_cast(result)); + _hostAlloc.sweep(); + result = ::cudaHostAlloc(p, size, flags); + } + if (result == cudaSuccess) + _hostAlloc.addBlock(*p, size, device); + } + return result; + } + + struct BLOCK { + void* ptr; + size_t size; + int device; + }; + + struct ARRAY { + void* ptr; + size_t width; + size_t height; + size_t depth; + cudaChannelFormatDesc desc; + int device; + }; + + class Pool { + public: + void addBlock(void* ptr, size_t size, int device) + { + std::unique_lock lock(_guard); + + _inUse[ptr] = { ptr, size, device }; + } + + void* findBlock(size_t size, int device) + { + std::unique_lock lock(_guard); + + for (auto i = _free.begin(); i < _free.end(); ++i) { + if (i->size == size && i->device == device) { + void* p = i->ptr; + _inUse[p] = *i; + _free.erase(i); + return p; + } + } + return NULL; + } + + bool releaseBlock(void* ptr) + { + std::unique_lock lock(_guard); + + auto i = _inUse.find(ptr); + + if (i != _inUse.end()) { + _free.push_back(i->second); + _inUse.erase(i); + return true; + } + return false; + } + + void sweep() + { + std::unique_lock lock(_guard); + + for (auto i = _free.begin(); i < _free.end(); ++i) { + ::cudaFree(i->ptr); + } + _free.clear(); + } + + private: + std::map _inUse; + std::vector _free; + std::mutex _guard; + }; + + class ArrayPool { + public: + void addBlock(void* ptr, size_t width, size_t height, size_t depth, + const cudaChannelFormatDesc& desc, int device) + { + std::unique_lock lock(_guard); + + _inUse[ptr] = { ptr, width, height, depth, desc, device }; + } + + void* findBlock(size_t width, size_t height, size_t depth, + const cudaChannelFormatDesc& desc, int device) + { + std::unique_lock lock(_guard); + + for (auto i = _free.begin(); i < _free.end(); ++i) { + if (i->width == width && i->height == height + && i->depth == depth && i->desc.x == desc.x + && i->desc.y == desc.y && i->desc.z == desc.z + && i->desc.w == desc.w && i->desc.f == desc.f + && i->device == device) { + void* p = i->ptr; + _inUse[p] = *i; + _free.erase(i); + return p; + } + } + return NULL; + } + + bool releaseBlock(void* ptr) + { + std::unique_lock lock(_guard); + + auto i = _inUse.find(ptr); + + if (i != _inUse.end()) { + _free.push_back(i->second); + + _inUse.erase(i); + + return true; + } + return false; + } + + void sweep() + { + std::unique_lock lock(_guard); + + for (auto i = _free.begin(); i < _free.end(); ++i) { + ::cudaFree(i->ptr); + } + _free.clear(); + } + + private: + std::map _inUse; + std::vector _free; + std::mutex _guard; + }; + + Pool _device; + Pool _host; + Pool _hostAlloc; + ArrayPool _array; +}; + +std::mutex SimpleMemoryPool::guard; + + + +namespace { +R3DSDK::DebayerCudaJob* +DebayerAllocate(const R3DSDK::AsyncDecompressJob* job, + R3DSDK::ImageProcessingSettings* imageProcessingSettings, + R3DSDK::VideoPixelType pixelType) +{ + //allocate the debayer job + R3DSDK::DebayerCudaJob* data = RED_CUDA->createDebayerJob(); + + data->raw_host_mem = job->OutputBuffer; + data->mode = job->Mode; + data->imageProcessingSettings = imageProcessingSettings; + data->pixelType = pixelType; + + //create raw buffer on the CUDA device + cudaError_t err = SimpleMemoryPool::cudaMalloc(&(data->raw_device_mem), + job->OutputBufferSize); + + if (err != cudaSuccess) { + DBG("Failed to allocate raw frame on GPU: {}\n", static_cast(err)); + RED_CUDA->releaseDebayerJob(data); + return NULL; + } + + data->output_device_mem_size = R3DSDK::DebayerCudaJob::ResultFrameSize( + data); + DBG("data->output_device_mem_size = {}\n", data->output_device_mem_size); + + //YOU MUST specify an existing buffer for the result image + //Set DebayerCudaJob::output_device_mem_size >= result_buffer_size + //and a pointer to the device buffer in DebayerCudaJob::output_device_mem + err = SimpleMemoryPool::cudaMalloc(&(data->output_device_mem), + data->output_device_mem_size); + + if (err != cudaSuccess) { + DBG("Failed to allocate result frame on card {}\n", + static_cast(err)); + SimpleMemoryPool::cudaFree(data->raw_device_mem); + RED_CUDA->releaseDebayerJob(data); + return NULL; + } + + return data; +} + + + +void +DebayerFree(R3DSDK::DebayerCudaJob* job) +{ + SimpleMemoryPool::cudaFree(job->raw_device_mem); + SimpleMemoryPool::cudaFree(job->output_device_mem); + RED_CUDA->releaseDebayerJob(job); +} + + + +template class ConcurrentQueue { +private: + std::mutex QUEUE_MUTEX; + std::condition_variable QUEUE_CV; + std::list QUEUE; + +public: + void push(T* job) + { + std::unique_lock lck(QUEUE_MUTEX); + QUEUE.push_back(job); + QUEUE_CV.notify_all(); + } + + void pop(T*& job) + { + std::unique_lock lck(QUEUE_MUTEX); + + while (QUEUE.size() == 0) + QUEUE_CV.wait(lck); + + job = QUEUE.front(); + QUEUE.pop_front(); + } + + size_t size() const { return QUEUE.size(); } +}; + + + +void +CPU_callback(R3DSDK::AsyncDecompressJob* item, + R3DSDK::DecodeStatus decodeStatus) +{ + // DBG("CPU_callback()\n"); + Strutil::print("CPU_callback()\n"); + decodeDone = true; +} + + + +R3DSDK::REDCuda* +OpenCUDA(int& deviceId) +{ + //setup Cuda for the current thread + cudaDeviceProp deviceProp; + cudaError_t err = cudaChooseDevice(&deviceId, &deviceProp); + if (err != cudaSuccess) { + DBG("Failed to move raw frame to card {}\n", static_cast(err)); + return NULL; + } + + err = cudaSetDevice(deviceId); + if (err != cudaSuccess) { + DBG("Failed to move raw frame to card {}\n", static_cast(err)); + return NULL; + } + + //SETUP YOUR CUDA API FUNCTION POINTERS + R3DSDK::EXT_CUDA_API api; + api.cudaFree = SimpleMemoryPool::cudaFree; + api.cudaFreeArray = SimpleMemoryPool::cudaFreeArray; + api.cudaFreeHost = SimpleMemoryPool::cudaFreeHost; + api.cudaFreeMipmappedArray = ::cudaFreeMipmappedArray; + api.cudaHostAlloc = SimpleMemoryPool::cudaHostAlloc; + api.cudaMalloc = SimpleMemoryPool::cudaMalloc; + api.cudaMalloc3D = ::cudaMalloc3D; + api.cudaMalloc3DArray = SimpleMemoryPool::cudaMalloc3DArray; + api.cudaMallocArray = SimpleMemoryPool::cudaMallocArray; + api.cudaMallocHost = SimpleMemoryPool::cudaMallocHost; + api.cudaMallocMipmappedArray = ::cudaMallocMipmappedArray; + api.cudaMallocPitch = ::cudaMallocPitch; + + + //CREATE THE REDCuda CLASS + return new R3DSDK::REDCuda(api); +} +} //end anonymous namespace + +#endif // CUDA + + + void R3dInput::initialize() { @@ -172,19 +677,30 @@ R3dInput::initialize() std::string library_path = Sysutil::getenv("OIIO_R3D_LIBRARY_PATH", #if defined(__linux__) - "/opt/R3DSDKv8_5_1/Redistributable/linux" + "/opt/R3DSDKv9_1_2/Redistributable/linux" #elif defined(__APPLE__) - "/Library/R3DSDKv8_5_1/Redistributable/mac" + "/Library/R3DSDKv9_1_2/Redistributable/mac" #elif defined(__WINDOWS__) - "C:\\R3DSDKv8_5_1\\Redistributable\\win" + "C:\\R3DSDKv9_1_2\\Redistributable\\win" #else # error "Unknown OS" #endif ); // initialize SDK - // R3DSDK::InitializeStatus init_status = R3DSDK::InitializeSdk(".", OPTION_RED_CUDA); + + unsigned int optional_components = +#ifdef CUDA + OPTION_RED_CUDA; +#elif defined(OpenCL) + OPTION_RED_OPENCL; +#elif defined(Metal) + OPTION_RED_METAL; +#else + OPTION_RED_NONE; +#endif + R3DSDK::InitializeStatus init_status - = R3DSDK::InitializeSdk(library_path.c_str(), OPTION_RED_NONE); + = R3DSDK::InitializeSdk(library_path.c_str(), optional_components); if (init_status != R3DSDK::ISInitializeOK) { R3DSDK::FinalizeSdk(); DBG("Failed to load R3DSDK Library\n"); @@ -192,15 +708,16 @@ R3dInput::initialize() } DBG("SDK VERSION: {}\n", R3DSDK::GetSdkVersion()); -#ifdef GPU +#ifdef CUDA // open CUDA device RED_CUDA = OpenCUDA(CUDA_DEVICE_ID); if (RED_CUDA == NULL) { - R3DSDK::FinalizeSdk(); DBG("Failed to initialize CUDA\n"); } -#endif // GPU + + m_gpu = true; +#endif // CUDA } @@ -213,15 +730,6 @@ R3dInput::open(const std::string& name, ImageSpec& newspec, ioproxy_retrieve_from_config(config); m_config.reset(new ImageSpec(config)); // save config spec - return open(name, newspec); -} - - - -bool -R3dInput::open(const std::string& name, ImageSpec& newspec) -{ - DBG("R3dInput::open(name, newspec)\n"); m_filename = name; @@ -241,9 +749,43 @@ R3dInput::open(const std::string& name, ImageSpec& newspec) DBG("Loaded {}\n", m_filename); + int hint = 0; + if (m_config) { + hint = m_config->get_int_attribute("oiio:hint"); + DBG("hint = {}\n", hint); + } + + R3DSDK::VideoDecodeMode mode = R3DSDK::DECODE_FULL_RES_PREMIUM; + int scale = 1; + + switch (hint) { + case 0: + mode = R3DSDK::DECODE_FULL_RES_PREMIUM; + scale = 1; + break; + case 1: + mode = R3DSDK::DECODE_HALF_RES_GOOD; + scale = 2; + break; + case 2: + mode = R3DSDK::DECODE_QUARTER_RES_GOOD; + scale = 4; + break; + case 3: + mode = R3DSDK::DECODE_EIGHT_RES_GOOD; + scale = 8; + break; + case 4: + mode = R3DSDK::DECODE_SIXTEENTH_RES_GOOD; + scale = 16; + break; + } + // calculate how much ouput memory we're going to need - size_t width = m_clip->Width(); - size_t height = m_clip->Height(); + size_t width = m_clip->Width() / scale; + size_t height = m_clip->Height() / scale; + + DBG("{}×{}\n", width, height); m_channels = 3; @@ -256,37 +798,97 @@ R3dInput::open(const std::string& name, ImageSpec& newspec) m_fps = m_clip->VideoAudioFramerate(); + DBG("File list count {}\n", m_clip->FileListCount()); + // three channels (RGB) in 16-bit (2 bytes) requires this much memory: size_t memNeeded = m_channels * width * height * sizeof(uint16_t); // alloc this memory 16-byte aligned m_image_buffer = static_cast(aligned_malloc(memNeeded, 16)); - if (m_image_buffer == NULL) { + if (m_image_buffer == nullptr) { DBG("Failed to allocate {} bytes of memory for output image\n", static_cast(memNeeded)); return false; } - // letting the decoder know how big the buffer is (we do that here - // since AlignedMalloc below will overwrite the value in this - m_job.OutputBufferSize = memNeeded; +#ifdef GPU + if (m_gpu) { + // open GPU decoder + GPU_DECODER = new R3DSDK::GpuDecoder(); + GPU_DECODER->Open(); + + m_supported = GPU_DECODER->DecodeSupportedForClip(*m_clip); + } + + if (m_supported == R3DSDK::DSDecodeOK) { + m_async_decompress_job.Clip = m_clip; + + m_async_decompress_job.Mode = mode; + + // letting the decoder know how big the buffer is + m_async_decompress_job.OutputBufferSize + = R3DSDK::AsyncDecoder::GetSizeBufferNeeded(m_async_decompress_job); + + DBG("OutputBufferSize = {}\n", m_async_decompress_job.OutputBufferSize); + + m_async_decompress_job.OutputBuffer = static_cast( + aligned_malloc(m_async_decompress_job.OutputBufferSize, 16)); + + // Interleaved RGB decoding in 16-bits per pixel + // m_async_decompress_job.PixelType + // = R3DSDK::PixelType_16Bit_RGB_Interleaved; - m_job.Mode = R3DSDK::DECODE_FULL_RES_PREMIUM; + // m_async_decompress_job.ImageProcessing = NULL; + // m_async_decompress_job.HdrProcessing = NULL; + // m_async_decompress_job.Callback = CPU_callback; + } else +#endif // GPU + { + // letting the decoder know how big the buffer is + m_job.OutputBufferSize = memNeeded; + + m_job.Mode = mode; - // store the image here - m_job.OutputBuffer = m_image_buffer; + // store the image here + m_job.OutputBuffer = m_image_buffer; - // Interleaved RGB decoding in 16-bits per pixel - m_job.PixelType = R3DSDK::PixelType_16Bit_RGB_Interleaved; - m_job.BytesPerRow = m_channels * width * sizeof(uint16_t); + // Interleaved RGB decoding in 16-bits per pixel + m_job.PixelType = R3DSDK::PixelType_16Bit_RGB_Interleaved; + + m_job.ImageProcessing = NULL; + m_job.HdrProcessing = NULL; + } m_spec = ImageSpec(width, height, m_channels, TypeDesc::UINT16); - m_spec.attribute("FramesPerSecond", TypeFloat, &m_fps); + + int frame_rate_numerator = m_clip->MetadataItemAsInt( + R3DSDK::RMD_FRAMERATE_NUMERATOR); + int frame_rate_denominator = m_clip->MetadataItemAsInt( + R3DSDK::RMD_FRAMERATE_DENOMINATOR); + int frame_rate[2] = { frame_rate_numerator, frame_rate_denominator }; + + bool record_frame_rate_exists = m_clip->MetadataExists( + R3DSDK::RMD_RECORD_FRAMERATE_NUMERATOR); + if (record_frame_rate_exists) { + int record_frame_rate_numerator = m_clip->MetadataItemAsInt( + R3DSDK::RMD_RECORD_FRAMERATE_NUMERATOR); + int record_frame_rate_denominator = m_clip->MetadataItemAsInt( + R3DSDK::RMD_RECORD_FRAMERATE_DENOMINATOR); + int record_frame_rate[2] = { record_frame_rate_numerator, + record_frame_rate_denominator }; + m_spec.attribute("FramesPerSecond", TypeRational, &record_frame_rate); + } else { + m_spec.attribute("FramesPerSecond", TypeRational, &frame_rate); + } + m_spec.attribute("oiio:Movie", true); m_spec.attribute("oiio:subimages", int(m_frames)); m_spec.attribute("oiio:BitsPerSample", 16); +#ifdef GPU + m_spec.attribute("oiio:GPU", m_gpu); +#endif // GPU newspec = m_spec; m_next_scanline = 0; @@ -304,9 +906,125 @@ R3dInput::read_frame(int pos) seek(pos); } - R3DSDK::DecodeStatus status = m_clip->DecodeVideoFrame(pos, m_job); - if (status != R3DSDK::DSDecodeOK) { - DBG("Failed to decode frame {}\n", pos); +#ifdef GPU + if (m_gpu && (m_supported == R3DSDK::DSDecodeOK)) { + m_async_decompress_job.VideoFrameNo = pos; + m_async_decompress_job.VideoTrackNo = 0; + m_async_decompress_job.Callback = CPU_callback; + + decodeDone = false; + + int device = CUDA_DEVICE_ID; + cudaStream_t stream; + cudaError_t err; + + err = cudaStreamCreate(&stream); + if (err != cudaSuccess) { + DBG("Failed to create stream {}\n", static_cast(err)); + return; + } + + R3DSDK::DecodeStatus decode_status = GPU_DECODER->DecodeForGpuSdk( + m_async_decompress_job); + if (decode_status != R3DSDK::DSDecodeOK) { + DBG("Failed to decode frame {} with status {}\n", pos, + static_cast(decode_status)); + cudaStreamDestroy(stream); + return; + } + + while (!decodeDone) { + usleep(1000); + } + + R3DSDK::ImageProcessingSettings* ips + = new R3DSDK::ImageProcessingSettings(); + m_async_decompress_job.Clip->GetDefaultImageProcessingSettings(*ips); + + const R3DSDK::VideoPixelType pixelType + = R3DSDK::PixelType_16Bit_RGB_Interleaved; + + R3DSDK::DebayerCudaJob* debayer_cuda_job + = DebayerAllocate(&m_async_decompress_job, ips, pixelType); + if (debayer_cuda_job == nullptr) { + delete ips; + cudaStreamDestroy(stream); + return; + } + + m_async_decompress_job.PrivateData = debayer_cuda_job; + + DBG("debayer_cuda_job = {}\n", static_cast(debayer_cuda_job)); + DBG(" raw_host_mem = {}\n", + static_cast(debayer_cuda_job->raw_host_mem)); + DBG(" raw_device_mem = {}\n", + static_cast(debayer_cuda_job->raw_device_mem)); + DBG(" output_device_mem_size = {}\n", + debayer_cuda_job->output_device_mem_size); + DBG(" output_device_mem = {}\n", debayer_cuda_job->output_device_mem); + DBG(" mode = {}\n", static_cast(debayer_cuda_job->mode)); + DBG(" pixelType = {}\n", + static_cast(debayer_cuda_job->pixelType)); + + R3DSDK::REDCuda::Status status + = RED_CUDA->processAsync(device, stream, debayer_cuda_job, err); + + if (status != R3DSDK::REDCuda::Status_Ok) { + DBG("Failed to process frame, error {}\n", + static_cast(status)); + delete debayer_cuda_job->imageProcessingSettings; + debayer_cuda_job->imageProcessingSettings = NULL; + DebayerFree(debayer_cuda_job); + + cudaStreamDestroy(stream); + return; + } + + debayer_cuda_job->completeAsync(); + + size_t result_buffer_size = R3DSDK::DebayerCudaJob::ResultFrameSize( + debayer_cuda_job); + + DBG("result_buffer_size = {}\n", result_buffer_size); + + //allocate the result buffer in host memory. + if (result_buffer_size != debayer_cuda_job->output_device_mem_size) { + DBG("Result buffer size does not match expected size: Expected: {} Actual: {}\n", + result_buffer_size, debayer_cuda_job->output_device_mem_size); + } + + if (m_image_buffer != nullptr) { + //read the GPU buffer back to the host memory result buffer. - Note this is not always the optimal way to read back. (Use pinned memory in a real app) + cudaError_t err = cudaMemcpy(m_image_buffer, + debayer_cuda_job->output_device_mem, + result_buffer_size, + cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + DBG("Failed to read result frame from card {}\n", + static_cast(err)); + } else { + //ensure the read is complete. + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + DBG("Failed to finish after reading result frame from card {}\n", + static_cast(err)); + } + } + } + + delete debayer_cuda_job->imageProcessingSettings; + debayer_cuda_job->imageProcessingSettings = NULL; + DebayerFree(debayer_cuda_job); + + cudaStreamDestroy(stream); + } else +#endif // GPU + { + R3DSDK::DecodeStatus decode_status = m_clip->DecodeVideoFrame(pos, + m_job); + if (decode_status != R3DSDK::DSDecodeOK) { + DBG("Failed to decode frame {}\n", pos); + } } m_last_search_pos = pos; @@ -320,7 +1038,7 @@ R3dInput::read_frame(int pos) bool R3dInput::seek_subimage(int subimage, int miplevel) { - DBG("R3dInput::seek_subimage({}, {})\n", subimage, miplevel); + // DBG("R3dInput::seek_subimage({}, {})\n", subimage, miplevel); if (subimage < 0 || subimage >= m_nsubimages || miplevel > 0) { return false; @@ -339,7 +1057,7 @@ bool R3dInput::read_native_scanline(int subimage, int miplevel, int y, int /*z*/, void* data) { - DBG("R3dInput::read_native_scanline({}, {}, {})\n", subimage, miplevel, y); + // DBG("R3dInput::read_native_scanline({}, {}, {})\n", subimage, miplevel, y); lock_guard lock(*this); if (!seek_subimage(subimage, miplevel)) @@ -403,16 +1121,29 @@ R3dInput::fps() const bool R3dInput::close() { + lock_guard lock(*this); DBG("R3dInput::close()\n"); + DBG("m_filename = {}\n", m_filename); if (m_clip) { - delete m_clip; + // delete m_clip; m_clip = nullptr; } + if (m_image_buffer) { aligned_free(m_image_buffer); m_image_buffer = nullptr; } + +#ifdef GPU + if (m_gpu) { + if (GPU_DECODER) { + GPU_DECODER->Close(); + delete GPU_DECODER; + GPU_DECODER = nullptr; + } + } +#endif // GPU reset(); // Reset to initial state return true; }