diff --git "a/docs/TinyInfiniTrain \344\275\234\344\270\232\346\212\245\345\221\212.md" "b/docs/TinyInfiniTrain \344\275\234\344\270\232\346\212\245\345\221\212.md" index bc23852..8bc72a8 100644 --- "a/docs/TinyInfiniTrain \344\275\234\344\270\232\346\212\245\345\221\212.md" +++ "b/docs/TinyInfiniTrain \344\275\234\344\270\232\346\212\245\345\221\212.md" @@ -1,10 +1,31 @@ # TinyInfiniTrain 作业报告 ## 一、test 通过截图 +![alt text](image-1.png) +![alt text](image-2.png) ## 二、作业步骤 > 将代码填入下面代码块中指定位置,并详细描述完成该作业的解决思路和遇到的问题。 +补充: +1. 创建构建目录并配置(启用 CUDA) +```bash +mkdir -p build/Release +cd build/Release +cmake -DBUILD_TEST=ON -DBUILD_TESTING=ON -DUSE_CUDA=ON ../.. +``` +2. 编译: +```bash +make -j$(nproc) +``` +3. 运行全部测试并打印失败: +```bash +ctest --output-on-failure +``` +4. 运行单测: +```bash +./test_matmul_cuda +``` ### 作业一:autograd机制调用Neg kernel的实现 @@ -20,6 +41,11 @@ std::vector> Neg::Forward(const std::vectorGetDevice().Type(); + auto kernel = Dispatcher::Instance().GetKernel({device, "NegForward"}); + return {kernel.Call>(input)}; } std::vector> Neg::Backward(const std::vector> &grad_outputs) { @@ -27,11 +53,17 @@ std::vector> Neg::Backward(const std::vectorGetDevice().Type(); + auto kernel = Dispatcher::Instance().GetKernel({device,"NegBackward"}); + return {kernel.Call>(grad_output)}; } ``` #### 解决思路 - +1.在 elementwise.cc 这一层,我们不关心具体的数学计算是如何在 CUDA 上实现的。只负责逻辑分发。 +2.利用单例模式 Dispatcher::Instance(),通过 {device, "KernelName"} 作为唯一索引(Key)检索对应的函数指针。 #### 遇到问题 @@ -54,6 +86,51 @@ std::vector> Neg::Backward(const std::vectorDims(); + const auto &b_dims = other->Dims(); + + CHECK_GE(a_dims.size(), 2); + CHECK_GE(b_dims.size(), 2); + + const int64_t m = a_dims[a_dims.size() - 2]; + const int64_t k = a_dims[a_dims.size() - 1]; + const int64_t n = b_dims[b_dims.size() - 1]; + + std::vector batch_dims(a_dims.begin(), a_dims.end() - 2); + std::vector b_batch_dims(b_dims.begin(), b_dims.end() - 2); + CHECK_EQ(batch_dims.size(), b_batch_dims.size()); + for (size_t i = 0; i < batch_dims.size(); ++i) CHECK_EQ(batch_dims[i], b_batch_dims[i]); + std::vector out_dims = batch_dims; + out_dims.push_back(m); + out_dims.push_back(n); + auto output = std::make_shared(out_dims, DataType::kFLOAT32); + int64_t batch_count = 1; + for (auto d : batch_dims) batch_count *= d; + + const float *a_ptr = static_cast(input->DataPtr()); + const float *b_ptr = static_cast(other->DataPtr()); + float *out_ptr = static_cast(output->DataPtr()); + + const int64_t a_block = m * k; + const int64_t b_block = k * n; + const int64_t out_block = m * n; + + for (int64_t batch = 0; batch < batch_count; ++batch) { + const float *a_block_ptr = a_ptr + batch * a_block; + const float *b_block_ptr = b_ptr + batch * b_block; + float *out_block_ptr = out_ptr + batch * out_block; + + Eigen::Map> A( + reinterpret_cast(a_block_ptr), m, k); + Eigen::Map> B( + reinterpret_cast(b_block_ptr), k, n); + Eigen::Map> C( + reinterpret_cast(out_block_ptr), m, n); + + C.noalias() = A * B; + } + + return output; } std::tuple, std::shared_ptr> @@ -63,6 +140,61 @@ std::vector> Neg::Backward(const std::vectorDims(); + const auto &b_dims = other->Dims(); + + CHECK_GE(a_dims.size(), 2); + CHECK_GE(b_dims.size(), 2); + + const int64_t m = a_dims[a_dims.size() - 2]; + const int64_t k = a_dims[a_dims.size() - 1]; + const int64_t n = b_dims[b_dims.size() - 1]; + + std::vector batch_dims(a_dims.begin(), a_dims.end() - 2); + std::vector b_batch_dims(b_dims.begin(), b_dims.end() - 2); + CHECK_EQ(batch_dims.size(), b_batch_dims.size()); + for (size_t i = 0; i < batch_dims.size(); ++i) CHECK_EQ(batch_dims[i], b_batch_dims[i]); + + auto grad_input = std::make_shared(a_dims, DataType::kFLOAT32); + auto grad_other = std::make_shared(b_dims, DataType::kFLOAT32); + + int64_t batch_count = 1; + for (auto d : batch_dims) batch_count *= d; + + const float *a_ptr = static_cast(input->DataPtr()); + const float *b_ptr = static_cast(other->DataPtr()); + const float *g_ptr = static_cast(grad_output->DataPtr()); + float *gi_ptr = static_cast(grad_input->DataPtr()); + float *go_ptr = static_cast(grad_other->DataPtr()); + + const int64_t a_block = m * k; + const int64_t b_block = k * n; + const int64_t g_block = m * n; + + for (int64_t batch = 0; batch < batch_count; ++batch) { + const float *a_block_ptr = a_ptr + batch * a_block; + const float *b_block_ptr = b_ptr + batch * b_block; + const float *g_block_ptr = g_ptr + batch * g_block; + float *gi_block_ptr = gi_ptr + batch * a_block; + float *go_block_ptr = go_ptr + batch * b_block; + + Eigen::Map> A( + reinterpret_cast(a_block_ptr), m, k); + Eigen::Map> B( + reinterpret_cast(b_block_ptr), k, n); + Eigen::Map> G( + reinterpret_cast(g_block_ptr), m, n); + + Eigen::Map> GI( + reinterpret_cast(gi_block_ptr), m, k); + Eigen::Map> GO( + reinterpret_cast(go_block_ptr), k, n); + + GI.noalias() = G * B.transpose(); + GO.noalias() = A.transpose() * G; + } + + return {grad_input, grad_other}; } ``` @@ -78,6 +210,61 @@ std::vector> Neg::Backward(const std::vectorDims(); + const auto &other_dims = other->Dims(); + + const int64_t m = input_dims[input_dims.size() - 2]; + const int64_t k = input_dims[input_dims.size() - 1]; + const int64_t n = other_dims[other_dims.size() - 1]; + + int64_t batch_count = 1; + std::vector leading_dims; + if (input_dims.size() > 2) { + leading_dims.assign(input_dims.begin(), input_dims.end() - 2); + for (auto d : leading_dims) batch_count *= d; + } + + std::vector output_dims; + if (!leading_dims.empty()) { + output_dims = leading_dims; + } + output_dims.push_back(m); + output_dims.push_back(n); + auto output = std::make_shared(output_dims, DataType::kFLOAT32, input->GetDevice()); + + const float alpha = 1.0f; + const float beta = 0.0f; + + cublasHandle_t handle; + CUBLAS_CHECK(cublasCreate(&handle)); + + // if no batch, do single sgemm; otherwise do per-batch sgemm + if (batch_count == 1) { + CUBLAS_CHECK(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, + static_cast(other->DataPtr()), n, + static_cast(input->DataPtr()), k, &beta, + static_cast(output->DataPtr()), n)); + } else { + // assume contiguous layout: batch-major then row-major for each matrix as used elsewhere + // input slice size = m * k, other slice size = k * n, output slice size = m * n + const int64_t in_stride = m * k; + const int64_t other_stride = k * n; + const int64_t out_stride = m * n; + const float *in_base = static_cast(input->DataPtr()); + const float *other_base = static_cast(other->DataPtr()); + float *out_base = static_cast(output->DataPtr()); + for (int64_t b = 0; b < batch_count; ++b) { + const float *in_ptr = in_base + b * in_stride; + const float *other_ptr = other_base + b * other_stride; + float *out_ptr = out_base + b * out_stride; + CUBLAS_CHECK(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, + other_ptr, n, in_ptr, k, &beta, out_ptr, n)); + } + } + + CUBLAS_CHECK(cublasDestroy(handle)); + return output; + } std::tuple, std::shared_ptr> @@ -87,15 +274,43 @@ std::vector> Neg::Backward(const std::vectorDims(); + const auto &b_dims = other->Dims(); + + const int64_t m = a_dims[0]; + const int64_t k = a_dims[1]; + const int64_t n = b_dims[1]; + + const auto grad_input = std::make_shared(a_dims,DataType::kFLOAT32,grad_output->GetDevice()); + const auto grad_other = std::make_shared(b_dims,DataType::kFLOAT32,grad_output->GetDevice()); + + const float alpha = 1.0f; + const float beta = 0.0f; + + cublasHandle_t handle; + CUBLAS_CHECK(cublasCreate(&handle)); + CUBLAS_CHECK(cublasSgemm(handle,CUBLAS_OP_T,CUBLAS_OP_N,k,m,n,&alpha, + static_cast(other->DataPtr()),n, + static_cast(grad_output->DataPtr()),n, + &beta, + static_cast(grad_input->DataPtr()),k)); + + CUBLAS_CHECK(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, n, k, m, &alpha, + static_cast(grad_output->DataPtr()), n, + static_cast(input->DataPtr()), k, + &beta, + static_cast(grad_other->DataPtr()), n)); + + CUBLAS_CHECK(cublasDestroy(handle)); + return {grad_input, grad_other}; } ``` #### 解决思路 - - +利用矩阵乘法的性质 $C^T = (A \times B)^T = B^T \times A^T$。在代码中,通过交换 $A$ 和 $B$ 的位置,并调整维度参数(如 $n, m, k$),巧妙地在不进行物理转置的情况下完成计算。 #### 遇到问题 - +cuBLAS 默认是列优先存储,而C++/PyTorch 通常是行优先,同时还要熟悉cublas矩阵乘的参数代表的含义, ### 作业三:实现Adam优化器 @@ -116,6 +331,22 @@ void AdamAccumulateGrad(const std::shared_ptr &grad, const std::shared_p // TODO:实现Adam优化器的梯度累积和参数更新 // REF: // =================================== 作业 =================================== + const auto n = grad->NumElements(); + const float *g_ptr = static_cast(grad->DataPtr()); + float *m_ptr = static_cast(m->DataPtr()); + float *v_ptr = static_cast(v->DataPtr()); + float *p_ptr = static_cast(param->DataPtr()); + + const float bias_correction1 = 1.0f - std::pow(beta1,t); + const float bias_correction2 = 1.0f - std::pow(beta2,t); + + for(size_t i = 0;i &grad, const std::shared_p 代码位置:infini_train/src/kernels/cuda/accumulate_grad.cu ```c++ +__global__ void AdamAccumulateGradKernel(const float *grad_ptr,float *p_ptr,float *m_ptr,float *v_ptr,float learning_rate, + float bias_correction1, float bias_correction2,float eps, int64_t t,size_t n,float beta1,float beta2){ + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if(idx < n) + { + m_ptr[idx] = beta1 * m_ptr[idx] + (1 - beta1) * grad_ptr[idx]; + v_ptr[idx] = beta2 * v_ptr[idx] + (1 - beta2) * grad_ptr[idx] * grad_ptr[idx]; + float m_hat = m_ptr[idx] / bias_correction1; + float v_hat = v_ptr[idx] / bias_correction2; + p_ptr[idx] -= learning_rate * m_hat / (sqrtf(v_hat) + eps); + } + } + + void AdamAccumulateGrad(const std::shared_ptr &grad, const std::shared_ptr ¶m, const std::shared_ptr &m, const std::shared_ptr &v, float learning_rate, float beta1, float beta2, float eps, int64_t t) { // =================================== 作业 =================================== // TODO:实现Adam优化器的梯度累积和参数更新 - // REF: + // REF: // =================================== 作业 =================================== + const auto n = grad->NumElements(); + const float *grad_ptr = static_cast(grad->DataPtr()); + float *p_ptr = static_cast(param->DataPtr()); + float *m_ptr = static_cast(m->DataPtr()); + float *v_ptr = static_cast(v->DataPtr()); + int threads_per_block = 256; + int num_blocks = (n + threads_per_block - 1) / threads_per_block; + float bias_correction1 = 1.0f - powf(beta1, t); + float bias_correction2 = 1.0f - powf(beta2, t); + AdamAccumulateGradKernel<<>>(grad_ptr,p_ptr,m_ptr,v_ptr,learning_rate,bias_correction1,bias_correction2,eps,t,n,beta1,beta2); + } ``` #### 解决思路 - +adam原理: +![alt text](image-3.png) #### 遇到问题 - +先把所有线程都需要计算相同矫正偏差算出来,从而减少每个线程都去再算一遍 ### 作业四:实现Tensor基础操作 @@ -160,6 +417,25 @@ std::shared_ptr Tensor::Flatten(int64_t start, int64_t end) { // TODO:实现张量扁平化操作,将指定维度范围[start, end]内的所有维度合并为一个维度 // HINT: // =================================== 作业 =================================== + const int64_t rank = static_cast(new_shape.size()); + if (start < 0) start += rank; + if (end < 0) end += rank; + CHECK_GE(start, 0); + CHECK_LT(start, rank); + CHECK_GE(end, 0); + CHECK_LT(end, rank); + CHECK_GE(end, start); + + int64_t new_size = 1; + for (int64_t i = start; i <= end; ++i) { + new_size *= new_shape[i]; + } + + + new_shape.erase(new_shape.begin() + start, new_shape.begin() + end + 1); + new_shape.insert(new_shape.begin() + start, new_size); + + return Contiguous()->View(new_shape); } ``` @@ -178,6 +454,37 @@ void Tensor::Backward(std::shared_ptr gradient, bool retain_graph, bool // 功能描述:1. 计算当前张量对叶子节点的梯度 2. 支持多输出场景的梯度累加 // HINT: // =================================== 作业 =================================== + std::shared_ptr grad = gradient; + if (!grad) { + //传入的loss必须是一个标量 + if (NumElements() != 1) { + LOG(FATAL) << "grad must be specified for non-scalar tensor"; + } + grad = std::make_shared(dims_, dtype_, GetDevice()); + grad->Fill(1.0f); + } + + + if (grad->GetDevice().Type() != GetDevice().Type()) { + grad = std::make_shared(grad->To(GetDevice())); + } + + CHECK_EQ(grad->NumElements(), NumElements()) << "gradient must have the same number of elements as tensor"; + + if (is_leaf()) { + if (!grad_) { + auto self = const_cast(this); + self->grad_ = std::make_shared(dims_, dtype_, GetDevice()); + self->grad_->Fill(0.0f); + } + auto kernel = Dispatcher::Instance().GetKernel({GetDevice().Type(), "AccumulateGrad"}); + kernel.Call(grad, 1.0f, grad_); + return; + } + + if (grad_fn_) { + grad_fn_->BackwardPartial(grad, output_idx_); + } } ``` @@ -204,6 +511,10 @@ template RetT Call(ArgsT... args) const { // 功能描述:将存储的函数指针转换为指定类型并调用 // HINT: // =================================== 作业 =================================== + using FuncT = RetT (*)(ArgsT...); + auto f = reinterpret_cast(func_ptr_); + CHECK(f != nullptr) << "Attempt to call null kernel function"; + return f(args...); } template void Register(const KeyT &key, FuncT &&kernel) { @@ -211,6 +522,9 @@ template void Register(const KeyT &key, FuncT &&kernel) { // TODO:实现kernel注册机制 // 功能描述:将kernel函数与设备类型、名称绑定 // =================================== 作业 =================================== + CHECK(!key_to_kernel_map_.contains(key)) + << "Kernel already registered: " << key.second << " on device: " << static_cast(key.first); + key_to_kernel_map_.emplace(key, KernelFunction(std::forward(kernel))); } #define REGISTER_KERNEL(device, kernel_name, kernel_func) \ @@ -218,9 +532,14 @@ template void Register(const KeyT &key, FuncT &&kernel) { // TODO:实现自动注册宏 // 功能描述:在全局静态区注册kernel,避免显式初始化代码 // =================================== 作业 =================================== + #define REGISTER_KERNEL(device, kernel_name, kernel_func) \ + static const int REGISTER_KERNEL_CONCAT(_reg_##kernel_name##_, __LINE__) = \ + ((void)::infini_train::Dispatcher::Instance().Register( \ + ::infini_train::Dispatcher::KeyT(device, #kernel_name), kernel_func), 0); ``` #### 解决思路 +自动微分的本质是构建一个有向无环图 (DAG)的拓扑拍寻,并沿着图进行反向遍历。 @@ -252,13 +571,71 @@ TinyShakespeareFile ReadTinyShakespeareFile(const std::string &path, size_t sequ | magic(4B) | version(4B) | num_toks(4B) | reserved(1012B) | token数据 | ---------------------------------------------------------------------------------- =================================== 作业 =================================== */ + TinyShakespeareFile out; + if (!std::filesystem::exists(path)) { + LOG(FATAL) << "File not found: " << path; + } + + std::ifstream ifs(path, std::ios::binary); + CHECK(ifs.is_open()) << "Failed to open file: " << path; + + const size_t header_bytes = 1024; + auto header = ReadSeveralBytesFromIfstream(header_bytes, &ifs); + + const auto version = BytesToType(header, 4); + CHECK(kTypeMap.find(static_cast(version)) != kTypeMap.end()) + << "Unsupported tiny shakespeare version: " << version; + const auto type = kTypeMap.at(static_cast(version)); + out.type = type; + + const auto num_toks = BytesToType(header, 8); + CHECK_GT(num_toks, 0U); + + const size_t orig_type_size = kTypeToSize.at(type); + + const size_t token_bytes = static_cast(num_toks) * orig_type_size; + std::vector tokens_bytes(token_bytes); + ifs.read(reinterpret_cast(tokens_bytes.data()), token_bytes); + CHECK_EQ(static_cast(ifs.gcount()), token_bytes) << "Failed to read token data"; + + const size_t sample_stride = sequence_length + 1; // each sample holds seq_len + 1 tokens + const size_t num_samples = num_toks / sample_stride; + CHECK_GT(num_samples, 0U) << "Not enough tokens for given sequence_length"; + + std::vector storage(num_samples * sample_stride); + + for (size_t i = 0; i < num_toks; ++i) { + int64_t val = 0; + if (orig_type_size == 2) { + val = static_cast(BytesToType(tokens_bytes, i * orig_type_size)); + } else if (orig_type_size == 4) { + val = static_cast(BytesToType(tokens_bytes, i * orig_type_size)); + } else { + LOG(FATAL) << "Unsupported token size: " << orig_type_size; + } + storage[i] = val; + } + + const std::vector backing_dims = {static_cast(num_samples * sample_stride)}; + out.tensor = infini_train::Tensor(backing_dims, infini_train::DataType::kINT64); + memcpy(out.tensor.DataPtr(), storage.data(), storage.size() * sizeof(int64_t)); + + out.dims = {static_cast(num_samples), static_cast(sequence_length)}; + + return out; } + TinyShakespeareDataset::TinyShakespeareDataset(const std::string &filepath, size_t sequence_length) { // =================================== 作业 =================================== // TODO:初始化数据集实例 // HINT: 调用ReadTinyShakespeareFile加载数据文件 // =================================== 作业 =================================== + text_file_ = ReadTinyShakespeareFile(filepath, sequence_length); + sequence_length_ = sequence_length; + const size_t sample_stride = sequence_length + 1; + sequence_size_in_bytes_ = sample_stride * sizeof(int64_t); + num_samples_ = static_cast(text_file_.dims[0]); } ``` @@ -277,6 +654,77 @@ Tokenizer::Tokenizer(const std::string &filepath) { | magic(4B) | version(4B) | vocab_size(4B) | reserved(1012B) | token词表数据 | ---------------------------------------------------------------------------------- ===================================== 作业 ===================================== */ + if (!std::filesystem::exists(filepath)) { + LOG(FATAL) << "Tokenizer file not found: " << filepath; + } + + std::ifstream ifs(filepath, std::ios::binary); + CHECK(ifs.is_open()) << "Failed to open tokenizer file: " << filepath; + + const size_t header_bytes = 1024; + auto header = ReadSeveralBytesFromIfstream(header_bytes, &ifs); + + const auto file_magic = BytesToType(header, 0); + const auto version = BytesToType(header, 4); + const auto vocab_size = BytesToType(header, 8); + + magic_number_ = version; + vocab_size_ = vocab_size; + + const auto file_size = std::filesystem::file_size(filepath); + const size_t remaining = (file_size > header_bytes) ? static_cast(file_size - header_bytes) : 0; + CHECK_GT(remaining, 0u) << "Empty vocab table in tokenizer file"; + + std::vector table_bytes(remaining); + ifs.read(reinterpret_cast(table_bytes.data()), remaining); + + size_t pos = 0; + token_table_.reserve(vocab_size_); + + bool parsed = false; + if (remaining >= 4) { + const uint32_t first_len = BytesToType(table_bytes, 0); + if (first_len > 0 && first_len < remaining) { + pos = 0; + try { + for (uint32_t i = 0; i < vocab_size_ && pos + 4 <= remaining; ++i) { + uint32_t len = BytesToType(table_bytes, pos); + pos += 4; + CHECK_LE(pos + len, remaining) << "Tokenizer entry length overflow"; + std::string token(reinterpret_cast(&table_bytes[pos]), len); + token_table_.push_back(std::move(token)); + pos += len; + } + if (token_table_.size() == vocab_size_) parsed = true; + } catch (...) { + parsed = false; + } + } + } + + if (!parsed) { + token_table_.clear(); + std::string cur; + for (size_t i = 0; i < table_bytes.size() && token_table_.size() < vocab_size_; ++i) { + if (table_bytes[i] == '\0') { + token_table_.push_back(cur); + cur.clear(); + } else { + cur.push_back(static_cast(table_bytes[i])); + } + } + if (!cur.empty() && token_table_.size() < vocab_size_) token_table_.push_back(cur); + } + + if (token_table_.size() > vocab_size_) token_table_.resize(vocab_size_); + while (token_table_.size() < vocab_size_) token_table_.push_back(""); + + auto it = kEotMap.find(magic_number_); + if (it != kEotMap.end()) { + eot_token_ = it->second; + } else { + eot_token_ = kGpt2Eot; // default + } } ``` @@ -286,6 +734,8 @@ std::string Tokenizer::Decode(uint32_t token_id) const { TODO:实现token_id到文本的转换 功能描述:根据token_id返回对应的文本片段 ===================================== 作业 ===================================== */ + if (token_id >= token_table_.size()) return std::string(); + return token_table_[token_id]; } ``` @@ -299,6 +749,40 @@ void Tokenizer::GenerateText(infini_train::nn::Module &model, uint32_t batch_siz TODO:实现单步文本生成逻辑 HINT:调用model.Forward推理获取logits,根据推理结果进行随机采样,调用Decode获取文本结果 ===================================== 作业 ===================================== */ + auto x = std::make_shared(x_tensor.To(device)); + + auto outputs = model.Forward({x}); + auto logits = outputs[0]; + + auto logits_cpu = logits->To(infini_train::Device(infini_train::DeviceType::kCPU, 0)); + const auto &ldims = logits_cpu.Dims(); + CHECK_EQ(ldims.size(), 3); + const int B = static_cast(ldims[0]); + const int T = static_cast(ldims[1]); + const int V = static_cast(ldims[2]); + + const float *logits_ptr = static_cast(logits_cpu.DataPtr()); + + std::vector probs(V); + for (int b = 0; b < B; ++b) { + const float *row = logits_ptr + static_cast(b * T + t) * V; + float maxv = row[0]; + for (int i = 1; i < V; ++i) maxv = std::max(maxv, row[i]); + double sum = 0.0; + for (int i = 0; i < V; ++i) { + probs[i] = std::exp(row[i] - maxv); + sum += probs[i]; + } + for (int i = 0; i < V; ++i) probs[i] = static_cast(probs[i] / sum); + + float coin = RandomF32(rng_state); + int sampled = SampleMult(probs.data(), V, coin); + + x_buff[static_cast(b) * sequence_length + t] = static_cast(sampled); + + std::cout << Decode(static_cast(sampled)); + } + } std::cout << std::endl; } @@ -309,4 +793,4 @@ void Tokenizer::GenerateText(infini_train::nn::Module &model, uint32_t batch_siz #### 遇到问题 - +这个整个流程还是难度还是有点大,讲课内容和这个没什么关系 diff --git a/docs/image-1.png b/docs/image-1.png new file mode 100644 index 0000000..3125ae8 Binary files /dev/null and b/docs/image-1.png differ diff --git a/docs/image-2.png b/docs/image-2.png new file mode 100644 index 0000000..6ee40eb Binary files /dev/null and b/docs/image-2.png differ diff --git a/docs/image-3.png b/docs/image-3.png new file mode 100644 index 0000000..c48efbb Binary files /dev/null and b/docs/image-3.png differ diff --git a/docs/image.png b/docs/image.png new file mode 100644 index 0000000..fd9cf75 Binary files /dev/null and b/docs/image.png differ diff --git a/example/common/tiny_shakespeare_dataset.cc b/example/common/tiny_shakespeare_dataset.cc index 3bc5f1b..d9e575d 100644 --- a/example/common/tiny_shakespeare_dataset.cc +++ b/example/common/tiny_shakespeare_dataset.cc @@ -61,6 +61,59 @@ TinyShakespeareFile ReadTinyShakespeareFile(const std::string &path, size_t sequ | magic(4B) | version(4B) | num_toks(4B) | reserved(1012B) | token数据 | ---------------------------------------------------------------------------------- =================================== 作业 =================================== */ + + TinyShakespeareFile out; + if (!std::filesystem::exists(path)) { + LOG(FATAL) << "File not found: " << path; + } + + std::ifstream ifs(path, std::ios::binary); + CHECK(ifs.is_open()) << "Failed to open file: " << path; + + const size_t header_bytes = 1024; + auto header = ReadSeveralBytesFromIfstream(header_bytes, &ifs); + + const auto version = BytesToType(header, 4); + CHECK(kTypeMap.find(static_cast(version)) != kTypeMap.end()) + << "Unsupported tiny shakespeare version: " << version; + const auto type = kTypeMap.at(static_cast(version)); + out.type = type; + + const auto num_toks = BytesToType(header, 8); + CHECK_GT(num_toks, 0U); + + const size_t orig_type_size = kTypeToSize.at(type); + + const size_t token_bytes = static_cast(num_toks) * orig_type_size; + std::vector tokens_bytes(token_bytes); + ifs.read(reinterpret_cast(tokens_bytes.data()), token_bytes); + CHECK_EQ(static_cast(ifs.gcount()), token_bytes) << "Failed to read token data"; + + const size_t sample_stride = sequence_length + 1; // each sample holds seq_len + 1 tokens + const size_t num_samples = num_toks / sample_stride; + CHECK_GT(num_samples, 0U) << "Not enough tokens for given sequence_length"; + + std::vector storage(num_samples * sample_stride); + + for (size_t i = 0; i < num_toks; ++i) { + int64_t val = 0; + if (orig_type_size == 2) { + val = static_cast(BytesToType(tokens_bytes, i * orig_type_size)); + } else if (orig_type_size == 4) { + val = static_cast(BytesToType(tokens_bytes, i * orig_type_size)); + } else { + LOG(FATAL) << "Unsupported token size: " << orig_type_size; + } + storage[i] = val; + } + + const std::vector backing_dims = {static_cast(num_samples * sample_stride)}; + out.tensor = infini_train::Tensor(backing_dims, infini_train::DataType::kINT64); + memcpy(out.tensor.DataPtr(), storage.data(), storage.size() * sizeof(int64_t)); + + out.dims = {static_cast(num_samples), static_cast(sequence_length)}; + + return out; } } // namespace @@ -69,6 +122,11 @@ TinyShakespeareDataset::TinyShakespeareDataset(const std::string &filepath, size // TODO:初始化数据集实例 // HINT: 调用ReadTinyShakespeareFile加载数据文件 // =================================== 作业 =================================== + text_file_ = ReadTinyShakespeareFile(filepath, sequence_length); + sequence_length_ = sequence_length; + const size_t sample_stride = sequence_length + 1; + sequence_size_in_bytes_ = sample_stride * sizeof(int64_t); + num_samples_ = static_cast(text_file_.dims[0]); } std::pair, std::shared_ptr> diff --git a/example/common/tiny_shakespeare_dataset.h b/example/common/tiny_shakespeare_dataset.h index 8e061dd..cd28243 100644 --- a/example/common/tiny_shakespeare_dataset.h +++ b/example/common/tiny_shakespeare_dataset.h @@ -37,7 +37,7 @@ class TinyShakespeareDataset : public infini_train::Dataset { private: TinyShakespeareFile text_file_; - const size_t sequence_length_ = 0; - const size_t sequence_size_in_bytes_ = 0; - const size_t num_samples_ = 0; + size_t sequence_length_ = 0; + size_t sequence_size_in_bytes_ = 0; + size_t num_samples_ = 0; }; diff --git a/example/common/tokenizer.cc b/example/common/tokenizer.cc index 23b9537..81e2f92 100644 --- a/example/common/tokenizer.cc +++ b/example/common/tokenizer.cc @@ -78,6 +78,78 @@ Tokenizer::Tokenizer(const std::string &filepath) { | magic(4B) | version(4B) | vocab_size(4B) | reserved(1012B) | token词表数据 | ---------------------------------------------------------------------------------- ===================================== 作业 ===================================== */ + + if (!std::filesystem::exists(filepath)) { + LOG(FATAL) << "Tokenizer file not found: " << filepath; + } + + std::ifstream ifs(filepath, std::ios::binary); + CHECK(ifs.is_open()) << "Failed to open tokenizer file: " << filepath; + + const size_t header_bytes = 1024; + auto header = ReadSeveralBytesFromIfstream(header_bytes, &ifs); + + const auto file_magic = BytesToType(header, 0); + const auto version = BytesToType(header, 4); + const auto vocab_size = BytesToType(header, 8); + + magic_number_ = version; + vocab_size_ = vocab_size; + + const auto file_size = std::filesystem::file_size(filepath); + const size_t remaining = (file_size > header_bytes) ? static_cast(file_size - header_bytes) : 0; + CHECK_GT(remaining, 0u) << "Empty vocab table in tokenizer file"; + + std::vector table_bytes(remaining); + ifs.read(reinterpret_cast(table_bytes.data()), remaining); + + size_t pos = 0; + token_table_.reserve(vocab_size_); + + bool parsed = false; + if (remaining >= 4) { + const uint32_t first_len = BytesToType(table_bytes, 0); + if (first_len > 0 && first_len < remaining) { + pos = 0; + try { + for (uint32_t i = 0; i < vocab_size_ && pos + 4 <= remaining; ++i) { + uint32_t len = BytesToType(table_bytes, pos); + pos += 4; + CHECK_LE(pos + len, remaining) << "Tokenizer entry length overflow"; + std::string token(reinterpret_cast(&table_bytes[pos]), len); + token_table_.push_back(std::move(token)); + pos += len; + } + if (token_table_.size() == vocab_size_) parsed = true; + } catch (...) { + parsed = false; + } + } + } + + if (!parsed) { + token_table_.clear(); + std::string cur; + for (size_t i = 0; i < table_bytes.size() && token_table_.size() < vocab_size_; ++i) { + if (table_bytes[i] == '\0') { + token_table_.push_back(cur); + cur.clear(); + } else { + cur.push_back(static_cast(table_bytes[i])); + } + } + if (!cur.empty() && token_table_.size() < vocab_size_) token_table_.push_back(cur); + } + + if (token_table_.size() > vocab_size_) token_table_.resize(vocab_size_); + while (token_table_.size() < vocab_size_) token_table_.push_back(""); + + auto it = kEotMap.find(magic_number_); + if (it != kEotMap.end()) { + eot_token_ = it->second; + } else { + eot_token_ = kGpt2Eot; // default + } } std::string Tokenizer::Decode(uint32_t token_id) const { @@ -85,7 +157,8 @@ std::string Tokenizer::Decode(uint32_t token_id) const { TODO:实现token_id到文本的转换 功能描述:根据token_id返回对应的文本片段 ===================================== 作业 ===================================== */ - return ""; + if (token_id >= token_table_.size()) return std::string(); + return token_table_[token_id]; } void Tokenizer::GenerateText(infini_train::nn::Module &model, uint32_t batch_size, uint32_t sequence_length, @@ -103,14 +176,54 @@ void Tokenizer::GenerateText(infini_train::nn::Module &model, uint32_t batch_siz for (int i = 0; i < prompt_len; ++i) { x_buff[i] = prompt[i]; } std::cout << "The meaning of life is"; - auto x = std::make_shared(x_tensor.To(device)); - uint64_t kRngState = kRngState; + uint64_t rng_state = kRngState; LOG(INFO) << "start generate text:"; for (int t = prompt_len; t < text_length; t++) { /* ===================================== 作业 ===================================== TODO:实现单步文本生成逻辑 HINT:调用model.Forward推理获取logits,根据推理结果进行随机采样,调用Decode获取文本结果 ===================================== 作业 ===================================== */ + // prepare input on device + auto x = std::make_shared(x_tensor.To(device)); + + // forward + auto outputs = model.Forward({x}); + auto logits = outputs[0]; + + // move logits to CPU for sampling + auto logits_cpu = logits->To(infini_train::Device(infini_train::DeviceType::kCPU, 0)); + const auto &ldims = logits_cpu.Dims(); + CHECK_EQ(ldims.size(), 3); + const int B = static_cast(ldims[0]); + const int T = static_cast(ldims[1]); + const int V = static_cast(ldims[2]); + + const float *logits_ptr = static_cast(logits_cpu.DataPtr()); + + // for each batch, sample from the distribution at position t + std::vector probs(V); + for (int b = 0; b < B; ++b) { + const float *row = logits_ptr + static_cast(b * T + t) * V; + // softmax (stable) + float maxv = row[0]; + for (int i = 1; i < V; ++i) maxv = std::max(maxv, row[i]); + double sum = 0.0; + for (int i = 0; i < V; ++i) { + probs[i] = std::exp(row[i] - maxv); + sum += probs[i]; + } + for (int i = 0; i < V; ++i) probs[i] = static_cast(probs[i] / sum); + + // sample + float coin = RandomF32(rng_state); + int sampled = SampleMult(probs.data(), V, coin); + + // write sampled token into CPU buffer + x_buff[static_cast(b) * sequence_length + t] = static_cast(sampled); + + // print decoded text + std::cout << Decode(static_cast(sampled)); + } } std::cout << std::endl; } diff --git a/infini_train/include/dispatcher.h b/infini_train/include/dispatcher.h index 5b91d85..e429cbf 100644 --- a/infini_train/include/dispatcher.h +++ b/infini_train/include/dispatcher.h @@ -2,6 +2,10 @@ #include #include +#include +#define REGISTER_KERNEL_CONCAT_IMPL(a, b) a##b +#define REGISTER_KERNEL_CONCAT(a, b) REGISTER_KERNEL_CONCAT_IMPL(a, b) + #include #include @@ -17,11 +21,13 @@ class KernelFunction { template RetT Call(ArgsT... args) const { // =================================== 作业 =================================== // TODO:实现通用kernel调用接口 - // 功能描述:将存储的函数指针转换为指定类型并调用 + // 功能描述:将存储的函数指针转换为x指定类型并调用 // =================================== 作业 =================================== using FuncT = RetT (*)(ArgsT...); - // TODO: 实现函数调用逻辑 + auto f = reinterpret_cast(func_ptr_); + CHECK(f != nullptr) << "Attempt to call null kernel function"; + return f(args...); } private: @@ -48,6 +54,9 @@ class Dispatcher { // TODO:实现kernel注册机制 // 功能描述:将kernel函数与设备类型、名称绑定 // =================================== 作业 =================================== + CHECK(!key_to_kernel_map_.contains(key)) + << "Kernel already registered: " << key.second << " on device: " << static_cast(key.first); + key_to_kernel_map_.emplace(key, KernelFunction(std::forward(kernel))); } private: @@ -55,8 +64,7 @@ class Dispatcher { }; } // namespace infini_train -#define REGISTER_KERNEL(device, kernel_name, kernel_func) \ - // =================================== 作业 =================================== - // TODO:实现自动注册宏 - // 功能描述:在全局静态区注册kernel,避免显式初始化代码 - // =================================== 作业 =================================== +#define REGISTER_KERNEL(device, kernel_name, kernel_func) \ + static const int REGISTER_KERNEL_CONCAT(_reg_##kernel_name##_, __LINE__) = \ + ((void)::infini_train::Dispatcher::Instance().Register( \ + ::infini_train::Dispatcher::KeyT(device, #kernel_name), kernel_func), 0); \ No newline at end of file diff --git a/infini_train/include/tensor.h b/infini_train/include/tensor.h index a6479de..c9098b2 100644 --- a/infini_train/include/tensor.h +++ b/infini_train/include/tensor.h @@ -65,6 +65,8 @@ class TensorBuffer { void *data_ = nullptr; }; + +//Tensor的定义 class Tensor : public std::enable_shared_from_this { public: Tensor() = default; diff --git a/infini_train/src/autograd/elementwise.cc b/infini_train/src/autograd/elementwise.cc index 5a790a5..15a26af 100644 --- a/infini_train/src/autograd/elementwise.cc +++ b/infini_train/src/autograd/elementwise.cc @@ -10,8 +10,11 @@ std::vector> Neg::Forward(const std::vector>(); + CHECK_EQ(input_tensors.size(),1); + auto input = input_tensors[0]; + auto device = input->GetDevice().Type(); + auto kernel = Dispatcher::Instance().GetKernel({device, "NegForward"}); + return {kernel.Call>(input)}; } std::vector> Neg::Backward(const std::vector> &grad_outputs) { @@ -19,14 +22,16 @@ std::vector> Neg::Backward(const std::vector>(); + CHECK_EQ(grad_outputs.size(),1); + auto &grad_output = grad_outputs[0]; + auto device = grad_output->GetDevice().Type(); + auto kernel = Dispatcher::Instance().GetKernel({device,"NegBackward"}); + return {kernel.Call>(grad_output)}; } std::vector> Reciprocal::Forward(const std::vector> &input_tensors) { CHECK_EQ(input_tensors.size(), 1); const auto &input = input_tensors[0]; - auto device = input->GetDevice().Type(); auto kernel = Dispatcher::Instance().GetKernel({device, "ReciprocalForward"}); return {kernel.Call>(input)}; diff --git a/infini_train/src/kernels/cpu/accumulate_grad.cc b/infini_train/src/kernels/cpu/accumulate_grad.cc index 55637cd..94e0d08 100644 --- a/infini_train/src/kernels/cpu/accumulate_grad.cc +++ b/infini_train/src/kernels/cpu/accumulate_grad.cc @@ -18,6 +18,22 @@ void AdamAccumulateGrad(const std::shared_ptr &grad, const std::shared_p // TODO:实现Adam优化器的梯度累积和参数更新 // REF: // =================================== 作业 =================================== + const auto n = grad->NumElements(); + const float *g_ptr = static_cast(grad->DataPtr()); + float *m_ptr = static_cast(m->DataPtr()); + float *v_ptr = static_cast(v->DataPtr()); + float *p_ptr = static_cast(param->DataPtr()); + + const float bias_correction1 = 1.0f - std::pow(beta1,t); + const float bias_correction2 = 1.0f - std::pow(beta2,t); + + for(size_t i = 0;i MatmulForward(const std::shared_ptr &input, cons // TODO:实现CPU上的矩阵乘法前向计算 // REF: // =================================== 作业 =================================== + const auto &a_dims = input->Dims(); + const auto &b_dims = other->Dims(); + + CHECK_GE(a_dims.size(), 2); + CHECK_GE(b_dims.size(), 2); + + const int64_t m = a_dims[a_dims.size() - 2]; + const int64_t k = a_dims[a_dims.size() - 1]; + const int64_t n = b_dims[b_dims.size() - 1]; + + std::vector batch_dims(a_dims.begin(), a_dims.end() - 2); + std::vector b_batch_dims(b_dims.begin(), b_dims.end() - 2); + CHECK_EQ(batch_dims.size(), b_batch_dims.size()); + for (size_t i = 0; i < batch_dims.size(); ++i) CHECK_EQ(batch_dims[i], b_batch_dims[i]); + std::vector out_dims = batch_dims; + out_dims.push_back(m); + out_dims.push_back(n); + auto output = std::make_shared(out_dims, DataType::kFLOAT32); + int64_t batch_count = 1; + for (auto d : batch_dims) batch_count *= d; + + const float *a_ptr = static_cast(input->DataPtr()); + const float *b_ptr = static_cast(other->DataPtr()); + float *out_ptr = static_cast(output->DataPtr()); + + const int64_t a_block = m * k; + const int64_t b_block = k * n; + const int64_t out_block = m * n; + + for (int64_t batch = 0; batch < batch_count; ++batch) { + const float *a_block_ptr = a_ptr + batch * a_block; + const float *b_block_ptr = b_ptr + batch * b_block; + float *out_block_ptr = out_ptr + batch * out_block; + + Eigen::Map> A( + reinterpret_cast(a_block_ptr), m, k); + Eigen::Map> B( + reinterpret_cast(b_block_ptr), k, n); + Eigen::Map> C( + reinterpret_cast(out_block_ptr), m, n); + + C.noalias() = A * B; + } - auto output = std::make_shared(); - return {output}; + return output; } std::tuple, std::shared_ptr> @@ -27,9 +69,60 @@ MatmulBackward(const std::shared_ptr &input, const std::shared_ptrDims(); + const auto &b_dims = other->Dims(); + + CHECK_GE(a_dims.size(), 2); + CHECK_GE(b_dims.size(), 2); + + const int64_t m = a_dims[a_dims.size() - 2]; + const int64_t k = a_dims[a_dims.size() - 1]; + const int64_t n = b_dims[b_dims.size() - 1]; + + std::vector batch_dims(a_dims.begin(), a_dims.end() - 2); + std::vector b_batch_dims(b_dims.begin(), b_dims.end() - 2); + CHECK_EQ(batch_dims.size(), b_batch_dims.size()); + for (size_t i = 0; i < batch_dims.size(); ++i) CHECK_EQ(batch_dims[i], b_batch_dims[i]); + + auto grad_input = std::make_shared(a_dims, DataType::kFLOAT32); + auto grad_other = std::make_shared(b_dims, DataType::kFLOAT32); + + int64_t batch_count = 1; + for (auto d : batch_dims) batch_count *= d; + + const float *a_ptr = static_cast(input->DataPtr()); + const float *b_ptr = static_cast(other->DataPtr()); + const float *g_ptr = static_cast(grad_output->DataPtr()); + float *gi_ptr = static_cast(grad_input->DataPtr()); + float *go_ptr = static_cast(grad_other->DataPtr()); + + const int64_t a_block = m * k; + const int64_t b_block = k * n; + const int64_t g_block = m * n; + + for (int64_t batch = 0; batch < batch_count; ++batch) { + const float *a_block_ptr = a_ptr + batch * a_block; + const float *b_block_ptr = b_ptr + batch * b_block; + const float *g_block_ptr = g_ptr + batch * g_block; + float *gi_block_ptr = gi_ptr + batch * a_block; + float *go_block_ptr = go_ptr + batch * b_block; + + Eigen::Map> A( + reinterpret_cast(a_block_ptr), m, k); + Eigen::Map> B( + reinterpret_cast(b_block_ptr), k, n); + Eigen::Map> G( + reinterpret_cast(g_block_ptr), m, n); + + Eigen::Map> GI( + reinterpret_cast(gi_block_ptr), m, k); + Eigen::Map> GO( + reinterpret_cast(go_block_ptr), k, n); + + GI.noalias() = G * B.transpose(); + GO.noalias() = A.transpose() * G; + } - auto grad_input = std::make_shared(); - auto grad_other = std::make_shared(); return {grad_input, grad_other}; } diff --git a/infini_train/src/kernels/cuda/accumulate_grad.cu b/infini_train/src/kernels/cuda/accumulate_grad.cu index 5f977c3..a12ba4e 100644 --- a/infini_train/src/kernels/cuda/accumulate_grad.cu +++ b/infini_train/src/kernels/cuda/accumulate_grad.cu @@ -22,6 +22,20 @@ void AccumulateGrad(const std::shared_ptr &gradient, float rate, const s AccumulateGradKernel<<>>(grad_ptr, rate, tensor_ptr, num_elements); } +__global__ void AdamAccumulateGradKernel(const float *grad_ptr,float *p_ptr,float *m_ptr,float *v_ptr,float learning_rate, + float bias_correction1, float bias_correction2,float eps, int64_t t,size_t n,float beta1,float beta2){ + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if(idx < n) + { + m_ptr[idx] = beta1 * m_ptr[idx] + (1 - beta1) * grad_ptr[idx]; + v_ptr[idx] = beta2 * v_ptr[idx] + (1 - beta2) * grad_ptr[idx] * grad_ptr[idx]; + float m_hat = m_ptr[idx] / bias_correction1; + float v_hat = v_ptr[idx] / bias_correction2; + p_ptr[idx] -= learning_rate * m_hat / (sqrtf(v_hat) + eps); + } + } + +// keep function inside namespace so `Tensor` (in namespace infini_train) is found unqualified void AdamAccumulateGrad(const std::shared_ptr &grad, const std::shared_ptr ¶m, const std::shared_ptr &m, const std::shared_ptr &v, float learning_rate, float beta1, float beta2, float eps, int64_t t) { @@ -29,7 +43,19 @@ void AdamAccumulateGrad(const std::shared_ptr &grad, const std::shared_p // TODO:实现Adam优化器的梯度累积和参数更新 // REF: // =================================== 作业 =================================== + const auto n = grad->NumElements(); + const float *grad_ptr = static_cast(grad->DataPtr()); + float *p_ptr = static_cast(param->DataPtr()); + float *m_ptr = static_cast(m->DataPtr()); + float *v_ptr = static_cast(v->DataPtr()); + int threads_per_block = 256; + int num_blocks = (n + threads_per_block - 1) / threads_per_block; + float bias_correction1 = 1.0f - powf(beta1, t); + float bias_correction2 = 1.0f - powf(beta2, t); + AdamAccumulateGradKernel<<>>(grad_ptr,p_ptr,m_ptr,v_ptr,learning_rate,bias_correction1,bias_correction2,eps,t,n,beta1,beta2); + } + } // namespace infini_train::kernels::cuda #define REGISTER_CUDA_ACCUMULATE_GRAD_KERNEL(kernel_name) \ diff --git a/infini_train/src/kernels/cuda/linear.cu b/infini_train/src/kernels/cuda/linear.cu index efaaaa6..c532602 100644 --- a/infini_train/src/kernels/cuda/linear.cu +++ b/infini_train/src/kernels/cuda/linear.cu @@ -28,8 +28,59 @@ std::shared_ptr MatmulForward(const std::shared_ptr &input, cons // TODO:实现CUDA上的矩阵乘法前向计算 // REF: // =================================== 作业 =================================== + const auto &input_dims = input->Dims(); + const auto &other_dims = other->Dims(); + + const int64_t m = input_dims[input_dims.size() - 2]; + const int64_t k = input_dims[input_dims.size() - 1]; + const int64_t n = other_dims[other_dims.size() - 1]; + + int64_t batch_count = 1; + std::vector leading_dims; + if (input_dims.size() > 2) { + leading_dims.assign(input_dims.begin(), input_dims.end() - 2); + for (auto d : leading_dims) batch_count *= d; + } + + std::vector output_dims; + if (!leading_dims.empty()) { + output_dims = leading_dims; + } + output_dims.push_back(m); + output_dims.push_back(n); + auto output = std::make_shared(output_dims, DataType::kFLOAT32, input->GetDevice()); + + const float alpha = 1.0f; + const float beta = 0.0f; + + cublasHandle_t handle; + CUBLAS_CHECK(cublasCreate(&handle)); - auto output = std::make_shared(); + // if no batch, do single sgemm; otherwise do per-batch sgemm + if (batch_count == 1) { + CUBLAS_CHECK(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, + static_cast(other->DataPtr()), n, + static_cast(input->DataPtr()), k, &beta, + static_cast(output->DataPtr()), n)); + } else { + // assume contiguous layout: batch-major then row-major for each matrix as used elsewhere + // input slice size = m * k, other slice size = k * n, output slice size = m * n + const int64_t in_stride = m * k; + const int64_t other_stride = k * n; + const int64_t out_stride = m * n; + const float *in_base = static_cast(input->DataPtr()); + const float *other_base = static_cast(other->DataPtr()); + float *out_base = static_cast(output->DataPtr()); + for (int64_t b = 0; b < batch_count; ++b) { + const float *in_ptr = in_base + b * in_stride; + const float *other_ptr = other_base + b * other_stride; + float *out_ptr = out_base + b * out_stride; + CUBLAS_CHECK(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, + other_ptr, n, in_ptr, k, &beta, out_ptr, n)); + } + } + + CUBLAS_CHECK(cublasDestroy(handle)); return output; } @@ -40,9 +91,34 @@ MatmulBackward(const std::shared_ptr &input, const std::shared_ptrDims(); + const auto &b_dims = other->Dims(); + + const int64_t m = a_dims[0]; + const int64_t k = a_dims[1]; + const int64_t n = b_dims[1]; + + const auto grad_input = std::make_shared(a_dims,DataType::kFLOAT32,grad_output->GetDevice()); + const auto grad_other = std::make_shared(b_dims,DataType::kFLOAT32,grad_output->GetDevice()); + + const float alpha = 1.0f; + const float beta = 0.0f; + + cublasHandle_t handle; + CUBLAS_CHECK(cublasCreate(&handle)); + CUBLAS_CHECK(cublasSgemm(handle,CUBLAS_OP_T,CUBLAS_OP_N,k,m,n,&alpha, + static_cast(other->DataPtr()),n, + static_cast(grad_output->DataPtr()),n, + &beta, + static_cast(grad_input->DataPtr()),k)); + + CUBLAS_CHECK(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, n, k, m, &alpha, + static_cast(grad_output->DataPtr()), n, + static_cast(input->DataPtr()), k, + &beta, + static_cast(grad_other->DataPtr()), n)); - auto grad_input = std::make_shared(); - auto grad_other = std::make_shared(); + CUBLAS_CHECK(cublasDestroy(handle)); return {grad_input, grad_other}; } @@ -225,9 +301,7 @@ LinearBackward(const std::shared_ptr &input, const std::shared_ptr>>(static_cast(grad_output->DataPtr()), static_cast(grad_bias->DataPtr()), out_features, bs); } - CUBLAS_CHECK(cublasDestroy(handle)); - return {grad_input, grad_weight, grad_bias}; } } // namespace infini_train::kernels::cuda diff --git a/infini_train/src/tensor.cc b/infini_train/src/tensor.cc index 8f8c744..4b6fcfd 100644 --- a/infini_train/src/tensor.cc +++ b/infini_train/src/tensor.cc @@ -282,8 +282,26 @@ std::shared_ptr Tensor::Flatten(int64_t start, int64_t end) { // TODO:实现张量扁平化操作,将指定维度范围[start, end]内的所有维度合并为一个维度 // HINT: // =================================== 作业 =================================== + std::vector new_shape = dims_; + + const int64_t rank = static_cast(new_shape.size()); + if (start < 0) start += rank; + if (end < 0) end += rank; + CHECK_GE(start, 0); + CHECK_LT(start, rank); + CHECK_GE(end, 0); + CHECK_LT(end, rank); + CHECK_GE(end, start); + + int64_t new_size = 1; + for (int64_t i = start; i <= end; ++i) { + new_size *= new_shape[i]; + } - return std::make_shared(); + new_shape.erase(new_shape.begin() + start, new_shape.begin() + end + 1); + new_shape.insert(new_shape.begin() + start, new_size); + + return Contiguous()->View(new_shape); } std::shared_ptr Tensor::Squeeze(int64_t dim) { @@ -358,6 +376,35 @@ void Tensor::Backward(std::shared_ptr gradient, bool retain_graph, bool // TODO:实现自动微分反向传播 // 功能描述:1. 计算当前张量对叶子节点的梯度 2. 支持多输出场景的梯度累加 // =================================== 作业 =================================== + std::shared_ptr grad = gradient; + if (!grad) { + //传入的loss必须是一个标量 + if (NumElements() != 1) { + LOG(FATAL) << "grad must be specified for non-scalar tensor"; + } + grad = std::make_shared(dims_, dtype_, GetDevice()); + grad->Fill(1.0f); + } + + if (grad->GetDevice().Type() != GetDevice().Type()) { + grad = std::make_shared(grad->To(GetDevice())); + } + + CHECK_EQ(grad->NumElements(), NumElements()) << "gradient must have the same number of elements as tensor"; + + if (is_leaf()) { + if (!grad_) { + auto self = const_cast(this); + self->grad_ = std::make_shared(dims_, dtype_, GetDevice()); + self->grad_->Fill(0.0f); + } + auto kernel = Dispatcher::Instance().GetKernel({GetDevice().Type(), "AccumulateGrad"}); + kernel.Call(grad, 1.0f, grad_); + return; + } + if (grad_fn_) { + grad_fn_->BackwardPartial(grad, output_idx_); + } } void Tensor::ZeroGrad() {