Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
500 changes: 492 additions & 8 deletions docs/TinyInfiniTrain 作业报告.md

Large diffs are not rendered by default.

Binary file added docs/image-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/image-2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/image-3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/image.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
58 changes: 58 additions & 0 deletions example/common/tiny_shakespeare_dataset.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,59 @@ TinyShakespeareFile ReadTinyShakespeareFile(const std::string &path, size_t sequ
| magic(4B) | version(4B) | num_toks(4B) | reserved(1012B) | token数据 |
----------------------------------------------------------------------------------
=================================== 作业 =================================== */

TinyShakespeareFile out;
if (!std::filesystem::exists(path)) {
LOG(FATAL) << "File not found: " << path;
}

std::ifstream ifs(path, std::ios::binary);
CHECK(ifs.is_open()) << "Failed to open file: " << path;

const size_t header_bytes = 1024;
auto header = ReadSeveralBytesFromIfstream(header_bytes, &ifs);

const auto version = BytesToType<uint32_t>(header, 4);
CHECK(kTypeMap.find(static_cast<int>(version)) != kTypeMap.end())
<< "Unsupported tiny shakespeare version: " << version;
const auto type = kTypeMap.at(static_cast<int>(version));
out.type = type;

const auto num_toks = BytesToType<uint32_t>(header, 8);
CHECK_GT(num_toks, 0U);

const size_t orig_type_size = kTypeToSize.at(type);

const size_t token_bytes = static_cast<size_t>(num_toks) * orig_type_size;
std::vector<uint8_t> tokens_bytes(token_bytes);
ifs.read(reinterpret_cast<char *>(tokens_bytes.data()), token_bytes);
CHECK_EQ(static_cast<size_t>(ifs.gcount()), token_bytes) << "Failed to read token data";

const size_t sample_stride = sequence_length + 1; // each sample holds seq_len + 1 tokens
const size_t num_samples = num_toks / sample_stride;
CHECK_GT(num_samples, 0U) << "Not enough tokens for given sequence_length";

std::vector<int64_t> storage(num_samples * sample_stride);

for (size_t i = 0; i < num_toks; ++i) {
int64_t val = 0;
if (orig_type_size == 2) {
val = static_cast<int64_t>(BytesToType<uint16_t>(tokens_bytes, i * orig_type_size));
} else if (orig_type_size == 4) {
val = static_cast<int64_t>(BytesToType<uint32_t>(tokens_bytes, i * orig_type_size));
} else {
LOG(FATAL) << "Unsupported token size: " << orig_type_size;
}
storage[i] = val;
}

const std::vector<int64_t> backing_dims = {static_cast<int64_t>(num_samples * sample_stride)};
out.tensor = infini_train::Tensor(backing_dims, infini_train::DataType::kINT64);
memcpy(out.tensor.DataPtr(), storage.data(), storage.size() * sizeof(int64_t));

out.dims = {static_cast<int64_t>(num_samples), static_cast<int64_t>(sequence_length)};

return out;
}
} // namespace

Expand All @@ -69,6 +122,11 @@ TinyShakespeareDataset::TinyShakespeareDataset(const std::string &filepath, size
// TODO:初始化数据集实例
// HINT: 调用ReadTinyShakespeareFile加载数据文件
// =================================== 作业 ===================================
text_file_ = ReadTinyShakespeareFile(filepath, sequence_length);
sequence_length_ = sequence_length;
const size_t sample_stride = sequence_length + 1;
sequence_size_in_bytes_ = sample_stride * sizeof(int64_t);
num_samples_ = static_cast<size_t>(text_file_.dims[0]);
}

std::pair<std::shared_ptr<infini_train::Tensor>, std::shared_ptr<infini_train::Tensor>>
Expand Down
6 changes: 3 additions & 3 deletions example/common/tiny_shakespeare_dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class TinyShakespeareDataset : public infini_train::Dataset {

private:
TinyShakespeareFile text_file_;
const size_t sequence_length_ = 0;
const size_t sequence_size_in_bytes_ = 0;
const size_t num_samples_ = 0;
size_t sequence_length_ = 0;
size_t sequence_size_in_bytes_ = 0;
size_t num_samples_ = 0;
};
119 changes: 116 additions & 3 deletions example/common/tokenizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,87 @@ Tokenizer::Tokenizer(const std::string &filepath) {
| magic(4B) | version(4B) | vocab_size(4B) | reserved(1012B) | token词表数据 |
----------------------------------------------------------------------------------
===================================== 作业 ===================================== */

if (!std::filesystem::exists(filepath)) {
LOG(FATAL) << "Tokenizer file not found: " << filepath;
}

std::ifstream ifs(filepath, std::ios::binary);
CHECK(ifs.is_open()) << "Failed to open tokenizer file: " << filepath;

const size_t header_bytes = 1024;
auto header = ReadSeveralBytesFromIfstream(header_bytes, &ifs);

const auto file_magic = BytesToType<uint32_t>(header, 0);
const auto version = BytesToType<uint32_t>(header, 4);
const auto vocab_size = BytesToType<uint32_t>(header, 8);

magic_number_ = version;
vocab_size_ = vocab_size;

const auto file_size = std::filesystem::file_size(filepath);
const size_t remaining = (file_size > header_bytes) ? static_cast<size_t>(file_size - header_bytes) : 0;
CHECK_GT(remaining, 0u) << "Empty vocab table in tokenizer file";

std::vector<uint8_t> table_bytes(remaining);
ifs.read(reinterpret_cast<char *>(table_bytes.data()), remaining);

size_t pos = 0;
token_table_.reserve(vocab_size_);

bool parsed = false;
if (remaining >= 4) {
const uint32_t first_len = BytesToType<uint32_t>(table_bytes, 0);
if (first_len > 0 && first_len < remaining) {
pos = 0;
try {
for (uint32_t i = 0; i < vocab_size_ && pos + 4 <= remaining; ++i) {
uint32_t len = BytesToType<uint32_t>(table_bytes, pos);
pos += 4;
CHECK_LE(pos + len, remaining) << "Tokenizer entry length overflow";
std::string token(reinterpret_cast<const char *>(&table_bytes[pos]), len);
token_table_.push_back(std::move(token));
pos += len;
}
if (token_table_.size() == vocab_size_) parsed = true;
} catch (...) {
parsed = false;
}
}
}

if (!parsed) {
token_table_.clear();
std::string cur;
for (size_t i = 0; i < table_bytes.size() && token_table_.size() < vocab_size_; ++i) {
if (table_bytes[i] == '\0') {
token_table_.push_back(cur);
cur.clear();
} else {
cur.push_back(static_cast<char>(table_bytes[i]));
}
}
if (!cur.empty() && token_table_.size() < vocab_size_) token_table_.push_back(cur);
}

if (token_table_.size() > vocab_size_) token_table_.resize(vocab_size_);
while (token_table_.size() < vocab_size_) token_table_.push_back("");

auto it = kEotMap.find(magic_number_);
if (it != kEotMap.end()) {
eot_token_ = it->second;
} else {
eot_token_ = kGpt2Eot; // default
}
}

// Maps a token id back to its text fragment.
//
// Returns the entry from token_table_ for in-range ids, and an empty string
// for out-of-range ids (e.g. ids beyond the loaded vocabulary), so callers
// can print the result unconditionally without a bounds check of their own.
std::string Tokenizer::Decode(uint32_t token_id) const {
    if (token_id >= token_table_.size()) {
        return std::string();
    }
    return token_table_[token_id];
}

void Tokenizer::GenerateText(infini_train::nn::Module &model, uint32_t batch_size, uint32_t sequence_length,
Expand All @@ -103,14 +176,54 @@ void Tokenizer::GenerateText(infini_train::nn::Module &model, uint32_t batch_siz
for (int i = 0; i < prompt_len; ++i) { x_buff[i] = prompt[i]; }
std::cout << "The meaning of life is";

auto x = std::make_shared<infini_train::Tensor>(x_tensor.To(device));
uint64_t kRngState = kRngState;
uint64_t rng_state = kRngState;
LOG(INFO) << "start generate text:";
for (int t = prompt_len; t < text_length; t++) {
/* ===================================== 作业 =====================================
TODO:实现单步文本生成逻辑
HINT:调用model.Forward推理获取logits,根据推理结果进行随机采样,调用Decode获取文本结果
===================================== 作业 ===================================== */
// prepare input on device
auto x = std::make_shared<infini_train::Tensor>(x_tensor.To(device));

// forward
auto outputs = model.Forward({x});
auto logits = outputs[0];

// move logits to CPU for sampling
auto logits_cpu = logits->To(infini_train::Device(infini_train::DeviceType::kCPU, 0));
const auto &ldims = logits_cpu.Dims();
CHECK_EQ(ldims.size(), 3);
const int B = static_cast<int>(ldims[0]);
const int T = static_cast<int>(ldims[1]);
const int V = static_cast<int>(ldims[2]);

const float *logits_ptr = static_cast<const float *>(logits_cpu.DataPtr());

// for each batch, sample from the distribution at position t
std::vector<float> probs(V);
for (int b = 0; b < B; ++b) {
const float *row = logits_ptr + static_cast<size_t>(b * T + t) * V;
// softmax (stable)
float maxv = row[0];
for (int i = 1; i < V; ++i) maxv = std::max(maxv, row[i]);
double sum = 0.0;
for (int i = 0; i < V; ++i) {
probs[i] = std::exp(row[i] - maxv);
sum += probs[i];
}
for (int i = 0; i < V; ++i) probs[i] = static_cast<float>(probs[i] / sum);

// sample
float coin = RandomF32(rng_state);
int sampled = SampleMult(probs.data(), V, coin);

// write sampled token into CPU buffer
x_buff[static_cast<size_t>(b) * sequence_length + t] = static_cast<int64_t>(sampled);

// print decoded text
std::cout << Decode(static_cast<uint32_t>(sampled));
}
}
std::cout << std::endl;
}
Expand Down
22 changes: 15 additions & 7 deletions infini_train/include/dispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

#include <iostream>
#include <map>
#include <string>
#define REGISTER_KERNEL_CONCAT_IMPL(a, b) a##b
#define REGISTER_KERNEL_CONCAT(a, b) REGISTER_KERNEL_CONCAT_IMPL(a, b)

#include <type_traits>
#include <utility>

Expand All @@ -17,11 +21,13 @@ class KernelFunction {
// Generic typed invocation of the stored type-erased kernel pointer.
// The caller supplies the exact return type and argument types the kernel
// was registered with; the erased pointer is cast back to that signature
// and invoked. Passing mismatched types here is undefined behavior, so the
// call site (Dispatcher::GetKernel users) must match the kernel's real
// signature exactly.
template <typename RetT, class... ArgsT> RetT Call(ArgsT... args) const {
    using FuncT = RetT (*)(ArgsT...);
    // Guard before the call: invoking a null function pointer is UB.
    CHECK(func_ptr_ != nullptr) << "Attempt to call null kernel function";
    auto typed_fn = reinterpret_cast<FuncT>(func_ptr_);
    return typed_fn(std::forward<ArgsT>(args)...);
}

private:
Expand All @@ -48,15 +54,17 @@ class Dispatcher {
// TODO:实现kernel注册机制
// 功能描述:将kernel函数与设备类型、名称绑定
// =================================== 作业 ===================================
CHECK(!key_to_kernel_map_.contains(key))
<< "Kernel already registered: " << key.second << " on device: " << static_cast<int>(key.first);
key_to_kernel_map_.emplace(key, KernelFunction(std::forward<FuncT>(kernel)));
}

private:
std::map<KeyT, KernelFunction> key_to_kernel_map_;
};
} // namespace infini_train

// Registers `kernel_func` with the global Dispatcher under (device, "kernel_name")
// at static-initialization time, so no explicit registration call is needed.
// REGISTER_KERNEL_CONCAT appends __LINE__ to the dummy variable name so the
// same kernel name can be registered (e.g. for different devices) in one
// translation unit without identifier collisions; `static` gives the dummy
// internal linkage so it cannot collide across translation units either.
#define REGISTER_KERNEL(device, kernel_name, kernel_func)                                                              \
    static const int REGISTER_KERNEL_CONCAT(_reg_##kernel_name##_, __LINE__) =                                         \
        ((void)::infini_train::Dispatcher::Instance().Register(                                                        \
             ::infini_train::Dispatcher::KeyT(device, #kernel_name), kernel_func),                                     \
         0);
2 changes: 2 additions & 0 deletions infini_train/include/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ class TensorBuffer {
void *data_ = nullptr;
};


//Tensor的定义
class Tensor : public std::enable_shared_from_this<Tensor> {
public:
Tensor() = default;
Expand Down
15 changes: 10 additions & 5 deletions infini_train/src/autograd/elementwise.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,28 @@ std::vector<std::shared_ptr<Tensor>> Neg::Forward(const std::vector<std::shared_
// TODO:通过Dispatcher获取设备专属kernel,对输入张量进行取反操作
// NOTES: 依赖test_dispatcher,Neg kernel实现已给出
// =================================== 作业 ===================================

return std::vector<std::shared_ptr<Tensor>>();
CHECK_EQ(input_tensors.size(),1);
auto input = input_tensors[0];
auto device = input->GetDevice().Type();
auto kernel = Dispatcher::Instance().GetKernel({device, "NegForward"});
return {kernel.Call<std::shared_ptr<Tensor>>(input)};
}

// Backward pass of elementwise negation.
//
// Since d(-x)/dx = -1, the gradient w.r.t. the input is just the negated
// upstream gradient; the device-specific "NegBackward" kernel performs that
// negation. Expects exactly one upstream gradient tensor and returns exactly
// one input gradient tensor.
std::vector<std::shared_ptr<Tensor>> Neg::Backward(const std::vector<std::shared_ptr<Tensor>> &grad_outputs) {
    CHECK_EQ(grad_outputs.size(), 1);
    const auto &grad_output = grad_outputs[0];
    // Dispatch to the kernel registered for this tensor's device type.
    auto device = grad_output->GetDevice().Type();
    auto kernel = Dispatcher::Instance().GetKernel({device, "NegBackward"});
    return {kernel.Call<std::shared_ptr<Tensor>>(grad_output)};
}

std::vector<std::shared_ptr<Tensor>> Reciprocal::Forward(const std::vector<std::shared_ptr<Tensor>> &input_tensors) {
CHECK_EQ(input_tensors.size(), 1);
const auto &input = input_tensors[0];

auto device = input->GetDevice().Type();
auto kernel = Dispatcher::Instance().GetKernel({device, "ReciprocalForward"});
return {kernel.Call<std::shared_ptr<Tensor>>(input)};
Expand Down
16 changes: 16 additions & 0 deletions infini_train/src/kernels/cpu/accumulate_grad.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,22 @@ void AdamAccumulateGrad(const std::shared_ptr<Tensor> &grad, const std::shared_p
// TODO:实现Adam优化器的梯度累积和参数更新
// REF:
// =================================== 作业 ===================================
const auto n = grad->NumElements();
const float *g_ptr = static_cast<const float *>(grad->DataPtr());
float *m_ptr = static_cast<float*>(m->DataPtr());
float *v_ptr = static_cast<float*>(v->DataPtr());
float *p_ptr = static_cast<float *>(param->DataPtr());

const float bias_correction1 = 1.0f - std::pow(beta1,t);
const float bias_correction2 = 1.0f - std::pow(beta2,t);

for(size_t i = 0;i<n;i++){
m_ptr[i] = beta1 * m_ptr[i] + (1-beta1) * g_ptr[i];
v_ptr[i] = beta2 * v_ptr[i] + (1-beta2) * g_ptr[i] * g_ptr[i];
float m_hat = m_ptr[i] / bias_correction1;
float v_hat = v_ptr[i] / bias_correction2;
p_ptr[i] -= learning_rate * m_hat / (std::sqrt(v_hat) + eps);
}
}

} // namespace infini_train::kernels::cpu
Expand Down
Loading