Skip to content

Commit f195d52

Browse files
adams381 and andykaylor authored
[CIR] Support wide string literals in CIR codegen (#171541)
This PR migrates support for wide string literals from the incubator to upstream.

## Changes

- Implement wide string literal support in `getConstantArrayFromStringLiteral`
- Handle wchar_t, char16_t, and char32_t string literals
- Collect code units and create constant arrays with IntAttr elements
- Use ZeroAttr for null-filled strings

## Testing

- Copied `wide-string.cpp` test file from incubator
- Expanded test to include wchar_t test cases (incubator only had char16_t and char32_t)
- All tests pass

---------

Co-authored-by: Andy Kaylor <akaylor@nvidia.com>
1 parent 0d53746 commit f195d52

File tree

2 files changed

+100
-3
lines changed

2 files changed

+100
-3
lines changed

clang/lib/CIR/CodeGen/CIRGenModule.cpp

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@
2121
#include "clang/AST/GlobalDecl.h"
2222
#include "clang/AST/RecordLayout.h"
2323
#include "clang/Basic/SourceManager.h"
24+
#include "clang/CIR/Dialect/IR/CIRAttrs.h"
2425
#include "clang/CIR/Dialect/IR/CIRDialect.h"
26+
#include "clang/CIR/Dialect/IR/CIRTypes.h"
2527
#include "clang/CIR/Interfaces/CIROpInterfaces.h"
2628
#include "clang/CIR/MissingFeatures.h"
2729

@@ -31,6 +33,8 @@
3133
#include "mlir/IR/MLIRContext.h"
3234
#include "mlir/IR/Verifier.h"
3335

36+
#include <algorithm>
37+
3438
using namespace clang;
3539
using namespace clang::CIRGen;
3640

@@ -962,9 +966,39 @@ CIRGenModule::getConstantArrayFromStringLiteral(const StringLiteral *e) {
962966
return builder.getString(str, eltTy, finalSize);
963967
}
964968

965-
errorNYI(e->getSourceRange(),
966-
"getConstantArrayFromStringLiteral: wide characters");
967-
return mlir::Attribute();
969+
auto arrayTy = mlir::cast<cir::ArrayType>(convertType(e->getType()));
970+
971+
auto arrayEltTy = mlir::cast<cir::IntType>(arrayTy.getElementType());
972+
973+
uint64_t arraySize = arrayTy.getSize();
974+
unsigned literalSize = e->getLength();
975+
assert(arraySize == literalSize + 1 &&
976+
"wide string literal array size must be literal length plus null "
977+
"terminator");
978+
979+
// Check if the string is all null bytes before building the vector.
980+
// In most non-zero cases, this will break out on the first element.
981+
bool isAllZero = true;
982+
for (unsigned i = 0; i < literalSize; ++i) {
983+
if (e->getCodeUnit(i) != 0) {
984+
isAllZero = false;
985+
break;
986+
}
987+
}
988+
989+
if (isAllZero)
990+
return cir::ZeroAttr::get(arrayTy);
991+
992+
// Otherwise emit a constant array holding the characters.
993+
SmallVector<mlir::Attribute> elements;
994+
elements.reserve(arraySize);
995+
for (unsigned i = 0; i < literalSize; ++i)
996+
elements.push_back(cir::IntAttr::get(arrayEltTy, e->getCodeUnit(i)));
997+
// Add null terminator
998+
elements.push_back(cir::IntAttr::get(arrayEltTy, 0));
999+
1000+
auto elementsAttr = mlir::ArrayAttr::get(&getMLIRContext(), elements);
1001+
return builder.getConstArray(elementsAttr, arrayTy);
9681002
}
9691003

9701004
bool CIRGenModule::supportsCOMDAT() const {
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
2+
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
3+
// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
4+
// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s
5+
// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
6+
// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
7+
8+
// Test with built-in char16_t type
9+
const char16_t *test_utf16() {
10+
return u"你好世界";
11+
}
12+
13+
// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<20320> : !u16i, #cir.int<22909> : !u16i, #cir.int<19990> : !u16i, #cir.int<30028> : !u16i, #cir.int<0> : !u16i]> : !cir.array<!u16i x 5>
14+
// LLVM: @{{.+}} = private constant [5 x i16] [i16 20320, i16 22909, i16 19990, i16 30028, i16 0]
15+
// OGCG: @{{.+}} = private unnamed_addr constant [5 x i16] [i16 20320, i16 22909, i16 19990, i16 30028, i16 0]
16+
17+
const char32_t *test_utf32() {
18+
return U"你好世界";
19+
}
20+
21+
// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<20320> : !u32i, #cir.int<22909> : !u32i, #cir.int<19990> : !u32i, #cir.int<30028> : !u32i, #cir.int<0> : !u32i]> : !cir.array<!u32i x 5>
22+
// LLVM: @{{.+}} = private constant [5 x i32] [i32 20320, i32 22909, i32 19990, i32 30028, i32 0]
23+
// OGCG: @{{.+}} = private unnamed_addr constant [5 x i32] [i32 20320, i32 22909, i32 19990, i32 30028, i32 0]
24+
25+
const char16_t *test_zero16() {
26+
return u"\0\0\0\0";
27+
}
28+
29+
// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!u16i x 5>
30+
// LLVM: @{{.+}} = private constant [5 x i16] zeroinitializer
31+
// OGCG: @{{.+}} = private unnamed_addr constant [5 x i16] zeroinitializer
32+
33+
const char32_t *test_zero32() {
34+
return U"\0\0\0\0";
35+
}
36+
37+
// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!u32i x 5>
38+
// LLVM: @{{.+}} = private constant [5 x i32] zeroinitializer
39+
// OGCG: @{{.+}} = private unnamed_addr constant [5 x i32] zeroinitializer
40+
41+
const wchar_t *test_wchar() {
42+
return L"1234";
43+
}
44+
45+
// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<49> : !s32i, #cir.int<50> : !s32i, #cir.int<51> : !s32i, #cir.int<52> : !s32i, #cir.int<0> : !s32i]> : !cir.array<!s32i x 5>
46+
// LLVM: @{{.+}} = private constant [5 x i32] [i32 49, i32 50, i32 51, i32 52, i32 0]
47+
// OGCG: @{{.+}} = private unnamed_addr constant [5 x i32] [i32 49, i32 50, i32 51, i32 52, i32 0]
48+
49+
const wchar_t *test_wchar_zero() {
50+
return L"";
51+
}
52+
53+
// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!s32i x 1>
54+
// LLVM: @{{.+}} = private constant [1 x i32] zeroinitializer
55+
// OGCG: @{{.+}} = private unnamed_addr constant [1 x i32] zeroinitializer
56+
57+
const char16_t *test_char16_typedef() {
58+
return u"test";
59+
}
60+
61+
// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<116> : !u16i, #cir.int<101> : !u16i, #cir.int<115> : !u16i, #cir.int<116> : !u16i, #cir.int<0> : !u16i]> : !cir.array<!u16i x 5>
62+
// LLVM: @{{.+}} = private constant [5 x i16] [i16 116, i16 101, i16 115, i16 116, i16 0]
63+
// OGCG: @{{.+}} = private unnamed_addr constant [5 x i16] [i16 116, i16 101, i16 115, i16 116, i16 0]

0 commit comments

Comments
 (0)