Skip to content

Commit d1c4453

Browse files
committed
feat!: support unicode characters
1 parent a255e17 commit d1c4453

File tree

5 files changed

+58
-8
lines changed

5 files changed

+58
-8
lines changed

.circleci/continue_config.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -300,8 +300,8 @@ workflows:
300300
name: cloud_engine_<< matrix.engine >>
301301
context:
302302
- sqlmesh_cloud_database_integration
303-
requires:
304-
- engine_tests_docker
303+
# requires:
304+
# - engine_tests_docker
305305
matrix:
306306
parameters:
307307
engine:
@@ -313,10 +313,10 @@ workflows:
313313
- athena
314314
- fabric
315315
- gcp-postgres
316-
filters:
317-
branches:
318-
only:
319-
- main
316+
# filters:
317+
# branches:
318+
# only:
319+
# - main
320320
- ui_style
321321
- ui_test
322322
- vscode_test

sqlmesh/utils/__init__.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import types
1414
import typing as t
1515
import uuid
16+
import unicodedata
1617
from dataclasses import dataclass
1718
from collections import defaultdict
1819
from contextlib import contextmanager
@@ -289,11 +290,13 @@ def sqlglot_dialects() -> str:
289290
return "'" + "', '".join(Dialects.__members__.values()) + "'"
290291

291292

292-
NON_ALNUM = re.compile(r"[^a-zA-Z0-9_]")
293+
NON_WORD = re.compile(r"\W", flags=re.UNICODE)
293294

294295

295296
def sanitize_name(name: str) -> str:
296-
return NON_ALNUM.sub("_", name)
297+
s = unicodedata.normalize("NFC", name)
298+
s = NON_WORD.sub("_", s)
299+
return s
297300

298301

299302
def groupby(

tests/core/engine_adapter/integration/test_integration.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3990,3 +3990,27 @@ def _set_config(gateway: str, config: Config) -> None:
39903990
was_evaluated=True,
39913991
day_delta=4,
39923992
)
3993+
3994+
3995+
def test_unicode_characters(ctx: TestContext, tmp_path: Path):
3996+
model_name = "客户数据"
3997+
table = ctx.table(model_name).sql(dialect=ctx.dialect)
3998+
(tmp_path / "models").mkdir(exist_ok=True)
3999+
4000+
model_def = f"""
4001+
MODEL (
4002+
name {table},
4003+
kind FULL,
4004+
dialect '{ctx.dialect}'
4005+
);
4006+
SELECT 1 as id
4007+
"""
4008+
4009+
(tmp_path / "models" / "客户数据.sql").write_text(model_def)
4010+
4011+
context = ctx.create_context(path=tmp_path)
4012+
context.plan(auto_apply=True, no_prompts=True)
4013+
4014+
results = ctx.get_metadata_results()
4015+
assert len(results.views) == 1
4016+
assert results.views[0].lower() == model_name

tests/utils/__init__.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import pytest
2+
3+
from sqlmesh.utils import sanitize_name
4+
5+
6+
@pytest.mark.parametrize(
7+
"raw,expected",
8+
[
9+
("simple", "simple"),
10+
("snake_case", "snake_case"),
11+
("客户数据", "客户数据"), # pure Chinese kept
12+
("客户-数据 v2", "客户_数据_v2"), # dash/space -> underscore
13+
("中文,逗号", "中文_逗号"), # full-width comma -> underscore
14+
("a/b", "a_b"), # slash -> underscore
15+
("spaces\tand\nnewlines", "spaces_and_newlines"),
16+
("data📦2025", "data_2025"),
17+
("MiXeD123_名字", "MiXeD123_名字"),
18+
("", ""),
19+
],
20+
)
21+
def test_sanitize_known_cases(raw, expected):
22+
assert sanitize_name(raw) == expected

tests/utils/test_cache.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def test_file_cache(tmp_path: Path, mocker: MockerFixture):
3939
loader.assert_called_once()
4040

4141
assert "___test_model_" in cache._cache_entry_path('"test_model"').name
42+
assert "客户数据" in cache._cache_entry_path("客户数据").name
4243

4344

4445
def test_optimized_query_cache(tmp_path: Path, mocker: MockerFixture):

0 commit comments

Comments
 (0)