Skip to content

Commit f6c4c27

Browse files
committed
chore: add tests, remove dead code
1 parent 6da5908 commit f6c4c27

2 files changed

Lines changed: 113 additions & 28 deletions

File tree

packages/google-cloud-bigquery-storage/google/cloud/bigquery_storage_v1/schema.py

Lines changed: 8 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -56,42 +56,22 @@
5656
}
5757

5858

59-
def _is_proto_compatible(field_name: str) -> bool:
60-
"""Check if a field name is compatible with Protocol Buffer naming conventions.
61-
62-
Args:
63-
field_name: The field name to check.
64-
65-
Returns:
66-
True if the field name is proto-compatible, False otherwise.
67-
"""
68-
if not field_name:
69-
return False
70-
71-
# First character must be a letter or underscore
72-
ch = field_name[0]
73-
if not ((ch >= 'a' and ch <= 'z') or (ch >= 'A' and ch <= 'Z') or ch == '_'):
74-
return False
75-
76-
# Remaining characters must be letters, digits, or underscores
77-
for ch in field_name[1:]:
78-
if not ((ch >= 'a' and ch <= 'z') or (ch >= 'A' and ch <= 'Z') or
79-
ch == '_' or (ch >= '0' and ch <= '9')):
80-
return False
81-
82-
return True
83-
84-
8559
def _sanitize_field_name(field_name: str) -> str:
8660
"""Sanitize a field name to make it proto-compatible.
8761
8862
Args:
8963
field_name: The original field name.
9064
9165
Returns:
92-
The sanitized field name (lowercased).
66+
The sanitized field name.
9367
"""
94-
return field_name.lower()
68+
# Replace invalid characters with underscores.
69+
sanitized = re.sub(r'[^a-zA-Z0-9_]', '_', field_name)
70+
# If the first character is a digit, prepend an underscore.
71+
if sanitized and sanitized[0].isdigit():
72+
sanitized = '_' + sanitized
73+
# As a convention, field names are lowercased.
74+
return sanitized.lower()
9575

9676

9777
def _get_field_label(mode: types.TableFieldSchema.Mode) -> int:

packages/google-cloud-bigquery-storage/tests/unit/test_schema.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,111 @@ def test_scope_based_naming_avoids_collisions(self):
487487
assert len(proto_descriptor.nested_type) == 1
488488
assert proto_descriptor.nested_type[0].name == "root__my_record"
489489

490+
def test_field_name_sanitization(self):
    """Check that top-level field names come out proto-compatible."""
    # (original field name, expected descriptor field name), in schema order.
    cases = [
        # Hyphens replaced with underscores
        ("field-with-hyphens", "field_with_hyphens"),
        # Spaces replaced with underscores
        ("field with spaces", "field_with_spaces"),
        # Field starting with digit gets prepended underscore
        ("123field", "_123field"),
        # Special characters replaced with underscores
        ("field@special#chars", "field_special_chars"),
        # Valid field names are lowercased
        ("ValidField", "validfield"),
    ]
    table_schema = types.TableSchema(
        fields=[
            types.TableFieldSchema(
                name=raw_name,
                type_=types.TableFieldSchema.Type.STRING,
            )
            for raw_name, _ in cases
        ]
    )

    proto_descriptor = schema.table_schema_to_proto_descriptor(table_schema)

    # Descriptor fields are compared positionally against the case table.
    for position, (_, expected_name) in enumerate(cases):
        assert proto_descriptor.field[position].name == expected_name
533+
534+
def test_field_name_sanitization_in_nested_structs(self):
    """Check that sanitization also applies to a STRUCT and its sub-fields."""
    # Build the schema bottom-up: sub-fields first, then the enclosing struct.
    sub_fields = [
        types.TableFieldSchema(
            name="inner-field",
            type_=types.TableFieldSchema.Type.STRING,
        ),
        types.TableFieldSchema(
            name="123inner",
            type_=types.TableFieldSchema.Type.INT64,
        ),
    ]
    struct_field = types.TableFieldSchema(
        name="outer-struct",
        type_=types.TableFieldSchema.Type.STRUCT,
        fields=sub_fields,
    )
    table_schema = types.TableSchema(fields=[struct_field])

    descriptor = schema.table_schema_to_proto_descriptor(table_schema)

    # The struct's own field name and its type reference are both sanitized.
    top_level_field = descriptor.field[0]
    assert top_level_field.name == "outer_struct"
    assert top_level_field.type_name == "root__outer_struct"

    # The generated nested message carries the sanitized, scope-prefixed name.
    struct_message = descriptor.nested_type[0]
    assert struct_message.name == "root__outer_struct"

    # Sub-fields inside the nested message are sanitized too.
    assert struct_message.field[0].name == "inner_field"
    assert struct_message.field[1].name == "_123inner"
569+
570+
def test_field_name_sanitization_in_range_fields(self):
    """Check that sanitization applies to a RANGE field and its nested type."""
    # A single RANGE<DATE> column whose name contains a hyphen.
    element_type = types.TableFieldSchema.FieldElementType(
        type_=types.TableFieldSchema.Type.DATE
    )
    range_column = types.TableFieldSchema(
        name="date-range",
        type_=types.TableFieldSchema.Type.RANGE,
        range_element_type=element_type,
    )
    table_schema = types.TableSchema(fields=[range_column])

    descriptor = schema.table_schema_to_proto_descriptor(table_schema)

    # The field name and the type it references are both sanitized.
    top_level_field = descriptor.field[0]
    assert top_level_field.name == "date_range"
    assert top_level_field.type_name == "root__date_range"

    # The generated nested message uses the same sanitized name.
    range_message = descriptor.nested_type[0]
    assert range_message.name == "root__date_range"
594+
490595

491596
if __name__ == "__main__":
    # pytest.main returns the run's exit code; propagate it so that running
    # this file directly exits non-zero when a test fails.
    raise SystemExit(pytest.main([__file__]))

0 commit comments

Comments
 (0)