Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 8af4728

Browse files
committed
Merge remote-tracking branch 'upstream/master' into bigquery-dbt-impersonation
2 parents ca51e04 + 435e7c0 commit 8af4728

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

95 files changed

+5218
-5680
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@ jobs:
1818
matrix:
1919
os: [ubuntu-latest]
2020
python-version:
21-
- "3.7"
2221
- "3.8"
2322
- "3.9"
2423
- "3.10"
24+
- "3.11"
2525

2626
name: Check Python ${{ matrix.python-version }} on ${{ matrix.os }}
2727
runs-on: ${{ matrix.os }}

.github/workflows/ci_full.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ jobs:
2121
matrix:
2222
os: [ubuntu-latest]
2323
python-version:
24-
- "3.10"
24+
- "3.11"
2525

2626
name: Check Python ${{ matrix.python-version }} on ${{ matrix.os }}
2727
runs-on: ${{ matrix.os }}

data_diff/__init__.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
from typing import Sequence, Tuple, Iterator, Optional, Union
22

3-
from data_diff.sqeleton.abcs import DbTime, DbPath
4-
5-
from .tracking import disable_tracking
6-
from .databases import connect
7-
from .diff_tables import Algorithm
8-
from .hashdiff_tables import HashDiffer, DEFAULT_BISECTION_THRESHOLD, DEFAULT_BISECTION_FACTOR
9-
from .joindiff_tables import JoinDiffer, TABLE_WRITE_LIMIT
10-
from .table_segment import TableSegment
11-
from .utils import eval_name_template, Vector
3+
from data_diff.abcs.database_types import DbTime, DbPath
4+
from data_diff.databases import Database
5+
from data_diff.tracking import disable_tracking
6+
from data_diff.databases._connect import connect
7+
from data_diff.diff_tables import Algorithm
8+
from data_diff.hashdiff_tables import HashDiffer, DEFAULT_BISECTION_THRESHOLD, DEFAULT_BISECTION_FACTOR
9+
from data_diff.joindiff_tables import JoinDiffer, TABLE_WRITE_LIMIT
10+
from data_diff.table_segment import TableSegment
11+
from data_diff.utils import eval_name_template, Vector
1212

1313

1414
def connect_to_table(
@@ -32,10 +32,10 @@ def connect_to_table(
3232
if isinstance(key_columns, str):
3333
key_columns = (key_columns,)
3434

35-
db = connect(db_info, thread_count=thread_count)
35+
db: Database = connect(db_info, thread_count=thread_count)
3636

3737
if isinstance(table_name, str):
38-
table_name = db.parse_table_name(table_name)
38+
table_name = db.dialect.parse_table_name(table_name)
3939

4040
return TableSegment(db, table_name, key_columns, **kwargs)
4141

@@ -162,7 +162,8 @@ def diff_tables(
162162
)
163163
elif algorithm == Algorithm.JOINDIFF:
164164
if isinstance(materialize_to_table, str):
165-
materialize_to_table = table1.database.parse_table_name(eval_name_template(materialize_to_table))
165+
table_name = eval_name_template(materialize_to_table)
166+
materialize_to_table = table1.database.dialect.parse_table_name(table_name)
166167
differ = JoinDiffer(
167168
threaded=threaded,
168169
max_threadpool_size=max_threadpool_size,

data_diff/__main__.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,26 +6,27 @@
66
import json
77
import logging
88
from itertools import islice
9-
from typing import Dict, Optional
9+
from typing import Dict, Optional, Tuple
1010

1111
import rich
1212
from rich.logging import RichHandler
1313
import click
1414

15-
from data_diff.sqeleton.schema import create_schema
16-
from data_diff.sqeleton.queries.api import current_timestamp
15+
from data_diff import Database
16+
from data_diff.schema import create_schema
17+
from data_diff.queries.api import current_timestamp
1718

18-
from .dbt import dbt_diff
19-
from .utils import eval_name_template, remove_password_from_url, safezip, match_like, LogStatusHandler
20-
from .diff_tables import Algorithm
21-
from .hashdiff_tables import HashDiffer, DEFAULT_BISECTION_THRESHOLD, DEFAULT_BISECTION_FACTOR
22-
from .joindiff_tables import TABLE_WRITE_LIMIT, JoinDiffer
23-
from .table_segment import TableSegment
24-
from .databases import connect
25-
from .parse_time import parse_time_before, UNITS_STR, ParseError
26-
from .config import apply_config_from_file
27-
from .tracking import disable_tracking, set_entrypoint_name
28-
from .version import __version__
19+
from data_diff.dbt import dbt_diff
20+
from data_diff.utils import eval_name_template, remove_password_from_url, safezip, match_like, LogStatusHandler
21+
from data_diff.diff_tables import Algorithm
22+
from data_diff.hashdiff_tables import HashDiffer, DEFAULT_BISECTION_THRESHOLD, DEFAULT_BISECTION_FACTOR
23+
from data_diff.joindiff_tables import TABLE_WRITE_LIMIT, JoinDiffer
24+
from data_diff.table_segment import TableSegment
25+
from data_diff.databases._connect import connect
26+
from data_diff.parse_time import parse_time_before, UNITS_STR, ParseError
27+
from data_diff.config import apply_config_from_file
28+
from data_diff.tracking import disable_tracking, set_entrypoint_name
29+
from data_diff.version import __version__
2930

3031

3132
COLOR_SCHEME = {
@@ -326,8 +327,7 @@ def main(conf, run, **kw):
326327
)
327328
except Exception as e:
328329
logging.error(e)
329-
if kw["debug"]:
330-
raise
330+
raise
331331

332332

333333
def _data_diff(
@@ -425,7 +425,7 @@ def _data_diff(
425425
logging.error(f"Error while parsing age expression: {e}")
426426
return
427427

428-
dbs = db1, db2
428+
dbs: Tuple[Database, Database] = db1, db2
429429

430430
if interactive:
431431
for db in dbs:
@@ -444,7 +444,7 @@ def _data_diff(
444444
materialize_all_rows=materialize_all_rows,
445445
table_write_limit=table_write_limit,
446446
materialize_to_table=materialize_to_table
447-
and db1.parse_table_name(eval_name_template(materialize_to_table)),
447+
and db1.dialect.parse_table_name(eval_name_template(materialize_to_table)),
448448
)
449449
else:
450450
assert algorithm == Algorithm.HASHDIFF
@@ -456,11 +456,11 @@ def _data_diff(
456456
)
457457

458458
table_names = table1, table2
459-
table_paths = [db.parse_table_name(t) for db, t in safezip(dbs, table_names)]
459+
table_paths = [db.dialect.parse_table_name(t) for db, t in safezip(dbs, table_names)]
460460

461461
schemas = list(differ._thread_map(_get_schema, safezip(dbs, table_paths)))
462462
schema1, schema2 = schemas = [
463-
create_schema(db, table_path, schema, case_sensitive)
463+
create_schema(db.name, table_path, schema, case_sensitive)
464464
for db, table_path, schema in safezip(dbs, table_paths, schemas)
465465
]
466466

data_diff/abcs/compiler.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
from abc import ABC
2+
3+
import attrs
4+
5+
6+
@attrs.define(frozen=False)
7+
class AbstractCompiler(ABC):
8+
pass
9+
10+
11+
@attrs.define(frozen=False, eq=False)
12+
class Compilable(ABC):
13+
pass

data_diff/abcs/database_types.py

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
import decimal
2+
from abc import ABC, abstractmethod
3+
from typing import Tuple, Union
4+
from datetime import datetime
5+
6+
import attrs
7+
8+
from data_diff.utils import ArithAlphanumeric, ArithUUID, Unknown
9+
10+
11+
DbPath = Tuple[str, ...]
12+
DbKey = Union[int, str, bytes, ArithUUID, ArithAlphanumeric]
13+
DbTime = datetime
14+
15+
16+
@attrs.define(frozen=True)
17+
class ColType:
18+
@property
19+
def supported(self) -> bool:
20+
return True
21+
22+
23+
@attrs.define(frozen=True)
24+
class PrecisionType(ColType):
25+
precision: int
26+
rounds: Union[bool, Unknown] = Unknown
27+
28+
29+
@attrs.define(frozen=True)
30+
class Boolean(ColType):
31+
precision = 0
32+
33+
34+
@attrs.define(frozen=True)
35+
class TemporalType(PrecisionType):
36+
pass
37+
38+
39+
@attrs.define(frozen=True)
40+
class Timestamp(TemporalType):
41+
pass
42+
43+
44+
@attrs.define(frozen=True)
45+
class TimestampTZ(TemporalType):
46+
pass
47+
48+
49+
@attrs.define(frozen=True)
50+
class Datetime(TemporalType):
51+
pass
52+
53+
54+
@attrs.define(frozen=True)
55+
class Date(TemporalType):
56+
pass
57+
58+
59+
@attrs.define(frozen=True)
60+
class NumericType(ColType):
61+
# 'precision' signifies how many fractional digits (after the dot) we want to compare
62+
precision: int
63+
64+
65+
@attrs.define(frozen=True)
66+
class FractionalType(NumericType):
67+
pass
68+
69+
70+
@attrs.define(frozen=True)
71+
class Float(FractionalType):
72+
python_type = float
73+
74+
75+
@attrs.define(frozen=True)
76+
class IKey(ABC):
77+
"Interface for ColType, for using a column as a key in table."
78+
79+
@property
80+
@abstractmethod
81+
def python_type(self) -> type:
82+
"Return the equivalent Python type of the key"
83+
84+
def make_value(self, value):
85+
return self.python_type(value)
86+
87+
88+
@attrs.define(frozen=True)
89+
class Decimal(FractionalType, IKey): # Snowflake may use Decimal as a key
90+
@property
91+
def python_type(self) -> type:
92+
if self.precision == 0:
93+
return int
94+
return decimal.Decimal
95+
96+
97+
@attrs.define(frozen=True)
98+
class StringType(ColType):
99+
python_type = str
100+
101+
102+
@attrs.define(frozen=True)
103+
class ColType_UUID(ColType, IKey):
104+
python_type = ArithUUID
105+
106+
107+
@attrs.define(frozen=True)
108+
class ColType_Alphanum(ColType, IKey):
109+
python_type = ArithAlphanumeric
110+
111+
112+
@attrs.define(frozen=True)
113+
class Native_UUID(ColType_UUID):
114+
pass
115+
116+
117+
@attrs.define(frozen=True)
118+
class String_UUID(ColType_UUID, StringType):
119+
pass
120+
121+
122+
@attrs.define(frozen=True)
123+
class String_Alphanum(ColType_Alphanum, StringType):
124+
@staticmethod
125+
def test_value(value: str) -> bool:
126+
try:
127+
ArithAlphanumeric(value)
128+
return True
129+
except ValueError:
130+
return False
131+
132+
def make_value(self, value):
133+
return self.python_type(value)
134+
135+
136+
@attrs.define(frozen=True)
137+
class String_VaryingAlphanum(String_Alphanum):
138+
pass
139+
140+
141+
@attrs.define(frozen=True)
142+
class String_FixedAlphanum(String_Alphanum):
143+
length: int
144+
145+
def make_value(self, value):
146+
if len(value) != self.length:
147+
raise ValueError(f"Expected alphanumeric value of length {self.length}, but got '{value}'.")
148+
return self.python_type(value, max_len=self.length)
149+
150+
151+
@attrs.define(frozen=True)
152+
class Text(StringType):
153+
@property
154+
def supported(self) -> bool:
155+
return False
156+
157+
158+
# In majority of DBMSes, it is called JSON/JSONB. Only in Snowflake, it is OBJECT.
159+
@attrs.define(frozen=True)
160+
class JSON(ColType):
161+
pass
162+
163+
164+
@attrs.define(frozen=True)
165+
class Array(ColType):
166+
item_type: ColType
167+
168+
169+
# Unlike JSON, structs are not free-form and have a very specific set of fields and their types.
170+
# We do not parse & use those fields now, but we can do this later.
171+
# For example, in BigQuery:
172+
# - https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type
173+
# - https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#struct_literals
174+
@attrs.define(frozen=True)
175+
class Struct(ColType):
176+
pass
177+
178+
179+
@attrs.define(frozen=True)
180+
class Integer(NumericType, IKey):
181+
precision: int = 0
182+
python_type: type = int
183+
184+
def __attrs_post_init__(self):
185+
assert self.precision == 0
186+
187+
188+
@attrs.define(frozen=True)
189+
class UnknownColType(ColType):
190+
text: str
191+
192+
@property
193+
def supported(self) -> bool:
194+
return False

0 commit comments

Comments
 (0)