Skip to content

Commit 002703c

Browse files
authored
Merge pull request #80 from OpenDataAlex/process_tracker_python-71
Process tracker python 71
2 parents 5066347 + 0be167c commit 002703c

File tree

3 files changed

+179
-34
lines changed

3 files changed

+179
-34
lines changed

process_tracker/location_tracker.py

Lines changed: 57 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
# Location
22
# For processes dealing with Extract Locations.
33
import logging
4-
from os.path import basename, normpath
4+
from pathlib import PurePath
5+
from os.path import basename, dirname, join, isdir, normpath
56

67
from process_tracker.utilities.aws_utilities import AwsUtilities
78
from process_tracker.utilities.logging import console
@@ -24,15 +25,15 @@ def __init__(self, location_path, location_name=None, data_store=None):
2425
raise Exception("Data store is not set.")
2526
else:
2627
self.data_store = data_store
28+
self.session = self.data_store.session
2729

2830
self.location_path = location_path.lower()
31+
self.location_name = location_name
32+
self.location_bucket_name = self.determine_location_bucket_name()
2933

3034
if location_name is None:
3135
self.logger.info("Location name not provided. Generating.")
3236
self.location_name = self.derive_location_name()
33-
else:
34-
self.logger.info("Using provided location name: %s" % location_name)
35-
self.location_name = location_name
3637

3738
self.location_type = self.derive_location_type()
3839

@@ -43,10 +44,9 @@ def __init__(self, location_path, location_name=None, data_store=None):
4344
location_name=self.location_name,
4445
location_path=location_path,
4546
location_type_id=self.location_type.location_type_id,
47+
location_bucket_name=self.location_bucket_name,
4648
)
4749

48-
self.location_bucket_name = self.determine_location_bucket_name()
49-
5050
def derive_location_name(self):
5151
"""
5252
If location name is not provided, attempt to derive name from path.
@@ -57,18 +57,50 @@ def derive_location_name(self):
5757

5858
location_prefix = None
5959

60-
location_name = ""
60+
current_name = (
61+
self.session.query(Location)
62+
.filter(Location.location_path == self.location_path)
63+
.first()
64+
)
6165

62-
if "s3" in self.location_path:
63-
# If the path is an S3 Bucket, prefix to name.
64-
self.logger.info("Location appears to be s3 related. Setting prefix.")
65-
location_prefix = "s3"
66+
if current_name is not None:
67+
location_name = current_name.location_name
68+
else:
69+
location_name = ""
70+
71+
if "s3" in self.location_path:
72+
# If the path is an S3 Bucket, prefix to name.
73+
self.logger.info("Location appears to be s3 related. Setting prefix.")
74+
location_prefix = "s3 %s" % self.location_bucket_name
75+
else:
76+
location_prefix = "local"
77+
78+
if location_prefix is not None:
79+
self.logger.info(
80+
"Location prefix provided. Appending to location name."
81+
)
82+
location_name = location_prefix + " - "
6683

67-
if location_prefix is not None:
68-
self.logger.info("Location prefix provided. Appending to location name.")
69-
location_name = location_prefix + " - "
84+
if "." in str(PurePath(self.location_path).name):
85+
location_name += PurePath(self.location_path).parent.name
86+
else:
87+
location_name += PurePath(self.location_path).name
7088

71-
location_name += basename(normpath(self.location_path))
89+
name_count = (
90+
self.session.query(Location)
91+
.filter(Location.location_name.like(location_name + "%"))
92+
.count()
93+
)
94+
95+
if name_count >= 1:
96+
self.logger.info(
97+
"The location name already exists. There are %s instances."
98+
% name_count
99+
)
100+
101+
location_name = "%s - %s" % (location_name, name_count)
102+
103+
self.logger.info("Location name is now %s" % location_name)
72104

73105
return location_name
74106

@@ -106,29 +138,26 @@ def register_file_count(self, file_count):
106138
"""
107139

108140
self.location.location_file_count = file_count
109-
self.data_store.session.commit()
141+
self.session.commit()
110142

111143
def determine_location_bucket_name(self):
112144
"""
113145
If location is of type 's3', then find which bucket the location belongs to.
114146
:return:
115147
"""
116148
self.logger.info("Determining if location is s3.")
117-
if "s3" in self.location_path or "s3" in self.location_name:
149+
if "s3" in self.location_path or (
150+
self.location_name is not None and "s3" in self.location_name
151+
):
118152

119153
self.logger.info("Location is in s3.")
120-
if self.location.location_bucket_name is None:
121-
self.logger.info("Location bucket was not set.")
122-
123-
self.location.location_bucket_name = AwsUtilities().determine_bucket_name(
124-
path=self.location.location_path
125-
)
126-
127-
self.data_store.session.commit()
154+
location_bucket_name = AwsUtilities().determine_bucket_name(
155+
path=self.location_path
156+
)
128157

129158
else:
130-
self.location.location_bucket_name = None
159+
location_bucket_name = None
131160

132-
self.data_store.session.commit()
161+
self.session.commit()
133162

134-
return self.location.location_bucket_name
163+
return location_bucket_name

tests/test_extract_tracker.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ def test_derive_location_name_local_path(self):
246246
)
247247

248248
given_result = location[0].location_name
249-
expected_result = "extract_dir2"
249+
expected_result = "local - extract_dir2"
250250

251251
self.assertEqual(expected_result, given_result)
252252

@@ -266,7 +266,7 @@ def test_derive_location_name_s3(self):
266266
)
267267

268268
given_result = location[0].location_name
269-
expected_result = "s3 - extract_dir"
269+
expected_result = "s3 test-test - extract_dir"
270270

271271
self.assertEqual(expected_result, given_result)
272272

tests/test_location_tracker.py

Lines changed: 120 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import unittest
22

3+
from process_tracker.models.extract import Location
4+
35
from process_tracker.utilities.data_store import DataStore
46
from process_tracker.location_tracker import LocationTracker
57

@@ -8,16 +10,55 @@ class TestLocationTracker(unittest.TestCase):
810
@classmethod
911
def setUpClass(cls):
1012
cls.data_store = DataStore()
13+
cls.session = cls.data_store.session
14+
15+
@classmethod
16+
def tearDownClass(cls):
17+
cls.session.close()
18+
19+
def tearDown(self):
20+
self.session.query(Location).delete()
21+
self.session.commit()
22+
23+
def test_derive_location_name_no_trailing_slash_local(self):
24+
"""
25+
Testing that if no location name is provided, and it's not a location already, the last directory is set as the
26+
location name even if a trailing slash is not provided.
27+
:return:
28+
"""
29+
test_path = "/tmp/testing/test_dir"
30+
31+
expected_result = "local - test_dir"
32+
given_result = LocationTracker(
33+
location_path=test_path, data_store=self.data_store
34+
).location_name
35+
36+
self.assertEqual(expected_result, given_result)
37+
38+
def test_derive_location_name_no_trailing_slash_s3(self):
39+
"""
40+
Testing that if no location name is provided, and it's not a location already, the last directory is set as the
41+
location name even if a trailing slash is not provided.
42+
:return:
43+
"""
44+
test_path = "s3://tmp/testing/test_dir"
45+
46+
expected_result = "s3 tmp - test_dir"
47+
given_result = LocationTracker(
48+
location_path=test_path, data_store=self.data_store
49+
).location_name
50+
51+
self.assertEqual(expected_result, given_result)
1152

1253
def test_derive_location_name_none(self):
1354
"""
1455
Testing that if no location name is provided, and it's not a location path, the last directory is set as the
1556
location name.
1657
:return:
1758
"""
18-
test_path = "/tmp/testing/test_dir"
59+
test_path = "/tmp/testing/test_dir/"
1960

20-
expected_result = "test_dir"
61+
expected_result = "local - test_dir"
2162
given_result = LocationTracker(
2263
location_path=test_path, data_store=self.data_store
2364
).location_name
@@ -29,9 +70,9 @@ def test_derive_location_name_s3(self):
2970
Testing that if no location name is provided, and it's an s3 location path, the s3 prefix is added.
3071
:return:
3172
"""
32-
test_path = "s3://tmp/testing/test_dir"
73+
test_path = "s3://tmp/testing/test_dir/"
3374

34-
expected_result = "s3 - test_dir"
75+
expected_result = "s3 tmp - test_dir"
3576
given_result = LocationTracker(
3677
location_path=test_path, data_store=self.data_store
3778
).location_name
@@ -121,3 +162,78 @@ def test_determine_location_bucket_name_local(self):
121162
given_result = location.location.location_bucket_name
122163

123164
self.assertEqual(expected_result, given_result)
165+
166+
def test_determine_location_name_duplicate_name_s3(self):
167+
"""
168+
Testing that if two different s3 locations produce the same location name
169+
that the second location will append a number to ensure uniqueness.
170+
:return:
171+
"""
172+
expected_result = "s3 duplicate-test - dir - 1"
173+
174+
location = LocationTracker(
175+
location_path="https://duplicate-test.s3.amazonaws.com/this/is/a/test/dir/file.txt",
176+
data_store=self.data_store,
177+
)
178+
179+
dupe_location = LocationTracker(
180+
location_path="https://duplicate-test.s3.amazonaws.com/this/is/another/test/dir/file.txt",
181+
data_store=self.data_store,
182+
)
183+
184+
given_result = dupe_location.location.location_name
185+
186+
self.assertEqual(expected_result, given_result)
187+
188+
def test_determine_location_name_duplicate_name_local(self):
189+
"""
190+
Testing that if two different local locations produce the same location name
191+
that the second location will append a number to ensure uniqueness.
192+
:return:
193+
"""
194+
expected_result = "local - test_dir - 1"
195+
196+
location = LocationTracker(
197+
location_path="/tmp/duplicate_testing/test_dir/file.txt",
198+
data_store=self.data_store,
199+
)
200+
201+
dupe_location = LocationTracker(
202+
location_path="/tmp/duplicate_testing_another/test_dir/file.txt",
203+
data_store=self.data_store,
204+
)
205+
206+
given_result = dupe_location.location.location_name
207+
208+
self.assertEqual(expected_result, given_result)
209+
210+
def test_determine_location_name_file_not_part_s3(self):
211+
"""
212+
Testing that when an s3 path is provided with a filename at the end, the file is ignored.
213+
:return:
214+
"""
215+
expected_result = "s3 test-bucket - dir"
216+
217+
location = LocationTracker(
218+
location_path="https://test-bucket.s3.amazonaws.com/this/is/a/test/dir/file.txt",
219+
data_store=self.data_store,
220+
)
221+
222+
given_result = location.location.location_name
223+
224+
self.assertEqual(expected_result, given_result)
225+
226+
def test_determine_location_name_file_not_part_local(self):
227+
"""
228+
Testing that when a local path is provided with a filename at the end, the file is ignored.
229+
:return:
230+
"""
231+
expected_result = "local - path"
232+
233+
location = LocationTracker(
234+
location_path="/local/dir/path/text.txt", data_store=self.data_store
235+
)
236+
237+
given_result = location.location.location_name
238+
239+
self.assertEqual(expected_result, given_result)

0 commit comments

Comments
 (0)