OpenDataAlex
diff --git a/models/extract.py b/models/extract.py
Lines changed: 22 additions & 4 deletions
diff --git a/models/process.py b/models/process.py
Lines changed: 1 addition & 1 deletion
diff --git a/process_tracker/extract_tracking.py b/process_tracker/extract_tracking.py
Lines changed: 122 additions & 9 deletions
@@ -1,7 +1,9 @@
 # SQLAlchemy Models
 # Models for Extract (Data) entities
 
-from sqlalchemy import Column, ForeignKey, Integer, Sequence, String
+from datetime import datetime
+
+from sqlalchemy import Column, DateTime, ForeignKey, Integer, Sequence, String
 from sqlalchemy.orm import relationship
 
 from models.model_base import Base
@@ -14,7 +16,7 @@ class ExtractStatus(Base):
     extract_status_id = Column(Integer, Sequence('extract_status_lkup_status_id_seq'), primary_key=True)
     extract_status_name = Column(String(75), nullable=False, unique=True)
 
-    extracts = relationship("Extract")
+    extracts = relationship("ExtractProcess")
 
 
 class Extract(Base):
@@ -25,10 +27,26 @@ class Extract(Base):
     extract_source_id = Column(Integer, ForeignKey("source_lkup.source_id"))
     extract_filename = Column(String(750), nullable=False, unique=True)
     extract_location_id = Column(Integer, ForeignKey('location_lkup.location_id'))
-    extract_process_run_id = Column(Integer, ForeignKey('process_tracking.process_tracking_id'))
+#    extract_process_run_id = Column(Integer, ForeignKey('process_tracking.process_tracking_id'))
     extract_status_id = Column(Integer, ForeignKey('extract_status_lkup.extract_status_id'))
+    extract_registration_date_time = Column(DateTime, nullable=False, default=datetime.now)  # callable: runs per insert
+#    extract_load_date_time = Column(DateTime, nullable=False, default=default_date)
+#    extract_archive_date_time = Column(DateTime, nullable=False, default=default_date)
+
+    extract_process = relationship("ExtractProcess", back_populates='process_extracts')
+
+
+class ExtractProcess(Base):
+    """Links one extract to one process run and records that pairing's status and event time."""
+    __tablename__ = "extract_process_tracking"
+
+    extract_tracking_id = Column(Integer, ForeignKey("extract_tracking.extract_id"), primary_key=True)
+    process_tracking_id = Column(Integer, ForeignKey("process_tracking.process_tracking_id"), primary_key=True)
+    extract_process_status_id = Column(Integer, ForeignKey("extract_status_lkup.extract_status_id"))
+    extract_process_event_date_time = Column(DateTime, nullable=False, default=datetime.now)  # callable: runs per insert
 
-    process_tracking = relationship("ProcessTracking")
+    process_extracts = relationship('Extract', foreign_keys=[extract_tracking_id], back_populates='extract_process')
+    extract_processes = relationship('ProcessTracking', foreign_keys=[process_tracking_id], back_populates='extracts')
 
 
 class Location(Base):
 
@@ -136,7 +136,7 @@ class ProcessTracking(Base):
     is_latest_run = Column(Boolean, nullable=False, default=False)
 
     errors = relationship("ErrorTracking", back_populates="error_tracking")
-    extracts = relationship("Extract", back_populates="process_tracking")
+    extracts = relationship("ExtractProcess", back_populates="extract_processes")
     process = relationship("Process", back_populates="process_tracking")
 
     def __repr__(self):
 
@@ -1,22 +1,135 @@
 # Extract Tracking
 # Used in the creation and editing of extract records.  Used in conjunction with process tracking.
+from datetime import datetime
+from os.path import basename, normpath
 
-from models.extract import Extract, Location
-from models.source import Source
+from process_tracker import session
+from process_tracker.data_store import DataStore
+
+from models.extract import Extract, ExtractProcess, ExtractStatus, Location
 
 
 class ExtractTracker:
 
-    def __init__(self, source_name, filename, location, process_name):
+    def __init__(self, process_run, filename, location_path, location_name=None):
         """
         ExtractTracker is the primary engine for tracking data extracts
-        :param source_name: Name of the source where data extract is from.
-        :type source_name: string
+        :param process_run: The process object working with extracts (either creating or consuming)
+        :type process_run: ProcessTracker object
         :param filename: Name of the data extract file.
         :type filename:  string
-        :param location: Location (filepath, s3 bucket, etc.) where the file is stored
-        :type location: string
-        :param process_name: Name of the process that produced the data extract.
-        :type process_name: string
+        :param location_path: Location (filepath, s3 bucket, etc.) where the file is stored
+        :type location_path: string
+        :param location_name: Optional parameter to provide a specific name for the location.  If not provided, will use
+                              the last directory in the path as the location name.  If type of location can be
+                              determined (i.e. S3 bucket), the location type will be prepended.
+        :type location_name: string
+        """
+        self.data_store = DataStore()
+        self.process_run = process_run
+
+        if location_name is None:
+            location_name = self.derive_location_name(location_path=location_path)
+
+        self.source = self.process_run.source
+        self.filename = filename
+
+        self.location = self.data_store.get_or_create(model=Location
+                                                      , location_name=location_name
+                                                      , location_path=location_path)
+
+        self.extract = self.data_store.get_or_create(model=Extract
+                                                     , extract_filename=filename
+                                                     , extract_location_id=self.location.location_id
+                                                     , extract_source_id=self.source.source_id)
+
+        # Getting all status types in the event there are custom status types added later.
+        self.extract_status_types = self.get_extract_status_types()
+
+        # For specific status types, need to retrieve their ids to be used for those status types' logic.
+
+        self.extract_status_initializing = self.extract_status_types['initializing']
+        self.extract_status_ready = self.extract_status_types['ready']
+        self.extract_status_loading = self.extract_status_types['loading']
+        self.extract_status_loaded = self.extract_status_types['loaded']
+        self.extract_status_archived = self.extract_status_types['archived']
+        self.extract_status_deleted = self.extract_status_types['deleted']
+        self.extract_status_error = self.extract_status_types['error']
+
+        # Must come after the status ids above: retrieve_extract_process() reads self.extract_status_initializing.
+        self.extract_process = self.retrieve_extract_process()
+
+    def change_extract_status(self, new_status):
+        """
+        Change the status of the extract and of its extract/process relationship record.
+        :param new_status: Name of the new status, as stored in extract_status_lkup.
+        :type new_status: string
+        :raises Exception: If new_status is not a known extract status name.
+        :return:
+        """
+        # Resolve the name once; an unknown name must reach the error below rather than a KeyError.
+        status_id = self.extract_status_types.get(new_status)
+
+        if status_id is None:
+            raise Exception('%s is not a valid extract status type.  '
+                            'Please add the status to extract_status_lkup' % new_status)
+
+        self.extract.extract_status_id = status_id
+
+        # ExtractProcess keeps its status in extract_process_status_id.
+        self.extract_process.extract_process_status_id = status_id
+        self.extract_process.extract_process_event_date_time = datetime.now()
+
+        session.commit()
+
+    @staticmethod
+    def derive_location_name(location_path):
         """
+        If location name is not provided, attempt to derive name from path.
+        :param location_path: The data extract file location path.
+        :type location_path: string
+        :return: Lower-cased last component of the path, prefixed with "s3 - " when the path contains "s3".
+        """
+        # Idea is to generalize things like grabbing the last directory name in the path,
+        # what type of path is it (normal, s3, etc.)
+
+        location_path = location_path.lower()  # Don't care about casing.
+        location_name = basename(normpath(location_path))
+
+        # NOTE(review): a plain substring test also matches local paths that merely contain "s3"
+        # (e.g. /data/js3/out) - confirm whether a stricter check is wanted.
+        # Only prepend the type and separator when a type was detected, so other paths are not named " - <dir>".
+        if "s3" in location_path:
+            location_name = "s3 - " + location_name
+
+        return location_name
+
+    @staticmethod
+    def get_extract_status_types():
+        """
+        Get all extract status types and return them as a dictionary.
+        :return: Dictionary mapping extract_status_name to extract_status_id.
+        """
+        return {
+            record.extract_status_name: record.extract_status_id
+            for record in session.query(ExtractStatus)
+        }
+
+    def retrieve_extract_process(self):
+        """
+        Create and initialize or retrieve the process/extract relationship.
+        A record that has no status yet is set to 'initializing' and committed.
+        :return: The ExtractProcess record returned by get_or_create for this extract and process run.
+        """
+
+        extract_process = self.data_store.get_or_create(model=ExtractProcess
+                                                        , extract_tracking_id=self.extract.extract_id
+                                                        , process_tracking_id=self.process_run.process_tracking_id)
+
+        # Only need to set to 'initializing' when it's the first time a process run is trying to work with files.
+        if extract_process.extract_process_status_id is None:
+
+            extract_process.extract_process_status_id = self.extract_status_initializing
+            session.commit()
 
+        return extract_process