Changes from all commits
33 commits, all authored by guillaume-chervet:

5d5b54b  feat(dataset):tests  (Nov 30, 2023)
5c44d28  feat(dataset):tests  (Dec 1, 2023)
eca76a1  feat(dataset):tests  (Dec 1, 2023)
28a3065  feat(dataset):tests  (Dec 1, 2023)
161ae03  feat(dataset):tests  (Dec 1, 2023)
e20e6ac  test  (Dec 1, 2023)
04d082d  update  (Dec 1, 2023)
a1599e4  update  (Dec 1, 2023)
d38aaab  update  (Dec 1, 2023)
3828447  update  (Dec 1, 2023)
6421fcb  update  (Dec 1, 2023)
b0013cf  update  (Dec 1, 2023)
954a3c8  update  (Dec 1, 2023)
5d58353  Update command.py  (Dec 1, 2023)
924e55a  Update command.py  (Dec 2, 2023)
0fbc04b  update  (Dec 2, 2023)
97dd213  update  (Dec 2, 2023)
48dffaf  update  (Dec 2, 2023)
c95054b  update  (Dec 2, 2023)
bf78f65  update  (Dec 2, 2023)
71b53e0  update  (Dec 2, 2023)
fb996b8  Update azureml_run_pipeline.py  (Dec 3, 2023)
34c8c5f  update  (Dec 3, 2023)
410bc31  update  (Dec 3, 2023)
385ae47  Update README.md  (Dec 3, 2023)
462508e  add tags  (Dec 3, 2023)
6b2b0c2  Update azureml_run_pipeline.py  (Dec 4, 2023)
ec1b0c5  add tags  (Dec 4, 2023)
1078c58  add tags  (Dec 5, 2023)
bf189b1  Update ci.yml  (Dec 5, 2023)
74a7bd7  Update azureml_run_pipeline.py  (Dec 6, 2023)
ecad425  Update ci.yml  (Dec 6, 2023)
10be5bf  add tags  (Dec 7, 2023)
3 changes: 2 additions & 1 deletion .github/workflows/ci.yml → .github/workflows/main.yml
@@ -193,7 +193,8 @@ jobs:
           poetry run python azureml_run_pipeline.py \
             --subscription_id ${{ env.AZURE_SUBSCRIPTION_ID }} \
             --resource_group_name ${{ env.AZURE_RESOURCE_GROUP_NAME }} \
-            --workspace_name ${{ env.AZURE_ML_WORKSPACE_NAME }} > train_output.txt
+            --workspace_name ${{ env.AZURE_ML_WORKSPACE_NAME }} \
+            --tags "{\"git\":\"${{ github.head_ref }}.${{ github.sha }}\",\"version\":\"${{ needs.tags.outputs.new_version }}\",\"triggering_actor\":\"${{github.triggering_actor}}\"}" > train_output.txt
           cat train_output.txt
         working-directory: train
       - name: download model
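Note: the --tags value is a single JSON object assembled from GitHub Actions expressions. A sketch of the string it expands to, with hypothetical stand-ins for the ${{ ... }} values:

import json

# Hypothetical stand-ins for ${{ github.head_ref }}, ${{ github.sha }},
# ${{ needs.tags.outputs.new_version }} and ${{ github.triggering_actor }}
tags = {
    "git": "feature-dataset-tests.5d5b54b",
    "version": "1.2.0",
    "triggering_actor": "guillaume-chervet",
}
print(json.dumps(tags))  # exactly what azureml_run_pipeline.py receives via --tags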
2 changes: 1 addition & 1 deletion README.md
@@ -1,6 +1,6 @@
 # MLOpsPython
 
-A real demo of Deep Learning project with preprocessing from development to production using code, ops and Machine Learning best practices. Production is a real time REST API.
+A real demo of Deep Learning project with preprocessing from development to production using code, ops and Machine Learning best practices. Production is a real time REST API.
 
 ![project_workflow.png](documentation%2Fproject_workflow.png)
 
2 changes: 1 addition & 1 deletion documentation/step_0_setup.md
@@ -130,7 +130,7 @@ jobs:
       run: |
         pipenv install --dev
         pipenv run flake8 .
-    ' > python-ci.yml
+    ' > python-main.yml
     ```

 Now we can protect the "main" branch on GitHub: add a security constraint so that code must pass the CI before merging.
111 changes: 61 additions & 50 deletions train/azureml_run_pipeline.py
@@ -9,21 +9,26 @@
 from azure.ai.ml.entities import Data
 from azure.ai.ml.entities import AmlCompute
 
+from extraction import register_extracted_dataset
+
 import uuid
 
+import json
 
 
 parser = argparse.ArgumentParser("train")
 parser.add_argument("--subscription_id", type=str)
 parser.add_argument("--resource_group_name", type=str)
 parser.add_argument("--workspace_name", type=str)
+parser.add_argument("--tags", type=str, default="{}")
 
 
 args = parser.parse_args()
 subscription_id = args.subscription_id
 resource_group_name = args.resource_group_name
 workspace_name = args.workspace_name
 
-URI_FOLDER = "uri_folder"
+print(args.tags)
+tags = json.loads(args.tags)
 
 try:
     credential = DefaultAzureCredential()
@@ -34,8 +39,6 @@
     # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work
     credential = InteractiveBrowserCredential()
 
-
-
 # Get a handle to workspace
 ml_client = MLClient(
     credential=credential,
@@ -51,7 +54,7 @@
     name=cluster_name,
     type="amlcompute",
     size="Standard_D4s_v3",
-    location="northeurope", #az account list-locations -o table
+    location="northeurope",  # az account list-locations -o table
     min_instances=0,
     max_instances=1,
     idle_time_before_scale_down=60,
@@ -60,103 +63,111 @@
 
 
 @pipeline(default_compute=cluster_name)
-def azureml_pipeline(pdfs_input_data: Input(type=URI_FOLDER),
-                     labels_input_data: Input(type=URI_FOLDER)):
+def azureml_pipeline(
+    pdfs_input_data: Input(type=AssetTypes.URI_FOLDER),
+    labels_input_data: Input(type=AssetTypes.URI_FOLDER),
+):
     extraction_step = load_component(source="extraction/command.yaml")
-    extraction = extraction_step(
-        pdfs_input=pdfs_input_data
-    )
+    extraction = extraction_step(pdfs_input=pdfs_input_data)
 
     label_split_data_step = load_component(source="label_split_data/command.yaml")
-    label_split_data = label_split_data_step(labels_input=labels_input_data,
-                                             pdfs_input=pdfs_input_data,
-                                             images_input=extraction.outputs.images_output)
+    label_split_data = label_split_data_step(
+        labels_input=labels_input_data,
+        pdfs_input=pdfs_input_data,
+        images_input=extraction.outputs.images_output,
+    )
 
     train_step = load_component(source="train/command.yaml")
     train_data = train_step(
-        split_images_input=label_split_data.outputs.split_images_output)
+        split_images_input=label_split_data.outputs.split_images_output
+    )
 
     test_step = load_component(source="test/command.yaml")
-    test_data = test_step(model_input=train_data.outputs.model_output,
-                          integration_input=label_split_data.outputs.split_integration_output,
-                          images_input=label_split_data.outputs.split_images_output)
+    test_data = test_step(
+        model_input=train_data.outputs.model_output,
+        integration_input=label_split_data.outputs.split_integration_output,
+        images_input=label_split_data.outputs.split_images_output,
+    )
 
     return {
+        "extraction_output": extraction.outputs.images_output,
+        "extraction_hash_output": extraction.outputs.hash_output,
         "model_output": test_data.outputs.model_output,
        "integration_output": test_data.outputs.integration_output,
     }
 
 
 pipeline_job = azureml_pipeline(
     pdfs_input_data=Input(
-        path="azureml:cats_dogs_others:1", type=URI_FOLDER
+        path="azureml:cats_dogs_others:1", type=AssetTypes.URI_FOLDER
     ),
     labels_input_data=Input(
-        path="azureml:cats_dogs_others_labels:1", type=URI_FOLDER
-    )
+        path="azureml:cats_dogs_others_labels:1", type=AssetTypes.URI_FOLDER
+    ),
 )
 
 pipeline_job.settings.force_rerun = False
 
 azure_blob = "azureml://datastores/workspaceblobstore/paths/"
 experiment_id = str(uuid.uuid4())
-custom_extraction_path = azure_blob + "extraction/cats-dogs-others/" + experiment_id + "/"
-pipeline_job.outputs.model_output = Output(
-    type=URI_FOLDER, mode="rw_mount", path=custom_extraction_path
+custom_extraction_path = (
+    azure_blob + "extraction/cats-dogs-others/" + experiment_id + "/"
+)
+pipeline_job.outputs.extraction_output = Output(
+    type=AssetTypes.URI_FOLDER, mode="rw_mount", path=custom_extraction_path
 )
+custom_extraction_hash_path = (
+    azure_blob + "extraction_hash/cats-dogs-others/" + experiment_id + "/"
+)
+pipeline_job.outputs.extraction_hash_output = Output(
+    type=AssetTypes.URI_FOLDER, mode="rw_mount", path=custom_extraction_hash_path
+)
 custom_model_path = azure_blob + "models/cats-dogs-others/" + experiment_id + "/"
 pipeline_job.outputs.model_output = Output(
-    type=URI_FOLDER, mode="rw_mount", path=custom_model_path
+    type=AssetTypes.URI_FOLDER, mode="rw_mount", path=custom_model_path
 )
-custom_integration_path = azure_blob + "/integration/cats-dogs-others/" + experiment_id + "/"
+custom_integration_path = (
+    azure_blob + "integration/cats-dogs-others/" + experiment_id + "/"
+)
 pipeline_job.outputs.integration_output = Output(
-    type=URI_FOLDER, mode="rw_mount", path=custom_integration_path
+    type=AssetTypes.URI_FOLDER, mode="rw_mount", path=custom_integration_path
 )

 pipeline_job = ml_client.jobs.create_or_update(
-    pipeline_job, experiment_name="cats_dos_others_pipeline"
+    pipeline_job, experiment_name="cats_dos_others_pipeline", tags=tags
 )
 
 ml_client.jobs.stream(pipeline_job.name)
 
 
-integration_dataset = Data(
-    name="cats-dogs-others-extraction",
-    path=custom_integration_path,
-    type=URI_FOLDER,
-    description="Extracted images for cats and dogs and others",
-    version="1",
-    tags={"source_type": "web", "source": "UCI ML Repo"},
-)
-integration_dataset = ml_client.data.create_or_update(integration_dataset)
-print(
-    f"Dataset with name {integration_dataset.name} was registered to workspace, the dataset version is {integration_dataset.version}"
+register_extracted_dataset(
+    ml_client, custom_extraction_hash_path, custom_extraction_path, tags
 )
 
 
 model_name = "cats-dogs-others"
 try:
     model_version = str(len(list(ml_client.models.list(model_name))) + 1)
 except:
     model_version = "1"
 
 file_model = Model(
-    version=model_version,
-    path=custom_model_path,
-    type=AssetTypes.CUSTOM_MODEL,
-    name=model_name,
-    description="Model created from azureML.",
-)
+    version=model_version,
+    path=custom_model_path,
+    type=AssetTypes.CUSTOM_MODEL,
+    name=model_name,
+    description="Model created from azureML.",
+    tags={**tags},
+)
 saved_model = ml_client.models.create_or_update(file_model)
 
-print(f"Model with name {saved_model.name} was registered to workspace, the model version is {saved_model.version}.")
+print(
+    f"Model with name {saved_model.name} was registered to workspace, the model version is {saved_model.version}."
+)
 
 integration_dataset = Data(
     name="cats-dogs-others-integration",
     path=custom_integration_path,
-    type=URI_FOLDER,
+    type=AssetTypes.CUSTOM_MODEL,
     description="Integration dataset for cats and dogs and others",
-    tags={"source_type": "web", "source": "UCI ML Repo"},
+    tags=tags,
 )
 integration_dataset = ml_client.data.create_or_update(integration_dataset)
 print(
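Note: once the job, model and datasets are registered with these tags, they can be queried back through the same SDK. A minimal sketch, assuming the authenticated ml_client from above and that version "1" of the model exists:

# Read the traceability tags back from the registered model.
model = ml_client.models.get(name="cats-dogs-others", version="1")
print(model.tags)  # e.g. {"git": "<branch>.<sha>", "version": "...", "triggering_actor": "..."}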
57 changes: 57 additions & 0 deletions train/extraction.py
@@ -0,0 +1,57 @@
from pathlib import Path
import azure.ai.ml._artifacts._artifact_utilities as artifact_utils
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Data


def register_extracted_dataset(ml_client,
                               custom_extraction_hash_path: str,
                               custom_extraction_path: str,
                               tags: dict):
    BASE_PATH = Path(__file__).resolve().parent
    artifact_utils.download_artifact_from_aml_uri(
        uri=custom_extraction_hash_path,
        destination=str(BASE_PATH),
        datastore_operation=ml_client.datastores,
    )

    # read the hash.txt file stored in BASE_PATH
    with open(str(BASE_PATH / "hash.txt"), "r") as file:
        computed_hash = file.read()
    print(f"computed_hash: {computed_hash}")

    extracted_images_dataset_name = "cats-dogs-others-extracted"
    try:
        list_datasets = ml_client.data.list(extracted_images_dataset_name)
        list_list_datset = list(list_datasets)
        version_dataset_extraction = len(list_list_datset) + 1
    except:
        list_list_datset = []
        version_dataset_extraction = 1
        print("No dataset with name cats-dogs-others-extracted")

    hash_tag_already_exists = False
    len_dataset = len(list_list_datset)
    if len_dataset > 0:
        dataset = list_list_datset[len_dataset - 1]
        print(f"dataset.version: {str(dataset.version)}")
        print(dataset.tags)
        if "hash" in dataset.tags:
            extracted_images_dataset_version = dataset.tags["hash"]
            print(f"extracted_images_dataset_version: {extracted_images_dataset_version}")
            print(f"computed_hash: {computed_hash}")
            if extracted_images_dataset_version == computed_hash:
                hash_tag_already_exists = True

    if not hash_tag_already_exists:
        extracted_images_dataset = Data(
            name=extracted_images_dataset_name,
            path=custom_extraction_path,
            type=AssetTypes.CUSTOM_MODEL,
            description="Extracted images for cats and dogs and others",
            version=str(version_dataset_extraction),
            tags={"hash": computed_hash, **tags},
        )
        extracted_images_dataset = ml_client.data.create_or_update(extracted_images_dataset)
        print(
            f"Dataset with name {extracted_images_dataset.name} was registered to workspace, the dataset version is {extracted_images_dataset.version}"
        )
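Note: this is content-addressed dataset versioning: a new version of cats-dogs-others-extracted is registered only when the extraction hash changes. The decision logic, as a standalone sketch with hypothetical tag values:

def needs_new_version(latest_tags: dict, computed_hash: str) -> bool:
    # Register only when the latest version's "hash" tag differs
    # from the hash computed for the current extraction output.
    return latest_tags.get("hash") != computed_hash

print(needs_new_version({"hash": "abc123"}, "abc123"))  # False: reuse current version
print(needs_new_version({"hash": "abc123"}, "def456"))  # True: content changed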
15 changes: 15 additions & 0 deletions train/extraction/command.py
@@ -1,15 +1,30 @@
import argparse
from pathlib import Path

from mlopspython_extraction.extraction import extract_images
import mlflow

from directory_hash import hash_dir

parser = argparse.ArgumentParser("extraction")
parser.add_argument("--pdfs_input", type=str)
parser.add_argument("--images_output", type=str)
parser.add_argument("--hash_output", type=str)

args = parser.parse_args()
pdfs_input = args.pdfs_input
images_output = args.images_output
hash_output = args.hash_output

result = extract_images(pdfs_input, images_output)
computed_hash = hash_dir(images_output)
with open(str(Path(hash_output) / "hash.txt"), "w") as file:
    file.write(computed_hash)

console_output = f"""
number_files_input: {result.number_files_input}
number_images_output: {result.number_images_output}
computed_hash: {computed_hash}
"""
mlflow.log_metric("number_files_input", result.number_files_input)
mlflow.log_metric("number_images_output", result.number_images_output)
3 changes: 3 additions & 0 deletions train/extraction/command.yaml
@@ -15,6 +15,8 @@ inputs:
 outputs:
   images_output:
     type: uri_folder
+  hash_output:
+    type: uri_folder
 
 environment:
   name: extraction_environment
@@ -26,4 +28,5 @@ command: >-
   python command.py
   --pdfs_input ${{inputs.pdfs_input}}
   --images_output ${{outputs.images_output}}
+  --hash_output ${{outputs.hash_output}}
 
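Note: command.yaml forwards the component's declared outputs to command.py as CLI flags. A local smoke test of the hashing step, as a sketch (the paths are placeholders standing in for the AzureML-mounted folders):

from pathlib import Path

from directory_hash import hash_dir

images_output = "./images"            # placeholder for the mounted images_output folder
hash_output = Path("./hash")          # placeholder for the mounted hash_output folder
hash_output.mkdir(parents=True, exist_ok=True)

computed_hash = hash_dir(images_output)               # digest of the extracted images
(hash_output / "hash.txt").write_text(computed_hash)  # same file the component writes
print(computed_hash)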
32 changes: 32 additions & 0 deletions train/extraction/directory_hash.py
@@ -0,0 +1,32 @@
import os
import hashlib
import sys


# Python 3.11+ alternative:
# def sha256sum(filename):
#     with open(filename, 'rb', buffering=0) as file:
#         return hashlib.file_digest(file, 'sha1').hexdigest()

# For Python < 3.11
def sha256sum(filename):
    h = hashlib.sha256()
    b = bytearray(128 * 1024)
    mv = memoryview(b)
    with open(filename, 'rb', buffering=0) as f:
        while n := f.readinto(mv):
            h.update(mv[:n])
    return h.hexdigest()


def hash_dir(dir_path):
    hashes = []
    for path, dirs, files in os.walk(dir_path):
        for file in sorted(files):  # sort so files are always visited in the same order
            hashes.append(sha256sum(os.path.join(path, file)))
        for dir in sorted(dirs):  # sort so directories are always visited in the same order
            hashes.append(hash_dir(os.path.join(path, dir)))
        break  # one iteration only: files and dirs of the current directory; subdirs recurse above
    h = hashlib.new('sha1')  # sha1 here; could be swapped for a different algorithm
    h.update(''.join(hashes).encode())  # hash the concatenated per-file digests
    return str(h.hexdigest())
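Note: the sorting in hash_dir is what makes the digest deterministic, so the same extracted content always produces the same hash.txt regardless of filesystem enumeration order. A quick check, as a sketch with a placeholder path:

from directory_hash import hash_dir

digest = hash_dir("./images")           # "./images" is a placeholder directory
assert digest == hash_dir("./images")   # unchanged content => identical digest
print(digest)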
2 changes: 2 additions & 0 deletions train/labelling/.gitignore
@@ -0,0 +1,2 @@
.idea/
dataset