Changes from all commits
33 commits, all authored by guillaume-chervet:

5d5b54b  feat(dataset):tests  (Nov 30, 2023)
5c44d28  feat(dataset):tests  (Dec 1, 2023)
eca76a1  feat(dataset):tests  (Dec 1, 2023)
28a3065  feat(dataset):tests  (Dec 1, 2023)
161ae03  feat(dataset):tests  (Dec 1, 2023)
e20e6ac  test  (Dec 1, 2023)
04d082d  update  (Dec 1, 2023)
a1599e4  update  (Dec 1, 2023)
d38aaab  update  (Dec 1, 2023)
3828447  update  (Dec 1, 2023)
6421fcb  update  (Dec 1, 2023)
b0013cf  update  (Dec 1, 2023)
954a3c8  update  (Dec 1, 2023)
5d58353  Update command.py  (Dec 1, 2023)
924e55a  Update command.py  (Dec 2, 2023)
0fbc04b  update  (Dec 2, 2023)
97dd213  update  (Dec 2, 2023)
48dffaf  update  (Dec 2, 2023)
c95054b  update  (Dec 2, 2023)
bf78f65  update  (Dec 2, 2023)
71b53e0  update  (Dec 2, 2023)
fb996b8  Update azureml_run_pipeline.py  (Dec 3, 2023)
34c8c5f  update  (Dec 3, 2023)
410bc31  update  (Dec 3, 2023)
385ae47  Update README.md  (Dec 3, 2023)
462508e  add tags  (Dec 3, 2023)
6b2b0c2  Update azureml_run_pipeline.py  (Dec 4, 2023)
ec1b0c5  add tags  (Dec 4, 2023)
1078c58  add tags  (Dec 5, 2023)
bf189b1  Update ci.yml  (Dec 5, 2023)
74a7bd7  Update azureml_run_pipeline.py  (Dec 6, 2023)
ecad425  Update ci.yml  (Dec 6, 2023)
10be5bf  add tags  (Dec 7, 2023)
3 changes: 2 additions & 1 deletion .github/workflows/ci.yml → .github/workflows/main.yml
@@ -193,7 +193,8 @@ jobs:
           poetry run python azureml_run_pipeline.py \
             --subscription_id ${{ env.AZURE_SUBSCRIPTION_ID }} \
             --resource_group_name ${{ env.AZURE_RESOURCE_GROUP_NAME }} \
-            --workspace_name ${{ env.AZURE_ML_WORKSPACE_NAME }} > train_output.txt
+            --workspace_name ${{ env.AZURE_ML_WORKSPACE_NAME }} \
+            --tags "{\"git\":\"${{ github.head_ref }}.${{ github.sha }}\",\"version\":\"${{ needs.tags.outputs.new_version }}\",\"triggering_actor\":\"${{github.triggering_actor}}\"}" > train_output.txt
           cat train_output.txt
         working-directory: train
       - name: download model
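Note: the --tags value is a single JSON object assembled from GitHub Actions expressions. A sketch of the string it expands to, with hypothetical stand-ins for the ${{ ... }} values:

import json

# Hypothetical stand-ins for ${{ github.head_ref }}, ${{ github.sha }},
# ${{ needs.tags.outputs.new_version }} and ${{ github.triggering_actor }}
tags = {
    "git": "feature-dataset-tests.5d5b54b",
    "version": "1.2.0",
    "triggering_actor": "guillaume-chervet",
}
print(json.dumps(tags))  # exactly what azureml_run_pipeline.py receives via --tags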
2 changes: 1 addition & 1 deletion README.md
@@ -1,6 +1,6 @@
 # MLOpsPython
 
-A real demo of Deep Learning project with preprocessing from development to production using code, ops and Machine Learning best practices. Production is a real time REST API.
+A real demo of Deep Learning project with preprocessing from development to production using code, ops and Machine Learning best practices. Production is a real time REST API.
 
 ![project_workflow.png](documentation%2Fproject_workflow.png)
 
2 changes: 1 addition & 1 deletion documentation/step_0_setup.md
@@ -130,7 +130,7 @@ jobs:
       run: |
         pipenv install --dev
         pipenv run flake8 .
-    ' > python-ci.yml
+    ' > python-main.yml
     ```

 Now we can protect the "main" branch on GitHub: add a security constraint so that code must pass the CI before merging.
111 changes: 61 additions & 50 deletions train/azureml_run_pipeline.py
@@ -9,21 +9,26 @@
 from azure.ai.ml.entities import Data
 from azure.ai.ml.entities import AmlCompute
 
+from extraction import register_extracted_dataset
+
 import uuid
 
+import json
 
 
 parser = argparse.ArgumentParser("train")
 parser.add_argument("--subscription_id", type=str)
 parser.add_argument("--resource_group_name", type=str)
 parser.add_argument("--workspace_name", type=str)
+parser.add_argument("--tags", type=str, default="{}")
 
 
 args = parser.parse_args()
 subscription_id = args.subscription_id
 resource_group_name = args.resource_group_name
 workspace_name = args.workspace_name
 
-URI_FOLDER = "uri_folder"
+print(args.tags)
+tags = json.loads(args.tags)
 
 try:
     credential = DefaultAzureCredential()
@@ -34,8 +39,6 @@
     # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work
     credential = InteractiveBrowserCredential()
 
-
-
 # Get a handle to workspace
 ml_client = MLClient(
     credential=credential,
@@ -51,7 +54,7 @@
     name=cluster_name,
     type="amlcompute",
     size="Standard_D4s_v3",
-    location="northeurope", #az account list-locations -o table
+    location="northeurope",  # az account list-locations -o table
     min_instances=0,
     max_instances=1,
     idle_time_before_scale_down=60,
@@ -60,103 +63,111 @@
 
 
 @pipeline(default_compute=cluster_name)
-def azureml_pipeline(pdfs_input_data: Input(type=URI_FOLDER),
-                     labels_input_data: Input(type=URI_FOLDER)):
+def azureml_pipeline(
+    pdfs_input_data: Input(type=AssetTypes.URI_FOLDER),
+    labels_input_data: Input(type=AssetTypes.URI_FOLDER),
+):
     extraction_step = load_component(source="extraction/command.yaml")
-    extraction = extraction_step(
-        pdfs_input=pdfs_input_data
-    )
+    extraction = extraction_step(pdfs_input=pdfs_input_data)
 
     label_split_data_step = load_component(source="label_split_data/command.yaml")
-    label_split_data = label_split_data_step(labels_input=labels_input_data,
-                                             pdfs_input=pdfs_input_data,
-                                             images_input=extraction.outputs.images_output)
+    label_split_data = label_split_data_step(
+        labels_input=labels_input_data,
+        pdfs_input=pdfs_input_data,
+        images_input=extraction.outputs.images_output,
+    )
 
     train_step = load_component(source="train/command.yaml")
     train_data = train_step(
-        split_images_input=label_split_data.outputs.split_images_output)
+        split_images_input=label_split_data.outputs.split_images_output
+    )
 
     test_step = load_component(source="test/command.yaml")
-    test_data = test_step(model_input=train_data.outputs.model_output,
-                          integration_input=label_split_data.outputs.split_integration_output,
-                          images_input=label_split_data.outputs.split_images_output)
+    test_data = test_step(
+        model_input=train_data.outputs.model_output,
+        integration_input=label_split_data.outputs.split_integration_output,
+        images_input=label_split_data.outputs.split_images_output,
+    )
 
     return {
+        "extraction_output": extraction.outputs.images_output,
+        "extraction_hash_output": extraction.outputs.hash_output,
         "model_output": test_data.outputs.model_output,
        "integration_output": test_data.outputs.integration_output,
     }
 
 
 pipeline_job = azureml_pipeline(
     pdfs_input_data=Input(
-        path="azureml:cats_dogs_others:1", type=URI_FOLDER
+        path="azureml:cats_dogs_others:1", type=AssetTypes.URI_FOLDER
     ),
     labels_input_data=Input(
-        path="azureml:cats_dogs_others_labels:1", type=URI_FOLDER
-    )
+        path="azureml:cats_dogs_others_labels:1", type=AssetTypes.URI_FOLDER
+    ),
 )
 
 pipeline_job.settings.force_rerun = False
 
 azure_blob = "azureml://datastores/workspaceblobstore/paths/"
 experiment_id = str(uuid.uuid4())
-custom_extraction_path = azure_blob + "extraction/cats-dogs-others/" + experiment_id + "/"
-pipeline_job.outputs.model_output = Output(
-    type=URI_FOLDER, mode="rw_mount", path=custom_extraction_path
+custom_extraction_path = (
+    azure_blob + "extraction/cats-dogs-others/" + experiment_id + "/"
+)
+pipeline_job.outputs.extraction_output = Output(
+    type=AssetTypes.URI_FOLDER, mode="rw_mount", path=custom_extraction_path
 )
+custom_extraction_hash_path = (
+    azure_blob + "extraction_hash/cats-dogs-others/" + experiment_id + "/"
+)
+pipeline_job.outputs.extraction_hash_output = Output(
+    type=AssetTypes.URI_FOLDER, mode="rw_mount", path=custom_extraction_hash_path
+)
 custom_model_path = azure_blob + "models/cats-dogs-others/" + experiment_id + "/"
 pipeline_job.outputs.model_output = Output(
-    type=URI_FOLDER, mode="rw_mount", path=custom_model_path
+    type=AssetTypes.URI_FOLDER, mode="rw_mount", path=custom_model_path
 )
-custom_integration_path = azure_blob + "/integration/cats-dogs-others/" + experiment_id + "/"
+custom_integration_path = (
+    azure_blob + "integration/cats-dogs-others/" + experiment_id + "/"
+)
 pipeline_job.outputs.integration_output = Output(
-    type=URI_FOLDER, mode="rw_mount", path=custom_integration_path
+    type=AssetTypes.URI_FOLDER, mode="rw_mount", path=custom_integration_path
 )

 pipeline_job = ml_client.jobs.create_or_update(
-    pipeline_job, experiment_name="cats_dos_others_pipeline"
+    pipeline_job, experiment_name="cats_dos_others_pipeline", tags=tags
 )
 
 ml_client.jobs.stream(pipeline_job.name)
 
 
-integration_dataset = Data(
-    name="cats-dogs-others-extraction",
-    path=custom_integration_path,
-    type=URI_FOLDER,
-    description="Extracted images for cats and dogs and others",
-    version="1",
-    tags={"source_type": "web", "source": "UCI ML Repo"},
-)
-integration_dataset = ml_client.data.create_or_update(integration_dataset)
-print(
-    f"Dataset with name {integration_dataset.name} was registered to workspace, the dataset version is {integration_dataset.version}"
+register_extracted_dataset(
+    ml_client, custom_extraction_hash_path, custom_extraction_path, tags
 )
 
 
 model_name = "cats-dogs-others"
 try:
     model_version = str(len(list(ml_client.models.list(model_name))) + 1)
 except:
     model_version = "1"
 
 file_model = Model(
-    version=model_version,
-    path=custom_model_path,
-    type=AssetTypes.CUSTOM_MODEL,
-    name=model_name,
-    description="Model created from azureML.",
-)
+    version=model_version,
+    path=custom_model_path,
+    type=AssetTypes.CUSTOM_MODEL,
+    name=model_name,
+    description="Model created from azureML.",
+    tags={**tags},
+)
 saved_model = ml_client.models.create_or_update(file_model)
 
-print(f"Model with name {saved_model.name} was registered to workspace, the model version is {saved_model.version}.")
+print(
+    f"Model with name {saved_model.name} was registered to workspace, the model version is {saved_model.version}."
+)
 
 integration_dataset = Data(
     name="cats-dogs-others-integration",
     path=custom_integration_path,
-    type=URI_FOLDER,
+    type=AssetTypes.CUSTOM_MODEL,
     description="Integration dataset for cats and dogs and others",
-    tags={"source_type": "web", "source": "UCI ML Repo"},
+    tags=tags,
 )
 integration_dataset = ml_client.data.create_or_update(integration_dataset)
 print(
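Note: once the job, model and datasets are registered with these tags, they can be queried back through the same SDK. A minimal sketch, assuming the authenticated ml_client from above and that version "1" of the model exists:

# Read the traceability tags back from the registered model.
model = ml_client.models.get(name="cats-dogs-others", version="1")
print(model.tags)  # e.g. {"git": "<branch>.<sha>", "version": "...", "triggering_actor": "..."}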
57 changes: 57 additions & 0 deletions train/extraction.py
@@ -0,0 +1,57 @@
from pathlib import Path
import azure.ai.ml._artifacts._artifact_utilities as artifact_utils
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Data


def register_extracted_dataset(ml_client,
                               custom_extraction_hash_path: str,
                               custom_extraction_path: str,
                               tags: dict):
    BASE_PATH = Path(__file__).resolve().parent
    artifact_utils.download_artifact_from_aml_uri(
        uri=custom_extraction_hash_path,
        destination=str(BASE_PATH),
        datastore_operation=ml_client.datastores,
    )

    # read the hash.txt file stored in BASE_PATH
    with open(str(BASE_PATH / "hash.txt"), "r") as file:
        computed_hash = file.read()
    print(f"computed_hash: {computed_hash}")

    extracted_images_dataset_name = "cats-dogs-others-extracted"
    try:
        list_datasets = ml_client.data.list(extracted_images_dataset_name)
        list_list_datset = list(list_datasets)
        version_dataset_extraction = len(list_list_datset) + 1
    except:
        list_list_datset = []
        version_dataset_extraction = 1
        print("No dataset with name cats-dogs-others-extracted")

    hash_tag_already_exists = False
    len_dataset = len(list_list_datset)
    if len_dataset > 0:
        dataset = list_list_datset[len_dataset - 1]
        print(f"dataset.version: {str(dataset.version)}")
        print(dataset.tags)
        if "hash" in dataset.tags:
            extracted_images_dataset_version = dataset.tags["hash"]
            print(f"extracted_images_dataset_version: {extracted_images_dataset_version}")
            print(f"computed_hash: {computed_hash}")
            if extracted_images_dataset_version == computed_hash:
                hash_tag_already_exists = True

    if not hash_tag_already_exists:
        extracted_images_dataset = Data(
            name=extracted_images_dataset_name,
            path=custom_extraction_path,
            type=AssetTypes.CUSTOM_MODEL,
            description="Extracted images for cats and dogs and others",
            version=str(version_dataset_extraction),
            tags={"hash": computed_hash, **tags},
        )
        extracted_images_dataset = ml_client.data.create_or_update(extracted_images_dataset)
        print(
            f"Dataset with name {extracted_images_dataset.name} was registered to workspace, the dataset version is {extracted_images_dataset.version}"
        )
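Note: this is content-addressed dataset versioning: a new version of cats-dogs-others-extracted is registered only when the extraction hash changes. The decision logic, as a standalone sketch with hypothetical tag values:

def needs_new_version(latest_tags: dict, computed_hash: str) -> bool:
    # Register only when the latest version's "hash" tag differs
    # from the hash computed for the current extraction output.
    return latest_tags.get("hash") != computed_hash

print(needs_new_version({"hash": "abc123"}, "abc123"))  # False: reuse current version
print(needs_new_version({"hash": "abc123"}, "def456"))  # True: content changed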
15 changes: 15 additions & 0 deletions train/extraction/command.py
@@ -1,15 +1,30 @@
import argparse
from pathlib import Path

from mlopspython_extraction.extraction import extract_images
import mlflow

from directory_hash import hash_dir

parser = argparse.ArgumentParser("extraction")
parser.add_argument("--pdfs_input", type=str)
parser.add_argument("--images_output", type=str)
parser.add_argument("--hash_output", type=str)

args = parser.parse_args()
pdfs_input = args.pdfs_input
images_output = args.images_output
hash_output = args.hash_output

result = extract_images(pdfs_input, images_output)
computed_hash = hash_dir(images_output)
with open(str(Path(hash_output) / "hash.txt"), "w") as file:
    file.write(computed_hash)

console_output = f"""
number_files_input: {result.number_files_input}
number_images_output: {result.number_images_output}
computed_hash: {computed_hash}
"""
mlflow.log_metric("number_files_input", result.number_files_input)
mlflow.log_metric("number_images_output", result.number_images_output)
3 changes: 3 additions & 0 deletions train/extraction/command.yaml
@@ -15,6 +15,8 @@ inputs:
 outputs:
   images_output:
     type: uri_folder
+  hash_output:
+    type: uri_folder
 
 environment:
   name: extraction_environment
@@ -26,4 +28,5 @@ command: >-
   python command.py
   --pdfs_input ${{inputs.pdfs_input}}
   --images_output ${{outputs.images_output}}
+  --hash_output ${{outputs.hash_output}}
 
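Note: command.yaml forwards the component's declared outputs to command.py as CLI flags. A local smoke test of the hashing step, as a sketch (the paths are placeholders standing in for the AzureML-mounted folders):

from pathlib import Path

from directory_hash import hash_dir

images_output = "./images"            # placeholder for the mounted images_output folder
hash_output = Path("./hash")          # placeholder for the mounted hash_output folder
hash_output.mkdir(parents=True, exist_ok=True)

computed_hash = hash_dir(images_output)               # digest of the extracted images
(hash_output / "hash.txt").write_text(computed_hash)  # same file the component writes
print(computed_hash)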
32 changes: 32 additions & 0 deletions train/extraction/directory_hash.py
@@ -0,0 +1,32 @@
import os
import hashlib
import sys


# Python 3.11+ alternative:
# def sha256sum(filename):
#     with open(filename, 'rb', buffering=0) as file:
#         return hashlib.file_digest(file, 'sha1').hexdigest()

# For Python < 3.11
def sha256sum(filename):
    h = hashlib.sha256()
    b = bytearray(128 * 1024)
    mv = memoryview(b)
    with open(filename, 'rb', buffering=0) as f:
        while n := f.readinto(mv):
            h.update(mv[:n])
    return h.hexdigest()


def hash_dir(dir_path):
    hashes = []
    for path, dirs, files in os.walk(dir_path):
        for file in sorted(files):  # sort so files are always visited in the same order
            hashes.append(sha256sum(os.path.join(path, file)))
        for dir in sorted(dirs):  # sort so directories are always visited in the same order
            hashes.append(hash_dir(os.path.join(path, dir)))
        break  # one iteration only: files and dirs of the current directory; subdirs recurse above
    h = hashlib.new('sha1')  # sha1 here; could be swapped for a different algorithm
    h.update(''.join(hashes).encode())  # hash the concatenated per-file digests
    return str(h.hexdigest())
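Note: the sorting in hash_dir is what makes the digest deterministic, so the same extracted content always produces the same hash.txt regardless of filesystem enumeration order. A quick check, as a sketch with a placeholder path:

from directory_hash import hash_dir

digest = hash_dir("./images")           # "./images" is a placeholder directory
assert digest == hash_dir("./images")   # unchanged content => identical digest
print(digest)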
2 changes: 2 additions & 0 deletions train/labelling/.gitignore
@@ -0,0 +1,2 @@
.idea/
dataset