From 2309c78bec17456ec4fb90efd792b19d4301fad5 Mon Sep 17 00:00:00 2001 From: Abhi Suryawanshi Date: Sat, 25 Oct 2025 00:32:44 +0530 Subject: [PATCH 1/2] feat: extend extract() to support Excel and JSON while preserving CSV functionality - Added file type detection using os.path.splitext() - Added support for .xlsx, .xls, and .json - Implemented format-specific reading logic - Enhanced error handling for unsupported formats - Preserved CSV functionality including multi-encoding support - Added 'openpyxl>=3.0.0' to requirements --- app/etl/extract.py | 70 ++++++++++++++++++++++++++++++-------------- app/requirements.txt | 1 + 2 files changed, 49 insertions(+), 22 deletions(-) diff --git a/app/etl/extract.py b/app/etl/extract.py index 72ad0db..68dee67 100644 --- a/app/etl/extract.py +++ b/app/etl/extract.py @@ -3,50 +3,76 @@ # TODO (Find & Fix) from typing import Optional -def extract(path: str = "xyz.csv") -> pd.DataFrame : +def extract(path: str = "xyz.csv") -> pd.DataFrame: """ - Extracts data from CSV file. + Extracts data from CSV, Excel, or JSON file. Args: - path: Path to the CSV file + path: Path to the data file (supports .csv, .xlsx, .json) Returns: - DataFrame containing the extracted data # TODO (Find & Fix): Should specify pd.DataFrame in docstring + pd.DataFrame: DataFrame containing the extracted data Raises: FileNotFoundError: If the file doesn't exist - ValueError: If the file is empty or invalid + ValueError: If the file format is unsupported or file is empty/invalid """ # Validate file path if not os.path.exists(path): raise FileNotFoundError(f"❌ File not found: {path}") - if not path.lower().endswith('.csv'): # TODO (Find & Fix) - raise ValueError(f"❌ File must be a CSV: {path}") + # Get file extension + file_ext = os.path.splitext(path)[-1].lower() + + # Check if file format is supported + supported_formats = ['.csv', '.xlsx', '.xls', '.json'] + if file_ext not in supported_formats: + raise ValueError(f"❌ Unsupported file format: {file_ext}. Supported formats: {supported_formats}") try: - # Try different encodings - encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1'] df = None - for encoding in encodings: + if file_ext == '.csv': + # Try different encodings for CSV files + encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1'] + + for encoding in encodings: + try: + df = pd.read_csv(path, encoding=encoding) + print(f"Successfully read CSV with encoding: {encoding}") + break + except UnicodeDecodeError: + print(f"Failed to read with encoding '{encoding}'") # Log the encoding that failed + continue + except Exception as e: + print(f"Error reading with encoding '{encoding}': {e}") + continue + + if df is None: + raise ValueError(f"Could not read CSV with tried encodings: {encodings}") + + elif file_ext in ['.xlsx', '.xls']: + # Read Excel files + try: + df = pd.read_excel(path) + print(f"Successfully read Excel file: {path}") + except Exception as e: + raise ValueError(f"❌ Error reading Excel file: {e}") + + elif file_ext == '.json': + # Read JSON files try: - df = pd.read_csv(path, encoding=encoding) - print(f"Successfully read CSV with encoding: {encoding}") - break - except UnicodeDecodeError: - print(f"Failed to read with encoding '{encoding}'") # Log the encoding that failed - continue + df = pd.read_json(path) + print(f"Successfully read JSON file: {path}") except Exception as e: - print(f"Error reading with encoding '{encoding}': {e}") - continue + raise ValueError(f"❌ Error reading JSON file: {e}") + # Validate data if df is None: - raise ValueError(f" Could not read CSV with tried encodings: {encodings}") + raise ValueError("❌ Failed to read data from file") - # Validate data if df.empty: - raise ValueError("File contains no data") + raise ValueError("❌ File contains no data") print(f"✅ Extracted {len(df)} rows and {len(df.columns)} columns") # TODO: Use logging instead of print return df @@ -54,6 +80,6 @@ def extract(path: str = "xyz.csv") -> pd.DataFrame : except pd.errors.EmptyDataError: raise ValueError("❌ File contains no data") except pd.errors.ParserError as e: - raise ValueError(f"❌ Error parsing CSV: {e}") + raise ValueError(f"❌ Error parsing file: {e}") except Exception as e: raise ValueError(f"❌ Unexpected error reading file: {e}") \ No newline at end of file diff --git a/app/requirements.txt b/app/requirements.txt index 6951741..fc36e3c 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -1 +1,2 @@ pandas>=2.0.0 +openpyxl>=3.0.0 From d814af4293d49e0e1d7e6e3e313dd0283591a43b Mon Sep 17 00:00:00 2001 From: Abhi Suryawanshi Date: Sat, 25 Oct 2025 01:13:58 +0530 Subject: [PATCH 2/2] fix(extract): handle Excel and JSON file formats --- app/etl/extract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/etl/extract.py b/app/etl/extract.py index 68dee67..caa6baa 100644 --- a/app/etl/extract.py +++ b/app/etl/extract.py @@ -72,7 +72,7 @@ def extract(path: str = "xyz.csv") -> pd.DataFrame: raise ValueError("❌ Failed to read data from file") if df.empty: - raise ValueError("❌ File contains no data") + raise ValueError(" File contains no data") print(f"✅ Extracted {len(df)} rows and {len(df.columns)} columns") # TODO: Use logging instead of print return df