From 48f002d7031d5058add2dfb1315e6ac3712e3036 Mon Sep 17 00:00:00 2001
From: NihalNawaz <nnk00005@students.stir.ac.uk>
Date: Tue, 2 Jun 2026 00:06:30 +0200
Subject: [PATCH] Complete ETL Pipeline (Advanced Level)

---
 app.py                        | 122 +++++++++----
 execution_evidence.ipynb      | 315 ++++++++++++++++++++++++++++++++++
 www/services/__init__.py      |   5 +-
 www/services/api_retriever.py |  85 +++++++++
 www/services/standardizer.py  | 161 +++++++++++++++++
 www/services/validator.py     |  53 ++++++
 6 files changed, 708 insertions(+), 33 deletions(-)
 create mode 100644 execution_evidence.ipynb
 create mode 100644 www/services/api_retriever.py
 create mode 100644 www/services/standardizer.py
 create mode 100644 www/services/validator.py

diff --git a/app.py b/app.py
index f0891f894..79a880f70 100644
--- a/app.py
+++ b/app.py
@@ -47,11 +47,11 @@
 
 
 # Import necessary libraries for better performance - avoid importing everything
-import tempfile
 import os
 import requests
 import functools
 from datetime import datetime
+from pathlib import Path
 import pandas as pd
 import io
 from functions import *
@@ -64,9 +64,8 @@
 from shinywidgets import render_widget
 from shiny.express import ui, input, render
 
-# Setup the Directory for static assets - optimized for performance
-base_dir = tempfile.gettempdir()  # Use system temp dir instead of creating new temp file
-express.app_opts(static_assets=base_dir, debug=False)
+# Setup the directory for static assets relative to the app file.
+app_root = Path(__file__).resolve().parent
 
 # --- Toggle button ---
 # This button toggles the visibility of the sidebar(s) in the UI.
@@ -81,7 +80,7 @@
 
 # --- UI and UX experience ---
 # Include custom CSS for the app's appearance.
-ui.include_css("www/static/biblioshiny.css")
+ui.include_css(app_root / "www/static/biblioshiny.css")
 
 # --- Header ---
 # The header bar contains the logo, app name, and a set of dropdown menus for notifications, help, donations, and credits.
@@ -252,29 +251,31 @@ def get_latest_cran_version():
 
         # --- Welcome/Info Page ---
         with ui.nav_panel("None", value="info"):
-            ui.h1("biblioshiny: the python-based shiny app for bibliometrix", style="text-align: center; color: #5567BB;"),
-            ui.div(
-                ui.img(src="https://www.bibliometrix.org/logo_new.png", class_="logo", width="400px"),
-                style="text-align: center;"
-            ),
-            ui.div(
-                ui.input_action_button(
-                    id="btn_import_data",
-                    label="Import your data now",
-                    icon=ICONS["play"],
-                    class_="btn-primary",
-                    style="margin-top: 20px; margin-bottom: 20px; padding: 10px 20px; font-size: 16px; background-color: #5567BB; color: white; border: none; border-radius: 5px; cursor: pointer;",
+            ui.tags.div(
+                ui.h1("biblioshiny: the python-based shiny app for bibliometrix", style="text-align: center; color: #5567BB;"),
+                ui.div(
+                    ui.img(src="https://www.bibliometrix.org/logo_new.png", class_="logo", width="400px"),
+                    style="text-align: center;"
                 ),
-                ui.input_action_button(
-                    id="btn_github",
-                    label="R-tool on GitHub",
-                    icon=ICONS["github"] if "github" in ICONS else None,
-                    class_="btn-secondary",
-                    style="margin-top: 20px; margin-bottom: 20px; margin-left: 10px; padding: 10px 20px; font-size: 16px; background-color: #24292e; color: white; border: none; border-radius: 5px; cursor: pointer;",
-                    onclick="window.open('https://github.com/massimoaria/bibliometrix', '_blank')",
+                ui.div(
+                    ui.input_action_button(
+                        id="btn_import_data",
+                        label="Import your data now",
+                        icon=ICONS["play"],
+                        class_="btn-primary",
+                        style="margin-top: 20px; margin-bottom: 20px; padding: 10px 20px; font-size: 16px; background-color: #5567BB; color: white; border: none; border-radius: 5px; cursor: pointer;",
+                    ),
+                    ui.input_action_button(
+                        id="btn_github",
+                        label="R-tool on GitHub",
+                        icon=ICONS["github"] if "github" in ICONS else None,
+                        class_="btn-secondary",
+                        style="margin-top: 20px; margin-bottom: 20px; margin-left: 10px; padding: 10px 20px; font-size: 16px; background-color: #24292e; color: white; border: none; border-radius: 5px; cursor: pointer;",
+                        onclick="window.open('https://github.com/massimoaria/bibliometrix', '_blank')",
+                    ),
+                    style="text-align: center;"
                 ),
-                style="text-align: center;"
-            ),
+            )
             ui.markdown(
                 """
                 <div style="margin-left:80px; margin-right:80px; color:#888; font-size:18px; text-align:center;">
@@ -586,7 +587,7 @@ def reset_all_analyses():
             
             report_choices = reactive.Value({})
             report_excel = reactive.Value(io.BytesIO())
-            selection = reactive.Value([])
+            selection = reactive.Value(())
             dpi = reactive.Value(300)
             height = reactive.Value(7)
             gemini_api_key = reactive.Value("")
@@ -802,6 +803,8 @@ def show_missing_data_report():
                     @reactive.event(input.save_modal_completeness)
                     def save_dataframe_image():
                         _, _, fig = get_table(database, df, dpi=dpi.get(), modal=False)
+                        if fig is None:
+                            return ui.notification_show("⚠️ No data is loaded yet.", duration=5, close_button=False)
                         fig.write_image(completeness_table_image_path)
                         return ui.notification_show(f"✅ Missing data image saved into {completeness_table_image_path}", duration=5, close_button=False)
 
@@ -854,7 +857,62 @@ def indicator_types_ui_all():
                 ),
 
         with ui.nav_panel("None", value="API"):
-            ui.h3("🚧 Warning: API is under construction 🚧")
+            ui.h3("🌐 Live API Extraction (OpenAlex)", style="color: #5567BB;")
+            ui.p("Query the OpenAlex database directly and automatically convert results to the standardized format.")
+            
+            with ui.layout_sidebar(fillable=False, fill=False):
+                with ui.sidebar(id="sidebar_api_data", position="right"):
+                    ui.h5("API Search", style="color: #5567BB;")
+                    ui.input_text("api_query", "Search Query:", placeholder="e.g., machine learning")
+                    ui.input_numeric("api_max_results", "Max Results:", value=50, min=10, max=500, step=10)
+                    ui.input_action_button("api_search_btn", "Search OpenAlex", icon=ICONS["play"])
+                    ui.p("This will fetch data, apply standardization, and load it into the application.", style="color: gray; font-size: 10px;")
+
+                @reactive.effect
+                @reactive.event(input.api_search_btn)
+                def execute_api_search():
+                    # Extract -> standardize -> load for the query typed into the sidebar.
+                    query = (input.api_query() or "").strip()
+                    if not query:
+                        ui.notification_show("⚠️ Please enter a search query.", duration=5, type="warning")
+                        return
+
+                    # A cleared numeric input can yield None, and the widget's min/max may not
+                    # be enforced server-side, so fall back to 50 and clamp to 10-500.
+                    max_results = min(max(int(input.api_max_results() or 50), 10), 500)
+
+                    ui.modal_show(create_loading_modal("API data"))
+
+                    try:
+                        # 1. Extract
+                        raw_data = OpenAlexRetriever().fetch(query, max_results=max_results)
+                        if not raw_data:
+                            ui.notification_show("⚠️ No results found.", duration=5, type="warning")
+                            return
+
+                        # 2. Transform (Standardize)
+                        standardized_df = OpenAlexStandardizer().standardize(raw_data)
+
+                        # 3. Load
+                        df.set(standardized_df)
+                        reset_all_analyses()
+
+                        ui.notification_show(f"✅ Successfully loaded {len(standardized_df)} documents!", duration=5, type="message")
+                    except Exception as e:
+                        ui.notification_show(f"❌ Error during API extraction: {str(e)}", duration=10, type="error")
+                    finally:
+                        ui.modal_remove()
+
+                @render.express
+                def show_api_data_table():
+                    loaded = df.get()
+                    # Preview the current frame only when it holds at least one OpenAlex row
+                    if loaded is None:
+                        ui.p("No data loaded via API yet. Use the sidebar to search OpenAlex.")
+                    elif len(loaded) > 0 and 'DB' in loaded.columns and (loaded['DB'] == 'OPENALEX').any():
+                        ui.h4("Preview of Standardized Data", style="color: #5567BB;")
+                        ui.p(f"Showing the first {min(5, len(loaded))} rows:")
+                        ui.HTML(loaded[['UT', 'TI', 'AU', 'PY', 'SO', 'SR']].head(5).to_html(classes="table table-striped table-hover", index=False))
         
         with ui.nav_panel("None", value="collections"):
             ui.h3("🚧 Warning: Merge Collection is under construction 🚧")
@@ -8185,9 +8243,8 @@ def update_plot_settings():
 
 # --- Sidebar Management ---
 @render.express()
-@reactive.event(input.start_button)
 def toggle_sidebar():
-    with ui.tags.div(id="sidebar_2", class_="custom-sidebar"):
+    with ui.tags.div(id="sidebar_2", class_="custom-sidebar sidebar-hidden"):
         with ui.accordion(id="sidebar_accordion_data", multiple=False, open=False):
             # Info Section
             with ui.accordion_panel("Biblioshiny", icon=ICONS["home_colored"]):
@@ -8344,9 +8401,10 @@ def toggle_sidebar():
     });
     observer.observe(document.body, { childList: true, subtree: true });
 
-    // Show both sidebars when 'start_button' is clicked
+    // Show both sidebars when 'start_button' or 'api_search_btn' is clicked
     document.addEventListener("click", function(e) {
-        if (e.target && e.target.id === "start_button") {
+        const btn = e.target.closest('button');
+        if (btn && (btn.id === "start_button" || btn.id === "api_search_btn")) {
             setSidebarState(true);
         }
     });
diff --git a/execution_evidence.ipynb b/execution_evidence.ipynb
new file mode 100644
index 000000000..7307dabf5
--- /dev/null
+++ b/execution_evidence.ipynb
@@ -0,0 +1,315 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Bibliometrix-Python ETL Execution Evidence\n",
+    "\n",
+    "This notebook demonstrates the execution of the ETL pipeline for OpenAlex data. We will fetch data, standardize it, and validate the resulting DataFrame against the Bibliometrix schema."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "\n",
+    "# Ensure the current directory is in the python path\n",
+    "sys.path.append(os.path.abspath('.'))\n",
+    "\n",
+    "from www.services.api_retriever import OpenAlexRetriever\n",
+    "from www.services.standardizer import OpenAlexStandardizer\n",
+    "from www.services.validator import validate_dataframe"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Phase 1: EXTRACT\n",
+    "Fetch data from OpenAlex API. We query for \"machine learning\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Querying OpenAlex for 'machine learning'...\n",
+      "Retrieved 10 works.\n"
+     ]
+    }
+   ],
+   "source": [
+    "retriever = OpenAlexRetriever(email=\"student@example.com\")\n",
+    "query = \"machine learning\"\n",
+    "print(f\"Querying OpenAlex for '{query}'...\")\n",
+    "\n",
+    "raw_data = retriever.fetch(query, max_results=10)\n",
+    "print(f\"Retrieved {len(raw_data)} works.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Phase 2-4: TRANSFORM\n",
+    "We pass the raw JSON data to the standardizer, which maps the fields, handles nulls, and calculates the Short Reference (SR)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Standardized DataFrame Shape: (10, 24)\n"
+     ]
+    }
+   ],
+   "source": [
+    "standardizer = OpenAlexStandardizer()\n",
+    "df = standardizer.standardize(raw_data)\n",
+    "\n",
+    "print(f\"Standardized DataFrame Shape: {df.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Phase 5: VALIDATION\n",
+    "We run the validation module to ensure that all mandatory columns exist, no nulls remain, and multi-value fields are typed as lists."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Success! DataFrame matches target schema and has passed validation.\n"
+     ]
+    }
+   ],
+   "source": [
+    "is_valid = validate_dataframe(df)\n",
+    "if is_valid:\n",
+    "    print(\"Success! DataFrame matches target schema and has passed validation.\")\n",
+    "else:\n",
+    "    print(\"Validation failed.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Output Preview\n",
+    "Finally, we preview the first few rows of the standardized data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>DB</th>\n",
+       "      <th>UT</th>\n",
+       "      <th>DI</th>\n",
+       "      <th>PMID</th>\n",
+       "      <th>TI</th>\n",
+       "      <th>SO</th>\n",
+       "      <th>JI</th>\n",
+       "      <th>PY</th>\n",
+       "      <th>DT</th>\n",
+       "      <th>LA</th>\n",
+       "      <th>TC</th>\n",
+       "      <th>AU</th>\n",
+       "      <th>AF</th>\n",
+       "      <th>C1</th>\n",
+       "      <th>RP</th>\n",
+       "      <th>CR</th>\n",
+       "      <th>DE</th>\n",
+       "      <th>ID</th>\n",
+       "      <th>AB</th>\n",
+       "      <th>VL</th>\n",
+       "      <th>IS</th>\n",
+       "      <th>BP</th>\n",
+       "      <th>EP</th>\n",
+       "      <th>SR</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>OPENALEX</td>\n",
+       "      <td>https://openalex.org/W2101234009</td>\n",
+       "      <td>10.48550/arxiv.1201.0490</td>\n",
+       "      <td></td>\n",
+       "      <td>Scikit-learn: Machine Learning in Python</td>\n",
+       "      <td>arXiv (Cornell University)</td>\n",
+       "      <td>Cornell University</td>\n",
+       "      <td>2012</td>\n",
+       "      <td>preprint</td>\n",
+       "      <td>en</td>\n",
+       "      <td>63665</td>\n",
+       "      <td>[Pedregosa, F., Varoquaux, G., Gramfort, A., M...</td>\n",
+       "      <td>[Fabián Pedregosa, Gaël Varoquaux, Alexandre G...</td>\n",
+       "      <td>[Commissariat à l'Énergie Atomique et aux Éner...</td>\n",
+       "      <td></td>\n",
+       "      <td>[https://openalex.org/W1496508106, https://ope...</td>\n",
+       "      <td>[Python (programming language), Documentation,...</td>\n",
+       "      <td>[Python (programming language), Documentation,...</td>\n",
+       "      <td>Scikit-learn is a Python module integrating a ...</td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>OPENALEX</td>\n",
+       "      <td>https://openalex.org/W3023540311</td>\n",
+       "      <td>10.5860/choice.27-0936</td>\n",
+       "      <td></td>\n",
+       "      <td>Genetic algorithms in search, optimization, an...</td>\n",
+       "      <td>Choice Reviews Online</td>\n",
+       "      <td>Association of College and Research Libraries</td>\n",
+       "      <td>1989</td>\n",
+       "      <td>article</td>\n",
+       "      <td>en</td>\n",
+       "      <td>49332</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>[]</td>\n",
+       "      <td></td>\n",
+       "      <td>[]</td>\n",
+       "      <td>[Computer science, Artificial intelligence, Ma...</td>\n",
+       "      <td>[Computer science, Artificial intelligence, Ma...</td>\n",
+       "      <td>From the Publisher:\\r\\nThis book brings togeth...</td>\n",
+       "      <td>27</td>\n",
+       "      <td>02</td>\n",
+       "      <td>27</td>\n",
+       "      <td>0936</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         DB                                UT                        DI PMID  \\\n",
+       "0  OPENALEX  https://openalex.org/W2101234009  10.48550/arxiv.1201.0490        \n",
+       "1  OPENALEX  https://openalex.org/W3023540311    10.5860/choice.27-0936        \n",
+       "\n",
+       "                                                  TI  \\\n",
+       "0           Scikit-learn: Machine Learning in Python   \n",
+       "1  Genetic algorithms in search, optimization, an...   \n",
+       "\n",
+       "                           SO                                             JI  \\\n",
+       "0  arXiv (Cornell University)                             Cornell University   \n",
+       "1       Choice Reviews Online  Association of College and Research Libraries   \n",
+       "\n",
+       "     PY        DT  LA     TC  \\\n",
+       "0  2012  preprint  en  63665   \n",
+       "1  1989   article  en  49332   \n",
+       "\n",
+       "                                                  AU  \\\n",
+       "0  [Pedregosa, F., Varoquaux, G., Gramfort, A., M...   \n",
+       "1                                                 []   \n",
+       "\n",
+       "                                                  AF  \\\n",
+       "0  [Fabián Pedregosa, Gaël Varoquaux, Alexandre G...   \n",
+       "1                                                 []   \n",
+       "\n",
+       "                                                  C1 RP  \\\n",
+       "0  [Commissariat à l'Énergie Atomique et aux Éner...      \n",
+       "1                                                 []      \n",
+       "\n",
+       "                                                  CR  \\\n",
+       "0  [https://openalex.org/W1496508106, https://ope...   \n",
+       "1                                                 []   \n",
+       "\n",
+       "                                                  DE  \\\n",
+       "0  [Python (programming language), Documentation,...   \n",
+       "1  [Computer science, Artificial intelligence, Ma...   \n",
+       "\n",
+       "                                                  ID  \\\n",
+       "0  [Python (programming language), Documentation,...   \n",
+       "1  [Computer science, Artificial intelligence, Ma...   \n",
+       "\n",
+       "                                                  AB  VL  IS  BP    EP SR  \n",
+       "0  Scikit-learn is a Python module integrating a ...                       \n",
+       "1  From the Publisher:\\r\\nThis book brings togeth...  27  02  27  0936     "
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "pd.set_option('display.max_columns', None)\n",
+    "df.head(2)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/www/services/__init__.py b/www/services/__init__.py
index 28584e105..8163dc1de 100644
--- a/www/services/__init__.py
+++ b/www/services/__init__.py
@@ -1,3 +1,4 @@
+from .api_retriever import *
 from .biblionetwork import *
 from .cocmatrix import *
 from .couplingmap import *
@@ -11,7 +12,9 @@
 from .parsers import *
 from .plotlydownload import *
 from .savereport import *
+from .standardizer import *
 from .tabletag import *
 from .termextraction import *
 from .thematicmap import *
-from .utils import *
\ No newline at end of file
+from .utils import *
+from .validator import *
\ No newline at end of file
diff --git a/www/services/api_retriever.py b/www/services/api_retriever.py
new file mode 100644
index 000000000..c64db2527
--- /dev/null
+++ b/www/services/api_retriever.py
@@ -0,0 +1,85 @@
+import requests
+import time
+
+class OpenAlexRetriever:
+    """
+    Extract Phase: OpenAlex API Retriever.
+
+    Connects to the OpenAlex REST API so that no manual CSV download is needed.
+    It sends the HTTP requests, walks the paginated search results, and retries
+    rate-limited (HTTP 429) requests with exponential backoff.
+    """
+    BASE_URL = "https://api.openalex.org/works"
+    REQUEST_TIMEOUT = 30  # seconds allowed per HTTP request
+
+    def __init__(self, email: str = "example@example.com"):
+        """
+        Initializes the retriever and sets up the polite pool.
+
+        Args:
+            email (str): An email address used to access OpenAlex's polite pool
+                         for faster response times and better rate limits.
+        """
+        self.email = email
+        self.session = requests.Session()
+        # Adding email to the User-Agent registers the request with the polite pool
+        self.session.headers.update({"User-Agent": f"mailto:{self.email}"})
+
+    def fetch(self, query: str, max_results: int = 100) -> list:
+        """
+        Fetches metadata from OpenAlex for a given textual query.
+
+        Pages through the search results until ``max_results`` documents have
+        been collected or the API has nothing more to return. If a page still
+        cannot be fetched after the retries, paging stops and the documents
+        collected so far are returned.
+
+        Args:
+            query (str): The search term (e.g., "machine learning").
+            max_results (int): The maximum number of documents to retrieve.
+
+        Returns:
+            list: Raw OpenAlex documents (dicts), at most ``max_results`` long.
+
+        Raises:
+            requests.RequestException: On a network failure or request timeout.
+        """
+        results = []
+        # OpenAlex allows a maximum of 200 per page, but we use 50 to ensure stable loads
+        per_page = min(50, max_results)
+        retries = 3
+        page = 1
+
+        while len(results) < max_results:
+            params = {"search": query, "per-page": per_page, "page": page}
+            works = None  # stays None when every attempt for this page fails
+
+            # Back off and retry only while OpenAlex answers with HTTP 429
+            for attempt in range(retries):
+                # The timeout keeps a stalled connection from blocking the caller forever
+                response = self.session.get(self.BASE_URL, params=params, timeout=self.REQUEST_TIMEOUT)
+                if response.status_code == 200:
+                    works = response.json().get("results", [])
+                    break  # Success, break out of retry loop
+                if response.status_code == 429:
+                    print(f"[Warning] Rate limited by OpenAlex. Retrying in {2 ** attempt} seconds...")
+                    time.sleep(2 ** attempt)
+                else:
+                    print(f"[Error] API Error {response.status_code}: {response.text}")
+                    break  # Stop retrying on permanent errors
+
+            if works is None:
+                # Without this exit the loop would request page after page forever,
+                # because a failing API never makes `results` grow.
+                print(f"[Error] Giving up on page {page}; returning {len(results)} works.")
+                break
+            if not works:
+                break  # No more results available in the database
+
+            results.extend(works)
+            page += 1
+            # Rate limit handling: Sleep slightly to respect polite pool limits
+            time.sleep(0.1)
+
+        # Truncate if we fetched slightly more than max_results due to page sizes
+        return results[:max_results]
diff --git a/www/services/standardizer.py b/www/services/standardizer.py
new file mode 100644
index 000000000..db7f1febc
--- /dev/null
+++ b/www/services/standardizer.py
@@ -0,0 +1,161 @@
+import pandas as pd
+from www.services.format_functions import format_sr_column
+
+class OpenAlexStandardizer:
+    """
+    Phase 2 & 4: Transform & Calculate Fields (Standardizer).
+
+    Maps the nested JSON records returned by the OpenAlex API onto the flat,
+    WoS-style field tags (AU, TI, SO, PY, ...) used by Bibliometrix-Python.
+    """
+    # Column order of the standardized frame (also used for empty input)
+    COLUMNS = ["DB", "UT", "DI", "PMID", "TI", "SO", "JI", "PY", "DT", "LA", "TC", "AU",
+               "AF", "C1", "RP", "CR", "DE", "ID", "AB", "VL", "IS", "BP", "EP", "SR"]
+
+    @staticmethod
+    def _reconstruct_abstract(inverted_index: dict) -> str:
+        """
+        Rebuilds the abstract text from an OpenAlex inverted index.
+
+        The index maps each word to the positions where it occurs, e.g.
+        {"The": [0], "quick": [1], ...}. Returns "" for a missing or empty
+        index; positions that no word claims are left as empty strings.
+        """
+        if not inverted_index:
+            return ""
+        # The highest position determines how many word slots are needed
+        max_pos = max((pos for positions in inverted_index.values() for pos in positions), default=-1)
+        if max_pos == -1:
+            return ""
+
+        words = [""] * (max_pos + 1)
+        for word, positions in inverted_index.items():
+            for pos in positions:
+                words[pos] = word
+        return " ".join(words)
+
+    @staticmethod
+    def _format_authors(authorships: list) -> tuple:
+        """Returns (AU list, AF list): abbreviated and full author names."""
+        au = []
+        af = []
+        # Null authorships/authors/names are tolerated via the `or` fallbacks
+        for authorship in authorships or []:
+            author = authorship.get("author") or {}
+            name = (author.get("display_name") or "").strip()
+            if not name:
+                continue
+
+            af.append(name)
+            # Convert to "Surname, Initials"
+            parts = name.split()
+            if len(parts) > 1:
+                surname = parts[-1]
+                initials = " ".join([p[0].upper() + "." for p in parts[:-1]])
+                au.append(f"{surname}, {initials}")
+            else:
+                au.append(f"{name},")
+
+        return au, af
+
+    @staticmethod
+    def _format_affiliations(authorships: list) -> list:
+        """Returns the unique institution names in first-seen order."""
+        affiliations = []
+        for authorship in authorships or []:
+            for inst in authorship.get("institutions") or []:
+                inst_name = inst.get("display_name") or ""
+                if inst_name and inst_name not in affiliations:
+                    affiliations.append(inst_name)
+        return affiliations
+
+    def standardize(self, raw_data: list) -> pd.DataFrame:
+        """
+        Maps raw OpenAlex JSON items to WoS Standard Schema.
+
+        Args:
+            raw_data (list): Raw OpenAlex work dicts. Fields that are absent
+                or null are mapped to "" / [] instead of raising.
+
+        Returns:
+            pd.DataFrame: One row per work with the columns in COLUMNS. PY is
+            numeric (NaN when the year is missing). Empty input yields an
+            empty frame with the same columns.
+        """
+        records = []
+        for item in raw_data or []:
+            # OpenAlex can send explicit nulls (e.g. "primary_location": null), so
+            # `x.get(key) or default` is used instead of `x.get(key, default)`.
+            authorships = item.get("authorships") or []
+            au, af = self._format_authors(authorships)
+            c1 = self._format_affiliations(authorships)
+
+            cr = [str(ref) for ref in (item.get("referenced_works") or [])]
+            # Entries without a display name are dropped so the lists hold only strings
+            de = [kw["display_name"] for kw in (item.get("keywords") or []) if kw.get("display_name")]
+            id_kw = [c["display_name"] for c in (item.get("concepts") or []) if c.get("display_name")]
+
+            abstract = self._reconstruct_abstract(item.get("abstract_inverted_index"))
+            biblio = item.get("biblio") or {}
+            source_info = (item.get("primary_location") or {}).get("source") or {}
+            # Keep only the part of the PMID after the last "/"
+            pmid = str((item.get("ids") or {}).get("pmid") or "").split("/")[-1]
+
+            record = {
+                "DB": "OPENALEX",
+                "UT": str(item.get("id", "")),
+                "DI": str(item.get("doi", "") or "").replace("https://doi.org/", ""),
+                "PMID": pmid,
+                "TI": str(item.get("title", "") or ""),
+                "SO": str(source_info.get("display_name", "") or ""),
+                "JI": str(source_info.get("host_organization_name", "") or ""),
+                "PY": str(item.get("publication_year", "") or ""),
+                "DT": str(item.get("type", "") or ""),
+                "LA": str(item.get("language", "") or ""),
+                "TC": int(item.get("cited_by_count", 0) or 0),
+                "AU": au,
+                "AF": af,
+                "C1": c1,
+                "RP": "",
+                "CR": cr,
+                "DE": de,
+                "ID": id_kw,
+                "AB": abstract,
+                "VL": str(biblio.get("volume", "") or ""),
+                "IS": str(biblio.get("issue", "") or ""),
+                "BP": str(biblio.get("first_page", "") or ""),
+                "EP": str(biblio.get("last_page", "") or "")
+            }
+            records.append(record)
+
+        if not records:
+            return pd.DataFrame(columns=self.COLUMNS)
+        df = pd.DataFrame(records)
+
+        # Convert PY to numeric for plotting (Annual Scientific Production expects numbers)
+        df['PY'] = pd.to_numeric(df['PY'], errors='coerce')
+        # Calculate SR using the existing function
+        df['SR'] = df.apply(self._calculate_sr, axis=1)
+        return df
+
+    def _calculate_sr(self, row: pd.Series) -> str:
+        """
+        Builds the Short Reference (SR) for one row by calling the existing
+        format_sr_column function with a mocked raw Web of Science entry.
+        Returns "" (after printing a warning) when that call fails.
+        """
+        # format_sr_column is fed list-valued AU, PY and SO fields.
+        au_raw = [row['AU'][0]] if row['AU'] else ["Unknown, U."]
+        # PY is numeric here and NaN is truthy, so test it with pd.notna. Raw WoS
+        # fields are presumably text, hence the integer string -- TODO confirm
+        # against format_sr_column.
+        py_raw = [str(int(row['PY']))] if pd.notna(row['PY']) else [""]
+        so_raw = [row['SO']] if row['SO'] else [""]
+        dummy_entry = {'AU': au_raw, 'PY': py_raw, 'SO': so_raw}
+
+        try:
+            return format_sr_column(dummy_entry, 'Web_of_Science', '.txt')
+        except Exception as e:
+            # Best effort: keep the row, but do not hide the failure
+            print(f"[Warning] Could not build SR for {row.get('UT', '')!r}: {e}")
+            return ""
diff --git a/www/services/validator.py b/www/services/validator.py
new file mode 100644
index 000000000..64fad4a24
--- /dev/null
+++ b/www/services/validator.py
@@ -0,0 +1,53 @@
+import pandas as pd
+import numpy as np
+
+MANDATORY_COLUMNS = [
+    'DB', 'UT', 'DI', 'PMID', 'TI', 'SO', 'JI', 'PY', 'DT', 'LA', 'TC',
+    'AU', 'AF', 'C1', 'RP', 'CR', 'DE', 'ID', 'AB', 'VL', 'IS', 'BP', 'EP', 'SR'
+]
+
+MULTI_VALUE_COLUMNS = ['AU', 'AF', 'C1', 'CR', 'DE', 'ID']
+
+def validate_dataframe(df: pd.DataFrame) -> bool:
+    """
+    Phase 5: Validation.
+
+    Checks a standardized DataFrame against the target schema and prints one
+    "[Validation Error]" line per failed check.
+
+    Validations performed:
+    1. Existence: every column in MANDATORY_COLUMNS must be present.
+    2. Null Handling: no cell may be NaN or None.
+    3. Type Contracts: every cell of a MULTI_VALUE_COLUMNS column must be a
+       Python list whose elements are all strings (list[str]).
+
+    Args:
+        df (pd.DataFrame): The standardized DataFrame to check.
+
+    Returns:
+        bool: True if every check passes, False otherwise.
+    """
+    is_valid = True
+
+    # 1. Check for all mandatory columns (Existence)
+    missing_cols = [col for col in MANDATORY_COLUMNS if col not in df.columns]
+    if missing_cols:
+        print(f"[Validation Error] Missing mandatory columns: {missing_cols}")
+        is_valid = False
+
+    # 2. Check for NaN/None values (Null Handling)
+    if df.isnull().values.any():
+        print("[Validation Error] NaN or None values found in the DataFrame. These are not permitted.")
+        is_valid = False
+
+    # 3. Check types for Multi-value fields (Type Contracts)
+    for col in MULTI_VALUE_COLUMNS:
+        if col not in df.columns:
+            continue  # already reported as missing above
+        # isnull() does not look inside lists, so a None element is caught here
+        conforms = df[col].map(lambda x: isinstance(x, list) and all(isinstance(v, str) for v in x))
+        if not conforms.all():
+            print(f"[Validation Error] Type Contract violation: Column '{col}' must contain only lists of strings.")
+            is_valid = False
+
+    return is_valid