From 48f002d7031d5058add2dfb1315e6ac3712e3036 Mon Sep 17 00:00:00 2001 From: NihalNawaz Date: Tue, 2 Jun 2026 00:06:30 +0200 Subject: [PATCH] Complete ETL Pipeline (Advanced Level) --- app.py | 122 +++++++++---- execution_evidence.ipynb | 315 ++++++++++++++++++++++++++++++++++ www/services/__init__.py | 5 +- www/services/api_retriever.py | 85 +++++++++ www/services/standardizer.py | 161 +++++++++++++++++ www/services/validator.py | 53 ++++++ 6 files changed, 708 insertions(+), 33 deletions(-) create mode 100644 execution_evidence.ipynb create mode 100644 www/services/api_retriever.py create mode 100644 www/services/standardizer.py create mode 100644 www/services/validator.py diff --git a/app.py b/app.py index f0891f894..79a880f70 100644 --- a/app.py +++ b/app.py @@ -47,11 +47,11 @@ # Import necessary libraries for better performance - avoid importing everything -import tempfile import os import requests import functools from datetime import datetime +from pathlib import Path import pandas as pd import io from functions import * @@ -64,9 +64,8 @@ from shinywidgets import render_widget from shiny.express import ui, input, render -# Setup the Directory for static assets - optimized for performance -base_dir = tempfile.gettempdir() # Use system temp dir instead of creating new temp file -express.app_opts(static_assets=base_dir, debug=False) +# Setup the directory for static assets relative to the app file. +app_root = Path(__file__).resolve().parent # --- Toggle button --- # This button toggles the visibility of the sidebar(s) in the UI. @@ -81,7 +80,7 @@ # --- UI and UX experience --- # Include custom CSS for the app's appearance. -ui.include_css("www/static/biblioshiny.css") +ui.include_css(app_root / "www/static/biblioshiny.css") # --- Header --- # The header bar contains the logo, app name, and a set of dropdown menus for notifications, help, donations, and credits. @@ -252,29 +251,31 @@ def get_latest_cran_version(): # --- Welcome/Info Page --- with ui.nav_panel("None", value="info"): - ui.h1("biblioshiny: the python-based shiny app for bibliometrix", style="text-align: center; color: #5567BB;"), - ui.div( - ui.img(src="https://www.bibliometrix.org/logo_new.png", class_="logo", width="400px"), - style="text-align: center;" - ), - ui.div( - ui.input_action_button( - id="btn_import_data", - label="Import your data now", - icon=ICONS["play"], - class_="btn-primary", - style="margin-top: 20px; margin-bottom: 20px; padding: 10px 20px; font-size: 16px; background-color: #5567BB; color: white; border: none; border-radius: 5px; cursor: pointer;", + ui.tags.div( + ui.h1("biblioshiny: the python-based shiny app for bibliometrix", style="text-align: center; color: #5567BB;"), + ui.div( + ui.img(src="https://www.bibliometrix.org/logo_new.png", class_="logo", width="400px"), + style="text-align: center;" ), - ui.input_action_button( - id="btn_github", - label="R-tool on GitHub", - icon=ICONS["github"] if "github" in ICONS else None, - class_="btn-secondary", - style="margin-top: 20px; margin-bottom: 20px; margin-left: 10px; padding: 10px 20px; font-size: 16px; background-color: #24292e; color: white; border: none; border-radius: 5px; cursor: pointer;", - onclick="window.open('https://github.com/massimoaria/bibliometrix', '_blank')", + ui.div( + ui.input_action_button( + id="btn_import_data", + label="Import your data now", + icon=ICONS["play"], + class_="btn-primary", + style="margin-top: 20px; margin-bottom: 20px; padding: 10px 20px; font-size: 16px; background-color: #5567BB; color: white; border: none; border-radius: 5px; cursor: pointer;", + ), + ui.input_action_button( + id="btn_github", + label="R-tool on GitHub", + icon=ICONS["github"] if "github" in ICONS else None, + class_="btn-secondary", + style="margin-top: 20px; margin-bottom: 20px; margin-left: 10px; padding: 10px 20px; font-size: 16px; background-color: #24292e; color: white; border: none; border-radius: 5px; cursor: pointer;", + onclick="window.open('https://github.com/massimoaria/bibliometrix', '_blank')", + ), + style="text-align: center;" ), - style="text-align: center;" - ), + ) ui.markdown( """
@@ -586,7 +587,7 @@ def reset_all_analyses(): report_choices = reactive.Value({}) report_excel = reactive.Value(io.BytesIO()) - selection = reactive.Value([]) + selection = reactive.Value(()) dpi = reactive.Value(300) height = reactive.Value(7) gemini_api_key = reactive.Value("") @@ -802,6 +803,8 @@ def show_missing_data_report(): @reactive.event(input.save_modal_completeness) def save_dataframe_image(): _, _, fig = get_table(database, df, dpi=dpi.get(), modal=False) + if fig is None: + return ui.notification_show("⚠️ No data is loaded yet.", duration=5, close_button=False) fig.write_image(completeness_table_image_path) return ui.notification_show(f"✅ Missing data image saved into {completeness_table_image_path}", duration=5, close_button=False) @@ -854,7 +857,62 @@ def indicator_types_ui_all(): ), with ui.nav_panel("None", value="API"): - ui.h3("🚧 Warning: API is under construction 🚧") + ui.h3("🌐 Live API Extraction (OpenAlex)", style="color: #5567BB;") + ui.p("Query the OpenAlex database directly and automatically convert results to the standardized format.") + + with ui.layout_sidebar(fillable=False, fill=False): + with ui.sidebar(id="sidebar_api_data", position="right"): + ui.h5("API Search", style="color: #5567BB;") + ui.input_text("api_query", "Search Query:", placeholder="e.g., machine learning") + ui.input_numeric("api_max_results", "Max Results:", value=50, min=10, max=500, step=10) + ui.input_action_button("api_search_btn", "Search OpenAlex", icon=ICONS["play"]) + ui.p("This will fetch data, apply standardization, and load it into the application.", style="color: gray; font-size: 10px;") + + @reactive.effect + @reactive.event(input.api_search_btn) + def execute_api_search(): + query = input.api_query() + max_results = input.api_max_results() + + if not query: + ui.notification_show("⚠️ Please enter a search query.", duration=5, type="warning") + return + + ui.modal_show(create_loading_modal("API data")) + + try: + # 1. Extract + retriever = OpenAlexRetriever() + raw_data = retriever.fetch(query, max_results=max_results) + + if not raw_data: + ui.notification_show("⚠️ No results found.", duration=5, type="warning") + return + + # 2. Transform (Standardize) + standardizer = OpenAlexStandardizer() + standardized_df = standardizer.standardize(raw_data) + + # 3. Load + df.set(standardized_df) + reset_all_analyses() + + ui.notification_show(f"✅ Successfully loaded {len(standardized_df)} documents!", duration=5, type="message") + except Exception as e: + ui.notification_show(f"❌ Error during API extraction: {str(e)}", duration=10, type="error") + finally: + ui.modal_remove() + + @render.express + def show_api_data_table(): + data = df.get() + if data is not None and len(data) > 0 and 'DB' in data.columns and (data['DB'] == 'OPENALEX').any(): + ui.h4("Preview of Standardized Data", style="color: #5567BB;") + ui.p(f"Showing the first {min(5, len(data))} rows:") + preview_df = data[['UT', 'TI', 'AU', 'PY', 'SO', 'SR']].head(5) + ui.HTML(preview_df.to_html(classes="table table-striped table-hover", index=False)) + elif data is None: + ui.p("No data loaded via API yet. Use the sidebar to search OpenAlex.") with ui.nav_panel("None", value="collections"): ui.h3("🚧 Warning: Merge Collection is under construction 🚧") @@ -8185,9 +8243,8 @@ def update_plot_settings(): # --- Sidebar Management --- @render.express() -@reactive.event(input.start_button) def toggle_sidebar(): - with ui.tags.div(id="sidebar_2", class_="custom-sidebar"): + with ui.tags.div(id="sidebar_2", class_="custom-sidebar sidebar-hidden"): with ui.accordion(id="sidebar_accordion_data", multiple=False, open=False): # Info Section with ui.accordion_panel("Biblioshiny", icon=ICONS["home_colored"]): @@ -8344,9 +8401,10 @@ def toggle_sidebar(): }); observer.observe(document.body, { childList: true, subtree: true }); - // Show both sidebars when 'start_button' is clicked + // Show both sidebars when 'start_button' or 'api_search_btn' is clicked document.addEventListener("click", function(e) { - if (e.target && e.target.id === "start_button") { + const btn = e.target.closest('button'); + if (btn && (btn.id === "start_button" || btn.id === "api_search_btn")) { setSidebarState(true); } }); diff --git a/execution_evidence.ipynb b/execution_evidence.ipynb new file mode 100644 index 000000000..7307dabf5 --- /dev/null +++ b/execution_evidence.ipynb @@ -0,0 +1,315 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bibliometrix-Python ETL Execution Evidence\n", + "\n", + "This notebook demonstrates the execution of the ETL pipeline for OpenAlex data. We will fetch data, standardize it, and validate the resulting DataFrame against the Bibliometrix schema." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Ensure the current directory is in the python path\n", + "sys.path.append(os.path.abspath('.'))\n", + "\n", + "from www.services.api_retriever import OpenAlexRetriever\n", + "from www.services.standardizer import OpenAlexStandardizer\n", + "from www.services.validator import validate_dataframe" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Phase 1: EXTRACT\n", + "Fetch data from OpenAlex API. We query for \"machine learning\"." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Querying OpenAlex for 'machine learning'...\n", + "Retrieved 10 works.\n" + ] + } + ], + "source": [ + "retriever = OpenAlexRetriever(email=\"student@example.com\")\n", + "query = \"machine learning\"\n", + "print(f\"Querying OpenAlex for '{query}'...\")\n", + "\n", + "raw_data = retriever.fetch(query, max_results=10)\n", + "print(f\"Retrieved {len(raw_data)} works.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Phase 2-4: TRANSFORM\n", + "We pass the raw JSON data to the standardizer, which maps the fields, handles nulls, and calculates the Short Reference (SR)." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Standardized DataFrame Shape: (10, 24)\n" + ] + } + ], + "source": [ + "standardizer = OpenAlexStandardizer()\n", + "df = standardizer.standardize(raw_data)\n", + "\n", + "print(f\"Standardized DataFrame Shape: {df.shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Phase 5: VALIDATION\n", + "We run the validation module to ensure that all mandatory columns exist, no nulls remain, and multi-value fields are typed as lists." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Success! DataFrame matches target schema and has passed validation.\n" + ] + } + ], + "source": [ + "is_valid = validate_dataframe(df)\n", + "if is_valid:\n", + " print(\"Success! DataFrame matches target schema and has passed validation.\")\n", + "else:\n", + " print(\"Validation failed.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Output Preview\n", + "Finally, we preview the first few rows of the standardized data." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DBUTDIPMIDTISOJIPYDTLATCAUAFC1RPCRDEIDABVLISBPEPSR
0OPENALEXhttps://openalex.org/W210123400910.48550/arxiv.1201.0490Scikit-learn: Machine Learning in PythonarXiv (Cornell University)Cornell University2012preprinten63665[Pedregosa, F., Varoquaux, G., Gramfort, A., M...[Fabián Pedregosa, Gaël Varoquaux, Alexandre G...[Commissariat à l'Énergie Atomique et aux Éner...[https://openalex.org/W1496508106, https://ope...[Python (programming language), Documentation,...[Python (programming language), Documentation,...Scikit-learn is a Python module integrating a ...
1OPENALEXhttps://openalex.org/W302354031110.5860/choice.27-0936Genetic algorithms in search, optimization, an...Choice Reviews OnlineAssociation of College and Research Libraries1989articleen49332[][][][][Computer science, Artificial intelligence, Ma...[Computer science, Artificial intelligence, Ma...From the Publisher:\\r\\nThis book brings togeth...2702270936
\n", + "
" + ], + "text/plain": [ + " DB UT DI PMID \\\n", + "0 OPENALEX https://openalex.org/W2101234009 10.48550/arxiv.1201.0490 \n", + "1 OPENALEX https://openalex.org/W3023540311 10.5860/choice.27-0936 \n", + "\n", + " TI \\\n", + "0 Scikit-learn: Machine Learning in Python \n", + "1 Genetic algorithms in search, optimization, an... \n", + "\n", + " SO JI \\\n", + "0 arXiv (Cornell University) Cornell University \n", + "1 Choice Reviews Online Association of College and Research Libraries \n", + "\n", + " PY DT LA TC \\\n", + "0 2012 preprint en 63665 \n", + "1 1989 article en 49332 \n", + "\n", + " AU \\\n", + "0 [Pedregosa, F., Varoquaux, G., Gramfort, A., M... \n", + "1 [] \n", + "\n", + " AF \\\n", + "0 [Fabián Pedregosa, Gaël Varoquaux, Alexandre G... \n", + "1 [] \n", + "\n", + " C1 RP \\\n", + "0 [Commissariat à l'Énergie Atomique et aux Éner... \n", + "1 [] \n", + "\n", + " CR \\\n", + "0 [https://openalex.org/W1496508106, https://ope... \n", + "1 [] \n", + "\n", + " DE \\\n", + "0 [Python (programming language), Documentation,... \n", + "1 [Computer science, Artificial intelligence, Ma... \n", + "\n", + " ID \\\n", + "0 [Python (programming language), Documentation,... \n", + "1 [Computer science, Artificial intelligence, Ma... \n", + "\n", + " AB VL IS BP EP SR \n", + "0 Scikit-learn is a Python module integrating a ... \n", + "1 From the Publisher:\\r\\nThis book brings togeth... 27 02 27 0936 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "pd.set_option('display.max_columns', None)\n", + "df.head(2)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/www/services/__init__.py b/www/services/__init__.py index 28584e105..8163dc1de 100644 --- a/www/services/__init__.py +++ b/www/services/__init__.py @@ -1,3 +1,4 @@ +from .api_retriever import * from .biblionetwork import * from .cocmatrix import * from .couplingmap import * @@ -11,7 +12,9 @@ from .parsers import * from .plotlydownload import * from .savereport import * +from .standardizer import * from .tabletag import * from .termextraction import * from .thematicmap import * -from .utils import * \ No newline at end of file +from .utils import * +from .validator import * \ No newline at end of file diff --git a/www/services/api_retriever.py b/www/services/api_retriever.py new file mode 100644 index 000000000..c64db2527 --- /dev/null +++ b/www/services/api_retriever.py @@ -0,0 +1,85 @@ +import requests +import time + +class OpenAlexRetriever: + """ + Extract Phase: OpenAlex API Retriever. + + This class is responsible for connecting to the OpenAlex REST API, bypassing the + need for manual CSV downloads. It automates the data extraction process by + handling HTTP requests, pagination, and rate-limiting (retries) dynamically. + """ + BASE_URL = "https://api.openalex.org/works" + + def __init__(self, email: str = "example@example.com"): + """ + Initializes the retriever and sets up the polite pool. + + Args: + email (str): An email address used to access OpenAlex's polite pool + for faster response times and better rate limits. + """ + self.email = email + self.session = requests.Session() + + # Adding email to the User-Agent registers the request with the polite pool + self.session.headers.update({"User-Agent": f"mailto:{self.email}"}) + + def fetch(self, query: str, max_results: int = 100) -> list: + """ + Fetches metadata from OpenAlex for a given textual query. + + This method fully automates extraction by looping through paginated results + until the desired max_results limit is reached. + + Args: + query (str): The search term (e.g., "machine learning"). + max_results (int): The maximum number of documents to retrieve. + + Returns: + list: A list of dictionaries, where each dictionary is a raw OpenAlex document. + """ + results = [] + # OpenAlex allows a maximum of 200 per page, but we use 50 to ensure stable loads + per_page = min(50, max_results) + page = 1 + + while len(results) < max_results: + params = { + "search": query, + "per-page": per_page, + "page": page + } + + # Rate limit and network error handling + retries = 3 + for attempt in range(retries): + response = self.session.get(self.BASE_URL, params=params) + + if response.status_code == 200: + data = response.json() + works = data.get("results", []) + + if not works: + return results # No more results available in the database + + results.extend(works) + break # Success, break out of retry loop + + elif response.status_code == 429: + print(f"[Warning] Rate limited by OpenAlex. Retrying in {2 ** attempt} seconds...") + time.sleep(2 ** attempt) + else: + print(f"[Error] API Error {response.status_code}: {response.text}") + break # Stop retrying on permanent errors + + page += 1 + # Rate limit handling: Sleep slightly to respect polite pool limits + time.sleep(0.1) + + # Truncate if we fetched slightly more than max_results due to page sizes + if len(results) >= max_results: + results = results[:max_results] + break + + return results diff --git a/www/services/standardizer.py b/www/services/standardizer.py new file mode 100644 index 000000000..db7f1febc --- /dev/null +++ b/www/services/standardizer.py @@ -0,0 +1,161 @@ +import pandas as pd +from www.services.format_functions import format_sr_column + +class OpenAlexStandardizer: + """ + Phase 2 & 4: Transform & Calculate Fields (Standardizer). + + This class handles the Transformation phase of the ETL pipeline. It maps the + proprietary, deeply-nested JSON structure returned by the OpenAlex API into + the flat, strict Web of Science (WoS) format required by Bibliometrix-Python. + + It implements the 'Lookup Strategy' to map column names and enforce Data Types. + """ + + @staticmethod + def _reconstruct_abstract(inverted_index: dict) -> str: + """ + OpenAlex abstracts are provided as inverted indices (for copyright reasons). + This helper parses the inverted index dictionary and reconstructs the full + abstract string. + """ + if not inverted_index: + return "" + # The inverted index maps words to list of positions + # e.g. {"The": [0], "quick": [1], ...} + # Find the max position + max_pos = max([pos for positions in inverted_index.values() for pos in positions], default=-1) + if max_pos == -1: + return "" + + words = [""] * (max_pos + 1) + for word, positions in inverted_index.items(): + for pos in positions: + words[pos] = word + return " ".join(words) + + @staticmethod + def _format_authors(authorships: list) -> tuple: + """Returns (AU list, AF list)""" + au = [] + af = [] + for authorship in authorships: + author = authorship.get("author", {}) + name = author.get("display_name", "") + if not name: + continue + + af.append(name) + + # Convert to "Surname, Initials" + parts = name.split() + if len(parts) > 1: + surname = parts[-1] + initials = " ".join([p[0].upper() + "." for p in parts[:-1]]) + au.append(f"{surname}, {initials}") + else: + au.append(f"{name},") + + return au, af + + @staticmethod + def _format_affiliations(authorships: list) -> list: + affiliations = [] + for authorship in authorships: + institutions = authorship.get("institutions", []) + for inst in institutions: + inst_name = inst.get("display_name", "") + if inst_name and inst_name not in affiliations: + affiliations.append(inst_name) + return affiliations + + def standardize(self, raw_data: list) -> pd.DataFrame: + """ + Maps raw OpenAlex JSON items to WoS Standard Schema. + """ + records = [] + + for item in raw_data: + # Multi-value field processing + au, af = self._format_authors(item.get("authorships", [])) + c1 = self._format_affiliations(item.get("authorships", [])) + + cr = [] + for ref in item.get("referenced_works", []): + cr.append(str(ref)) + + de = [kw.get("display_name") for kw in item.get("keywords", [])] + id_kw = [c.get("display_name") for c in item.get("concepts", [])] + + # Abstract + abstract = "" + if "abstract_inverted_index" in item and item["abstract_inverted_index"]: + abstract = self._reconstruct_abstract(item["abstract_inverted_index"]) + + biblio = item.get("biblio", {}) or {} + + pmid = "" + ids = item.get("ids", {}) + if "pmid" in ids: + pmid = ids["pmid"].split("/")[-1] + + source_info = item.get("primary_location", {}).get("source", {}) or {} + + record = { + "DB": "OPENALEX", + "UT": str(item.get("id", "")), + "DI": str(item.get("doi", "") or "").replace("https://doi.org/", ""), + "PMID": pmid, + "TI": str(item.get("title", "") or ""), + "SO": str(source_info.get("display_name", "") or ""), + "JI": str(source_info.get("host_organization_name", "") or ""), + "PY": str(item.get("publication_year", "") or ""), + "DT": str(item.get("type", "") or ""), + "LA": str(item.get("language", "") or ""), + "TC": int(item.get("cited_by_count", 0) or 0), + "AU": au, + "AF": af, + "C1": c1, + "RP": "", + "CR": cr, + "DE": de, + "ID": id_kw, + "AB": abstract, + "VL": str(biblio.get("volume", "") or ""), + "IS": str(biblio.get("issue", "") or ""), + "BP": str(biblio.get("first_page", "") or ""), + "EP": str(biblio.get("last_page", "") or "") + } + records.append(record) + + df = pd.DataFrame(records) + + # Convert PY to numeric for plotting (Annual Scientific Production expects numbers) + df['PY'] = pd.to_numeric(df['PY'], errors='coerce') + + # Calculate SR using the existing function + df['SR'] = df.apply(self._calculate_sr, axis=1) + + return df + + def _calculate_sr(self, row: pd.Series) -> str: + """ + Invokes the existing format_sr_column function from Bibliometrix-Python + by mocking the raw Web of Science format. + """ + # format_sr_column expects a Web_of_Science raw entry format where fields are lists. + # It reads AU, PY, and SO. + au_raw = [row['AU'][0]] if row['AU'] else ["Unknown, U."] + py_raw = [row['PY']] if row['PY'] else [""] + so_raw = [row['SO']] if row['SO'] else [""] + + dummy_entry = { + 'AU': au_raw, + 'PY': py_raw, + 'SO': so_raw + } + + try: + return format_sr_column(dummy_entry, 'Web_of_Science', '.txt') + except Exception as e: + return "" diff --git a/www/services/validator.py b/www/services/validator.py new file mode 100644 index 000000000..64fad4a24 --- /dev/null +++ b/www/services/validator.py @@ -0,0 +1,53 @@ +import pandas as pd +import numpy as np + +MANDATORY_COLUMNS = [ + 'DB', 'UT', 'DI', 'PMID', 'TI', 'SO', 'JI', 'PY', 'DT', 'LA', 'TC', + 'AU', 'AF', 'C1', 'RP', 'CR', 'DE', 'ID', 'AB', 'VL', 'IS', 'BP', 'EP', 'SR' +] + +MULTI_VALUE_COLUMNS = ['AU', 'AF', 'C1', 'CR', 'DE', 'ID'] + +def validate_dataframe(df: pd.DataFrame) -> bool: + """ + Phase 5: Validation. + + This function programmatically verifies the DataFrame before it is finalized + and pushed to the Shiny frontend. It guarantees that the dataset conforms + strictly to the Type Contracts defined in the project specifications. + + Validations performed: + 1. Existence: All mandatory 2- and 3-letter WoS Field Tags must exist. + 2. Null Handling: Pandas NaN or Python None values are NOT permitted. + 3. Type Contracts: Multi-value columns (like Authors, Affiliations) must + be rigorously typed as Python lists of strings (list[str]). + + Args: + df (pd.DataFrame): The standardized DataFrame to check. + + Returns: + bool: True if the DataFrame perfectly matches the target schema, False otherwise. + """ + is_valid = True + + # 1. Check for all mandatory columns (Existence) + missing_cols = [col for col in MANDATORY_COLUMNS if col not in df.columns] + if missing_cols: + print(f"[Validation Error] Missing mandatory columns: {missing_cols}") + is_valid = False + + # 2. Check for NaN/None values (Null Handling) + if df.isnull().values.any(): + print("[Validation Error] NaN or None values found in the DataFrame. These are not permitted.") + is_valid = False + + # 3. Check types for Multi-value fields (Type Contracts) + for col in MULTI_VALUE_COLUMNS: + if col in df.columns: + # Check if all elements in this multi-value column are strictly lists + non_list_mask = df[col].apply(lambda x: not isinstance(x, list)) + if non_list_mask.any(): + print(f"[Validation Error] Type Contract violation: Column '{col}' contains non-list elements.") + is_valid = False + + return is_valid