diff --git a/app.py b/app.py
index f0891f894..79a880f70 100644
--- a/app.py
+++ b/app.py
@@ -47,11 +47,11 @@
# Import necessary libraries for better performance - avoid importing everything
-import tempfile
import os
import requests
import functools
from datetime import datetime
+from pathlib import Path
import pandas as pd
import io
from functions import *
@@ -64,9 +64,8 @@
from shinywidgets import render_widget
from shiny.express import ui, input, render
-# Setup the Directory for static assets - optimized for performance
-base_dir = tempfile.gettempdir() # Use system temp dir instead of creating new temp file
-express.app_opts(static_assets=base_dir, debug=False)
+# Setup the directory for static assets relative to the app file.
+app_root = Path(__file__).resolve().parent
# --- Toggle button ---
# This button toggles the visibility of the sidebar(s) in the UI.
@@ -81,7 +80,7 @@
# --- UI and UX experience ---
# Include custom CSS for the app's appearance.
-ui.include_css("www/static/biblioshiny.css")
+ui.include_css(app_root / "www/static/biblioshiny.css")
# --- Header ---
# The header bar contains the logo, app name, and a set of dropdown menus for notifications, help, donations, and credits.
@@ -252,29 +251,31 @@ def get_latest_cran_version():
# --- Welcome/Info Page ---
with ui.nav_panel("None", value="info"):
- ui.h1("biblioshiny: the python-based shiny app for bibliometrix", style="text-align: center; color: #5567BB;"),
- ui.div(
- ui.img(src="https://www.bibliometrix.org/logo_new.png", class_="logo", width="400px"),
- style="text-align: center;"
- ),
- ui.div(
- ui.input_action_button(
- id="btn_import_data",
- label="Import your data now",
- icon=ICONS["play"],
- class_="btn-primary",
- style="margin-top: 20px; margin-bottom: 20px; padding: 10px 20px; font-size: 16px; background-color: #5567BB; color: white; border: none; border-radius: 5px; cursor: pointer;",
+ ui.tags.div(
+ ui.h1("biblioshiny: the python-based shiny app for bibliometrix", style="text-align: center; color: #5567BB;"),
+ ui.div(
+ ui.img(src="https://www.bibliometrix.org/logo_new.png", class_="logo", width="400px"),
+ style="text-align: center;"
),
- ui.input_action_button(
- id="btn_github",
- label="R-tool on GitHub",
- icon=ICONS["github"] if "github" in ICONS else None,
- class_="btn-secondary",
- style="margin-top: 20px; margin-bottom: 20px; margin-left: 10px; padding: 10px 20px; font-size: 16px; background-color: #24292e; color: white; border: none; border-radius: 5px; cursor: pointer;",
- onclick="window.open('https://github.com/massimoaria/bibliometrix', '_blank')",
+ ui.div(
+ ui.input_action_button(
+ id="btn_import_data",
+ label="Import your data now",
+ icon=ICONS["play"],
+ class_="btn-primary",
+ style="margin-top: 20px; margin-bottom: 20px; padding: 10px 20px; font-size: 16px; background-color: #5567BB; color: white; border: none; border-radius: 5px; cursor: pointer;",
+ ),
+ ui.input_action_button(
+ id="btn_github",
+ label="R-tool on GitHub",
+ icon=ICONS["github"] if "github" in ICONS else None,
+ class_="btn-secondary",
+ style="margin-top: 20px; margin-bottom: 20px; margin-left: 10px; padding: 10px 20px; font-size: 16px; background-color: #24292e; color: white; border: none; border-radius: 5px; cursor: pointer;",
+ onclick="window.open('https://github.com/massimoaria/bibliometrix', '_blank')",
+ ),
+ style="text-align: center;"
),
- style="text-align: center;"
- ),
+ )
ui.markdown(
"""
@@ -586,7 +587,7 @@ def reset_all_analyses():
report_choices = reactive.Value({})
report_excel = reactive.Value(io.BytesIO())
- selection = reactive.Value([])
+ selection = reactive.Value(())
dpi = reactive.Value(300)
height = reactive.Value(7)
gemini_api_key = reactive.Value("")
@@ -802,6 +803,8 @@ def show_missing_data_report():
@reactive.event(input.save_modal_completeness)
def save_dataframe_image():
_, _, fig = get_table(database, df, dpi=dpi.get(), modal=False)
+ if fig is None:
+ return ui.notification_show("⚠️ No data is loaded yet.", duration=5, close_button=False)
fig.write_image(completeness_table_image_path)
return ui.notification_show(f"✅ Missing data image saved into {completeness_table_image_path}", duration=5, close_button=False)
@@ -854,7 +857,62 @@ def indicator_types_ui_all():
),
with ui.nav_panel("None", value="API"):
- ui.h3("🚧 Warning: API is under construction 🚧")
+ ui.h3("🌐 Live API Extraction (OpenAlex)", style="color: #5567BB;")
+ ui.p("Query the OpenAlex database directly and automatically convert results to the standardized format.")
+
+ with ui.layout_sidebar(fillable=False, fill=False):
+ with ui.sidebar(id="sidebar_api_data", position="right"):
+ ui.h5("API Search", style="color: #5567BB;")
+ ui.input_text("api_query", "Search Query:", placeholder="e.g., machine learning")
+ ui.input_numeric("api_max_results", "Max Results:", value=50, min=10, max=500, step=10)
+ ui.input_action_button("api_search_btn", "Search OpenAlex", icon=ICONS["play"])
+ ui.p("This will fetch data, apply standardization, and load it into the application.", style="color: gray; font-size: 10px;")
+
+ @reactive.effect
+ @reactive.event(input.api_search_btn)
+ def execute_api_search():
+     # Fetch works from OpenAlex, standardize them and load the result into `df`.
+     query = (input.api_query() or "").strip()
+     if not query:
+         ui.notification_show("⚠️ Please enter a search query.", duration=5, type="warning")
+         return
+
+     # The numeric input yields None when the field is cleared, and its min/max
+     # are only browser-side hints, so validate and clamp to 10-500 here.
+     max_results = input.api_max_results()
+     if not isinstance(max_results, (int, float)) or not max_results >= 1:
+         ui.notification_show("⚠️ Please enter a valid number of results.", duration=5, type="warning")
+         return
+     max_results = int(max(10, min(500, max_results)))
+
+     ui.modal_show(create_loading_modal("API data"))
+     try:
+         # 1. Extract
+         raw_data = OpenAlexRetriever().fetch(query, max_results=max_results)
+         if not raw_data:
+             ui.notification_show("⚠️ No results found.", duration=5, type="warning")
+             return
+
+         # 2. Transform (Standardize), then 3. Load
+         standardized_df = OpenAlexStandardizer().standardize(raw_data)
+         df.set(standardized_df)
+         reset_all_analyses()
+         ui.notification_show(f"✅ Successfully loaded {len(standardized_df)} documents!", duration=5, type="message")
+     except Exception as e:
+         ui.notification_show(f"❌ Error during API extraction: {str(e)}", duration=10, type="error")
+     finally:
+         ui.modal_remove()  # Runs on every exit path, including the early return above
+
+ @render.express
+ def show_api_data_table():
+     # Preview the loaded rows when they came from OpenAlex; otherwise show a hint.
+     data = df.get()
+     if data is not None and len(data) > 0 and 'DB' in data.columns and (data['DB'] == 'OPENALEX').any():
+         ui.h4("Preview of Standardized Data", style="color: #5567BB;")
+         ui.p(f"Showing the first {min(5, len(data))} rows:")
+         ui.HTML(data[['UT', 'TI', 'AU', 'PY', 'SO', 'SR']].head(5).to_html(classes="table table-striped table-hover", index=False))
+     else:  # Also covers an empty frame or data from another source, not only None
+         ui.p("No data loaded via API yet. Use the sidebar to search OpenAlex.")
with ui.nav_panel("None", value="collections"):
ui.h3("🚧 Warning: Merge Collection is under construction 🚧")
@@ -8185,9 +8243,8 @@ def update_plot_settings():
# --- Sidebar Management ---
@render.express()
-@reactive.event(input.start_button)
def toggle_sidebar():
- with ui.tags.div(id="sidebar_2", class_="custom-sidebar"):
+ with ui.tags.div(id="sidebar_2", class_="custom-sidebar sidebar-hidden"):
with ui.accordion(id="sidebar_accordion_data", multiple=False, open=False):
# Info Section
with ui.accordion_panel("Biblioshiny", icon=ICONS["home_colored"]):
@@ -8344,9 +8401,10 @@ def toggle_sidebar():
});
observer.observe(document.body, { childList: true, subtree: true });
- // Show both sidebars when 'start_button' is clicked
+ // Show both sidebars when 'start_button' or 'api_search_btn' is clicked
document.addEventListener("click", function(e) {
- if (e.target && e.target.id === "start_button") {
+ const btn = e.target.closest('button');
+ if (btn && (btn.id === "start_button" || btn.id === "api_search_btn")) {
setSidebarState(true);
}
});
diff --git a/execution_evidence.ipynb b/execution_evidence.ipynb
new file mode 100644
index 000000000..7307dabf5
--- /dev/null
+++ b/execution_evidence.ipynb
@@ -0,0 +1,315 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Bibliometrix-Python ETL Execution Evidence\n",
+ "\n",
+ "This notebook demonstrates the execution of the ETL pipeline for OpenAlex data. We will fetch data, standardize it, and validate the resulting DataFrame against the Bibliometrix schema."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "import os\n",
+ "\n",
+ "# Ensure the current directory is in the python path\n",
+ "sys.path.append(os.path.abspath('.'))\n",
+ "\n",
+ "from www.services.api_retriever import OpenAlexRetriever\n",
+ "from www.services.standardizer import OpenAlexStandardizer\n",
+ "from www.services.validator import validate_dataframe"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Phase 1: EXTRACT\n",
+ "Fetch data from OpenAlex API. We query for \"machine learning\"."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Querying OpenAlex for 'machine learning'...\n",
+ "Retrieved 10 works.\n"
+ ]
+ }
+ ],
+ "source": [
+ "retriever = OpenAlexRetriever(email=\"student@example.com\")\n",
+ "query = \"machine learning\"\n",
+ "print(f\"Querying OpenAlex for '{query}'...\")\n",
+ "\n",
+ "raw_data = retriever.fetch(query, max_results=10)\n",
+ "print(f\"Retrieved {len(raw_data)} works.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Phase 2-4: TRANSFORM\n",
+ "We pass the raw JSON data to the standardizer, which maps the fields, handles nulls, and calculates the Short Reference (SR)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Standardized DataFrame Shape: (10, 24)\n"
+ ]
+ }
+ ],
+ "source": [
+ "standardizer = OpenAlexStandardizer()\n",
+ "df = standardizer.standardize(raw_data)\n",
+ "\n",
+ "print(f\"Standardized DataFrame Shape: {df.shape}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Phase 5: VALIDATION\n",
+ "We run the validation module to ensure that all mandatory columns exist, no nulls remain, and multi-value fields are typed as lists."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Success! DataFrame matches target schema and has passed validation.\n"
+ ]
+ }
+ ],
+ "source": [
+ "is_valid = validate_dataframe(df)\n",
+ "if is_valid:\n",
+ " print(\"Success! DataFrame matches target schema and has passed validation.\")\n",
+ "else:\n",
+ " print(\"Validation failed.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Output Preview\n",
+ "Finally, we preview the first few rows of the standardized data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>DB</th>\n",
+ "      <th>UT</th>\n",
+ "      <th>DI</th>\n",
+ "      <th>PMID</th>\n",
+ "      <th>TI</th>\n",
+ "      <th>SO</th>\n",
+ "      <th>JI</th>\n",
+ "      <th>PY</th>\n",
+ "      <th>DT</th>\n",
+ "      <th>LA</th>\n",
+ "      <th>TC</th>\n",
+ "      <th>AU</th>\n",
+ "      <th>AF</th>\n",
+ "      <th>C1</th>\n",
+ "      <th>RP</th>\n",
+ "      <th>CR</th>\n",
+ "      <th>DE</th>\n",
+ "      <th>ID</th>\n",
+ "      <th>AB</th>\n",
+ "      <th>VL</th>\n",
+ "      <th>IS</th>\n",
+ "      <th>BP</th>\n",
+ "      <th>EP</th>\n",
+ "      <th>SR</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>OPENALEX</td>\n",
+ "      <td>https://openalex.org/W2101234009</td>\n",
+ "      <td>10.48550/arxiv.1201.0490</td>\n",
+ "      <td></td>\n",
+ "      <td>Scikit-learn: Machine Learning in Python</td>\n",
+ "      <td>arXiv (Cornell University)</td>\n",
+ "      <td>Cornell University</td>\n",
+ "      <td>2012</td>\n",
+ "      <td>preprint</td>\n",
+ "      <td>en</td>\n",
+ "      <td>63665</td>\n",
+ "      <td>[Pedregosa, F., Varoquaux, G., Gramfort, A., M...</td>\n",
+ "      <td>[Fabián Pedregosa, Gaël Varoquaux, Alexandre G...</td>\n",
+ "      <td>[Commissariat à l'Énergie Atomique et aux Éner...</td>\n",
+ "      <td></td>\n",
+ "      <td>[https://openalex.org/W1496508106, https://ope...</td>\n",
+ "      <td>[Python (programming language), Documentation,...</td>\n",
+ "      <td>[Python (programming language), Documentation,...</td>\n",
+ "      <td>Scikit-learn is a Python module integrating a ...</td>\n",
+ "      <td></td>\n",
+ "      <td></td>\n",
+ "      <td></td>\n",
+ "      <td></td>\n",
+ "      <td></td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>1</th>\n",
+ "      <td>OPENALEX</td>\n",
+ "      <td>https://openalex.org/W3023540311</td>\n",
+ "      <td>10.5860/choice.27-0936</td>\n",
+ "      <td></td>\n",
+ "      <td>Genetic algorithms in search, optimization, an...</td>\n",
+ "      <td>Choice Reviews Online</td>\n",
+ "      <td>Association of College and Research Libraries</td>\n",
+ "      <td>1989</td>\n",
+ "      <td>article</td>\n",
+ "      <td>en</td>\n",
+ "      <td>49332</td>\n",
+ "      <td>[]</td>\n",
+ "      <td>[]</td>\n",
+ "      <td>[]</td>\n",
+ "      <td></td>\n",
+ "      <td>[]</td>\n",
+ "      <td>[Computer science, Artificial intelligence, Ma...</td>\n",
+ "      <td>[Computer science, Artificial intelligence, Ma...</td>\n",
+ "      <td>From the Publisher:\\r\\nThis book brings togeth...</td>\n",
+ "      <td>27</td>\n",
+ "      <td>02</td>\n",
+ "      <td>27</td>\n",
+ "      <td>0936</td>\n",
+ "      <td></td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " DB UT DI PMID \\\n",
+ "0 OPENALEX https://openalex.org/W2101234009 10.48550/arxiv.1201.0490 \n",
+ "1 OPENALEX https://openalex.org/W3023540311 10.5860/choice.27-0936 \n",
+ "\n",
+ " TI \\\n",
+ "0 Scikit-learn: Machine Learning in Python \n",
+ "1 Genetic algorithms in search, optimization, an... \n",
+ "\n",
+ " SO JI \\\n",
+ "0 arXiv (Cornell University) Cornell University \n",
+ "1 Choice Reviews Online Association of College and Research Libraries \n",
+ "\n",
+ " PY DT LA TC \\\n",
+ "0 2012 preprint en 63665 \n",
+ "1 1989 article en 49332 \n",
+ "\n",
+ " AU \\\n",
+ "0 [Pedregosa, F., Varoquaux, G., Gramfort, A., M... \n",
+ "1 [] \n",
+ "\n",
+ " AF \\\n",
+ "0 [Fabián Pedregosa, Gaël Varoquaux, Alexandre G... \n",
+ "1 [] \n",
+ "\n",
+ " C1 RP \\\n",
+ "0 [Commissariat à l'Énergie Atomique et aux Éner... \n",
+ "1 [] \n",
+ "\n",
+ " CR \\\n",
+ "0 [https://openalex.org/W1496508106, https://ope... \n",
+ "1 [] \n",
+ "\n",
+ " DE \\\n",
+ "0 [Python (programming language), Documentation,... \n",
+ "1 [Computer science, Artificial intelligence, Ma... \n",
+ "\n",
+ " ID \\\n",
+ "0 [Python (programming language), Documentation,... \n",
+ "1 [Computer science, Artificial intelligence, Ma... \n",
+ "\n",
+ " AB VL IS BP EP SR \n",
+ "0 Scikit-learn is a Python module integrating a ... \n",
+ "1 From the Publisher:\\r\\nThis book brings togeth... 27 02 27 0936 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "pd.set_option('display.max_columns', None)\n",
+ "df.head(2)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/www/services/__init__.py b/www/services/__init__.py
index 28584e105..8163dc1de 100644
--- a/www/services/__init__.py
+++ b/www/services/__init__.py
@@ -1,3 +1,4 @@
+from .api_retriever import *
from .biblionetwork import *
from .cocmatrix import *
from .couplingmap import *
@@ -11,7 +12,9 @@
from .parsers import *
from .plotlydownload import *
from .savereport import *
+from .standardizer import *
from .tabletag import *
from .termextraction import *
from .thematicmap import *
-from .utils import *
\ No newline at end of file
+from .utils import *
+from .validator import *
\ No newline at end of file
diff --git a/www/services/api_retriever.py b/www/services/api_retriever.py
new file mode 100644
index 000000000..c64db2527
--- /dev/null
+++ b/www/services/api_retriever.py
@@ -0,0 +1,123 @@
+import requests
+import time
+
+
+class OpenAlexRetriever:
+    """
+    Extract Phase: OpenAlex API Retriever.
+
+    This class is responsible for connecting to the OpenAlex REST API, bypassing the
+    need for manual CSV downloads. It automates the data extraction process by
+    handling HTTP requests, pagination, and rate-limiting (retries) dynamically.
+    """
+    BASE_URL = "https://api.openalex.org/works"
+    # OpenAlex allows a maximum of 200 per page, but we use 50 to ensure stable loads
+    PAGE_SIZE = 50
+    # Attempts made for a single page before pagination is abandoned
+    MAX_RETRIES = 3
+    # Seconds to wait for a response; without a timeout a stalled connection
+    # would block the caller indefinitely
+    REQUEST_TIMEOUT = 30
+
+    def __init__(self, email: str = "example@example.com"):
+        """
+        Initializes the retriever and sets up the polite pool.
+
+        Args:
+            email (str): An email address used to access OpenAlex's polite pool
+                for faster response times and better rate limits.
+        """
+        self.email = email
+        self.session = requests.Session()
+
+        # Adding email to the User-Agent registers the request with the polite pool
+        self.session.headers.update({"User-Agent": f"mailto:{self.email}"})
+
+    def _fetch_page(self, params: dict):
+        """
+        Requests a single result page, retrying with exponential backoff.
+
+        Args:
+            params (dict): Query-string parameters for the request.
+
+        Returns:
+            list | None: The page's "results" list (possibly empty), or None when
+                the page could not be retrieved (non-retryable HTTP status, or
+                still rate limited after MAX_RETRIES attempts).
+
+        Raises:
+            requests.RequestException: If the final attempt fails at the network
+                level (connection error, timeout).
+        """
+        for attempt in range(self.MAX_RETRIES):
+            is_last_attempt = attempt == self.MAX_RETRIES - 1
+            delay = 2 ** attempt
+
+            try:
+                response = self.session.get(
+                    self.BASE_URL, params=params, timeout=self.REQUEST_TIMEOUT
+                )
+            except requests.RequestException as exc:
+                if is_last_attempt:
+                    raise
+                print(f"[Warning] Network error: {exc}. Retrying in {delay} seconds...")
+                time.sleep(delay)
+                continue
+
+            if response.status_code == 200:
+                return response.json().get("results") or []
+
+            if response.status_code != 429:
+                print(f"[Error] API Error {response.status_code}: {response.text}")
+                return None  # Stop retrying on permanent errors
+
+            if not is_last_attempt:
+                print(f"[Warning] Rate limited by OpenAlex. Retrying in {delay} seconds...")
+                time.sleep(delay)
+
+        print(f"[Error] Still rate limited by OpenAlex after {self.MAX_RETRIES} attempts.")
+        return None
+
+    def fetch(self, query: str, max_results: int = 100) -> list:
+        """
+        Fetches metadata from OpenAlex for a given textual query.
+
+        Pages through the search results until max_results documents have been
+        collected, OpenAlex has no more results, or a page cannot be retrieved.
+        In the last case the documents gathered so far are returned; asking for
+        further pages after a failure would never terminate if the failure
+        persists.
+
+        Args:
+            query (str): The search term (e.g., "machine learning").
+            max_results (int): The maximum number of documents to retrieve.
+
+        Returns:
+            list: A list of dictionaries, where each dictionary is a raw OpenAlex document.
+
+        Raises:
+            requests.RequestException: If a page repeatedly fails at the network level.
+        """
+        results = []
+        if max_results <= 0:
+            return results
+
+        per_page = min(self.PAGE_SIZE, max_results)
+        page = 1
+
+        while True:
+            works = self._fetch_page({"search": query, "per-page": per_page, "page": page})
+            if not works:
+                break  # Page failed (None) or no more results are available ([])
+
+            results.extend(works)
+            # A short page is the last one, so there is nothing left to request
+            if len(results) >= max_results or len(works) < per_page:
+                break
+
+            page += 1
+            # Rate limit handling: Sleep slightly to respect polite pool limits
+            time.sleep(0.1)
+
+        # Truncate if we fetched slightly more than max_results due to page sizes
+        return results[:max_results]
diff --git a/www/services/standardizer.py b/www/services/standardizer.py
new file mode 100644
index 000000000..db7f1febc
--- /dev/null
+++ b/www/services/standardizer.py
@@ -0,0 +1,203 @@
+import logging
+
+import pandas as pd
+from www.services.format_functions import format_sr_column
+
+_logger = logging.getLogger(__name__)
+
+
+class OpenAlexStandardizer:
+    """
+    Phase 2 & 4: Transform & Calculate Fields (Standardizer).
+
+    This class handles the Transformation phase of the ETL pipeline. It maps the
+    proprietary, deeply-nested JSON structure returned by the OpenAlex API into
+    the flat, strict Web of Science (WoS) format required by Bibliometrix-Python.
+
+    It implements the 'Lookup Strategy' to map column names and enforce Data Types.
+
+    Nested lookups are written as `item.get(key) or default` rather than
+    `item.get(key, default)`: the default of `dict.get` only covers a missing
+    key, while the API can also send an explicit null (e.g. a work without a
+    primary location), which would otherwise crash the chained access.
+    """
+
+    # Output columns in order; SR is derived last from AU, PY and SO.
+    COLUMNS = [
+        "DB", "UT", "DI", "PMID", "TI", "SO", "JI", "PY", "DT", "LA", "TC",
+        "AU", "AF", "C1", "RP", "CR", "DE", "ID", "AB", "VL", "IS", "BP", "EP", "SR",
+    ]
+
+    @staticmethod
+    def _display_names(entries) -> list:
+        """Returns the non-empty "display_name" of every entry, as strings."""
+        names = []
+        for entry in entries or []:
+            name = (entry or {}).get("display_name")
+            if name:
+                names.append(str(name))
+        return names
+
+    @staticmethod
+    def _reconstruct_abstract(inverted_index: dict) -> str:
+        """
+        OpenAlex abstracts are provided as inverted indices (for copyright reasons).
+        This helper parses the inverted index dictionary and reconstructs the full
+        abstract string.
+        """
+        if not inverted_index:
+            return ""
+        # The inverted index maps words to list of positions
+        # e.g. {"The": [0], "quick": [1], ...}
+        max_pos = max(
+            (pos for positions in inverted_index.values() for pos in positions),
+            default=-1,
+        )
+        if max_pos < 0:
+            return ""
+
+        words = [""] * (max_pos + 1)
+        for word, positions in inverted_index.items():
+            for pos in positions:
+                words[pos] = word
+        # Skip positions no word claimed so gaps do not become double spaces
+        return " ".join(word for word in words if word)
+
+    @staticmethod
+    def _format_authors(authorships: list) -> tuple:
+        """Returns (AU list, AF list)"""
+        au = []
+        af = []
+        for authorship in authorships or []:
+            author = (authorship or {}).get("author") or {}
+            name = str(author.get("display_name") or "").strip()
+            if not name:
+                continue
+
+            af.append(name)
+
+            # Convert to "Surname, Initials"
+            parts = name.split()
+            if len(parts) > 1:
+                surname = parts[-1]
+                initials = " ".join([p[0].upper() + "." for p in parts[:-1]])
+                au.append(f"{surname}, {initials}")
+            else:
+                au.append(f"{name},")
+
+        return au, af
+
+    @staticmethod
+    def _format_affiliations(authorships: list) -> list:
+        """Returns the distinct institution names in first-seen order."""
+        affiliations = []
+        seen = set()
+        for authorship in authorships or []:
+            for inst in (authorship or {}).get("institutions") or []:
+                inst_name = (inst or {}).get("display_name") or ""
+                if inst_name and inst_name not in seen:
+                    seen.add(inst_name)
+                    affiliations.append(inst_name)
+        return affiliations
+
+    def _build_record(self, item: dict) -> dict:
+        """Maps one raw OpenAlex work to a flat record holding every column but SR."""
+        authorships = item.get("authorships") or []
+        au, af = self._format_authors(authorships)
+
+        biblio = item.get("biblio") or {}
+        ids = item.get("ids") or {}
+        source_info = (item.get("primary_location") or {}).get("source") or {}
+
+        return {
+            "DB": "OPENALEX",
+            "UT": str(item.get("id") or ""),
+            "DI": str(item.get("doi") or "").replace("https://doi.org/", ""),
+            # Keep only the part after the last "/" of the PMID value
+            "PMID": str(ids.get("pmid") or "").split("/")[-1],
+            "TI": str(item.get("title") or ""),
+            "SO": str(source_info.get("display_name") or ""),
+            "JI": str(source_info.get("host_organization_name") or ""),
+            "PY": str(item.get("publication_year") or ""),
+            "DT": str(item.get("type") or ""),
+            "LA": str(item.get("language") or ""),
+            "TC": int(item.get("cited_by_count") or 0),
+            "AU": au,
+            "AF": af,
+            "C1": self._format_affiliations(authorships),
+            "RP": "",
+            "CR": [str(ref) for ref in item.get("referenced_works") or []],
+            "DE": self._display_names(item.get("keywords")),
+            "ID": self._display_names(item.get("concepts")),
+            "AB": self._reconstruct_abstract(item.get("abstract_inverted_index")),
+            "VL": str(biblio.get("volume") or ""),
+            "IS": str(biblio.get("issue") or ""),
+            "BP": str(biblio.get("first_page") or ""),
+            "EP": str(biblio.get("last_page") or ""),
+        }
+
+    def standardize(self, raw_data: list) -> pd.DataFrame:
+        """
+        Maps raw OpenAlex JSON items to WoS Standard Schema.
+
+        Args:
+            raw_data (list): Raw OpenAlex work dictionaries.
+
+        Returns:
+            pd.DataFrame: One row per work with the columns listed in COLUMNS.
+                Empty input yields an empty frame that still has every column,
+                so callers can select columns without a KeyError.
+        """
+        records = [self._build_record(item) for item in raw_data or []]
+        if not records:
+            return pd.DataFrame(columns=self.COLUMNS)
+
+        df = pd.DataFrame(records)
+
+        # Convert PY to numeric for plotting (Annual Scientific Production expects numbers)
+        # NOTE(review): a work without a year becomes NaN here, which
+        # validate_dataframe rejects; decide on a placeholder if that matters.
+        df['PY'] = pd.to_numeric(df['PY'], errors='coerce')
+
+        # Calculate SR using the existing function
+        df['SR'] = df.apply(self._calculate_sr, axis=1)
+
+        # _calculate_sr falls back to "" on failure; surface that once per batch
+        empty_sr = int((df['SR'] == "").sum())
+        if empty_sr:
+            _logger.warning("SR is empty for %d of %d records", empty_sr, len(df))
+
+        return df
+
+    def _calculate_sr(self, row: pd.Series) -> str:
+        """
+        Invokes the existing format_sr_column function from Bibliometrix-Python
+        by mocking the raw Web of Science format.
+
+        Returns "" when format_sr_column raises, so one bad record cannot abort
+        the whole batch.
+        """
+        # format_sr_column expects a Web_of_Science raw entry format where fields are lists.
+        # It reads AU, PY, and SO.
+        au_raw = [row['AU'][0]] if row['AU'] else ["Unknown, U."]
+        so_raw = [row['SO']] if row['SO'] else [""]
+
+        # PY is numeric by now: NaN when missing (and NaN is truthy, so it needs
+        # an explicit test) and float whenever the column holds a NaN. Pass the
+        # year as plain text so it cannot render as "2012.0".
+        # NOTE(review): assumes format_sr_column accepts a string year, as a raw
+        # WoS text entry would provide - confirm against its implementation.
+        year = row['PY']
+        py_raw = [""] if pd.isna(year) else [str(int(year))]
+
+        dummy_entry = {
+            'AU': au_raw,
+            'PY': py_raw,
+            'SO': so_raw
+        }
+
+        try:
+            return format_sr_column(dummy_entry, 'Web_of_Science', '.txt')
+        except Exception:
+            _logger.debug("format_sr_column failed for UT=%s", row.get('UT'), exc_info=True)
+            return ""
diff --git a/www/services/validator.py b/www/services/validator.py
new file mode 100644
index 000000000..64fad4a24
--- /dev/null
+++ b/www/services/validator.py
@@ -0,0 +1,70 @@
+import pandas as pd
+import numpy as np
+
+MANDATORY_COLUMNS = [
+    'DB', 'UT', 'DI', 'PMID', 'TI', 'SO', 'JI', 'PY', 'DT', 'LA', 'TC',
+    'AU', 'AF', 'C1', 'RP', 'CR', 'DE', 'ID', 'AB', 'VL', 'IS', 'BP', 'EP', 'SR'
+]
+
+MULTI_VALUE_COLUMNS = ['AU', 'AF', 'C1', 'CR', 'DE', 'ID']
+
+
+def _is_list_of_str(value) -> bool:
+    """Return True when value is a list whose elements are all strings."""
+    return isinstance(value, list) and all(isinstance(element, str) for element in value)
+
+
+def validate_dataframe(df: pd.DataFrame) -> bool:
+    """
+    Phase 5: Validation.
+
+    This function programmatically verifies the DataFrame before it is finalized
+    and pushed to the Shiny frontend. It guarantees that the dataset conforms
+    strictly to the Type Contracts defined in the project specifications.
+
+    Every check runs even when an earlier one has failed, so a single call
+    reports all problems (printed to stdout).
+
+    Validations performed:
+    1. Existence: All mandatory 2- and 3-letter WoS Field Tags must exist.
+    2. Null Handling: Pandas NaN or Python None values are NOT permitted.
+    3. Type Contracts: Multi-value columns (like Authors, Affiliations) must
+       be rigorously typed as Python lists of strings (list[str]).
+
+    Args:
+        df (pd.DataFrame): The standardized DataFrame to check.
+
+    Returns:
+        bool: True if the DataFrame perfectly matches the target schema, False otherwise.
+    """
+    is_valid = True
+
+    # 1. Check for all mandatory columns (Existence)
+    missing_cols = [col for col in MANDATORY_COLUMNS if col not in df.columns]
+    if missing_cols:
+        print(f"[Validation Error] Missing mandatory columns: {missing_cols}")
+        is_valid = False
+
+    # 2. Check for NaN/None values (Null Handling)
+    null_cols = [str(col) for col, has_null in df.isnull().any().items() if has_null]
+    if null_cols:
+        print(
+            "[Validation Error] NaN or None values found in the DataFrame. "
+            f"These are not permitted. Affected columns: {null_cols}"
+        )
+        is_valid = False
+
+    # 3. Check types for Multi-value fields (Type Contracts)
+    for col in MULTI_VALUE_COLUMNS:
+        if col not in df.columns:
+            continue
+        # The contract is list[str]: a list holding None or a number is as
+        # much a violation as a value that is not a list at all
+        if not df[col].map(_is_list_of_str).all():
+            print(
+                f"[Validation Error] Type Contract violation: Column '{col}' "
+                "contains values that are not lists of strings."
+            )
+            is_valid = False
+
+    return is_valid