PRAISELab-PicusLab · MohamedAliBadawy · May 29, 2026 · May 30, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 __pycache__/
 bibliovenv/
 Bibenv/
-.idea/
+.idea/
+env/
diff --git a/app.py b/app.py
@@ -743,6 +743,7 @@ def select_db():
                 def mostra():
                     database = get_database(input)
                     ui.update_sidebar("sidebar_load_data", show=False)
+                    sidebar_needs_update.set(sidebar_needs_update.get() + 1)
                     ui.update_action_button("export_button", disabled=False)
                     ui.markdown(f"<h3 style='text-align:center; color: #5567BB;'>Data of {database}</h3>")
 
@@ -853,8 +854,135 @@ def indicator_types_ui_all():
                     """
                 ),
 
+
         with ui.nav_panel("None", value="API"):
-            ui.h3("🚧 Warning: API is under construction 🚧")
+            ui.h3("🔎 API Data Retrieval", style="color: #5567BB;")
+            ui.p("Fetch bibliographic data directly from public APIs without downloading files.")
+
+            with ui.navset_card_tab():
+                # OpenAlex Sub-Tab
+                with ui.nav_panel("OpenAlex Data Collection"):
+                    with ui.layout_sidebar(fillable=False, fill=False):
+                        with ui.sidebar(position="right"):
+                            ui.h5("OpenAlex Options", style="color: #5567BB;")
+                            ui.input_select("oa_search_field", "Search Field:", {"title_abstract": "Title and Abstract", "title": "Title", "author": "Author"})
+                            ui.input_text("oa_query", "Search Query:", value='machine learning')
+                            ui.input_numeric("oa_max_records", "Max Records:", value=100, min=10, max=5000)
+                            ui.input_numeric("oa_year_from", "Year From (optional):", value=None)
+                            ui.input_numeric("oa_year_to", "Year To (optional):", value=None)
+                            ui.input_action_button("openalex_fetch", "Fetch from OpenAlex", icon=ICONS["play"], class_="btn-primary")
+                            ui.p("Fetches records via OpenAlex REST API with pagination.", style="color: gray; font-size: 11px;")
+
+                        @render.express()
+                        @reactive.event(input.openalex_fetch)  # re-run only when the "Fetch from OpenAlex" button is clicked
+                        def handle_openalex():  # fetch OpenAlex records for the sidebar inputs, store them in df and render the result
+                            query = input.oa_query()
+                            max_res = input.oa_max_records()
+                            year_from = input.oa_year_from()
+                            year_to = input.oa_year_to()
+                            search_field = input.oa_search_field()
+
+                            ui.markdown(f"<h3 style='text-align:center; color: #5567BB;'>Retrieving from OpenAlex...</h3>")
+
+                            try:
+                                from www.services.api_retriever import api_etl_pipeline  # imported inside the try so an import failure is also reported by the except below
+                                from functions.get_table import get_table, init_itables
+
+                                standardised = api_etl_pipeline(
+                                    "OPENALEX",
+                                    query,
+                                    max_results=max_res,
+                                    from_year=year_from,
+                                    to_year=year_to,
+                                    search_field=search_field,
+                                )
+                                if len(standardised) > 0:  # at least one record came back
+                                    df.set(standardised)  # store the fetched records in df
+                                    reset_all_analyses()
+                                    ui.p(f"✅ Successfully retrieved and standardized {len(standardised)} records from OpenAlex.", style="color: green; text-align:center; font-weight: bold;")
+                                    ui.p("Your data is ready for analysis. The quality report is shown below:", style="text-align:center;")
+
+                                    # Render the completeness table, mirroring the import tab.
+                                    ui.HTML(init_itables())
+                                    table_ui, _, _ = get_table("OpenAlex", df)  # the other two return values are not used here
+                                    table_ui  # bare expression: rendered as output by Shiny Express
+
+                                    sidebar_needs_update.set(sidebar_needs_update.get() + 1)  # bump the counter that toggle_sidebar reacts to
+
+                                    ui.notification_show("Data loaded! Check the left sidebar for analysis tools.", type="message", duration=10)
+                                    ui.div(
+                                        ui.h5("Ready for Analysis!", style="color: #5567BB;"),
+                                        ui.p("You can now click on the side menu options (e.g. 'Dataset' -> 'Main Information') to start exploring."),
+                                        style="text-align:center; margin-top: 30px; padding: 20px; border: 2px dashed #5567BB; border-radius: 10px;"
+                                    )
+                                else:
+                                    ui.p(f"⚠️ No results found for query: '{query}'.", style="color: orange; text-align:center;")
+                            except Exception as e:  # broad catch: any failure above is shown in the page instead of propagating
+                                ui.div(
+                                    ui.h5("Error during API retrieval:", style="color: red;"),
+                                    ui.p(str(e), style="color: red;")
+                                )
+
+                # PubMed Sub-Tab
+                with ui.nav_panel("PubMed Data Collection"):
+                    with ui.layout_sidebar(fillable=False, fill=False):
+                        with ui.sidebar(position="right"):
+                            ui.h5("PubMed Options", style="color: #5567BB;")
+                            ui.input_select("pubmed_search_field", "Search Field:", {"title_abstract": "Title and Abstract", "title": "Title", "author": "Author"})
+                            ui.input_text("pubmed_query", "Search Query:", value="machine learning")
+                            ui.input_numeric("pubmed_max_results", "Max Records:", value=100, min=10, max=5000)
+                            ui.input_numeric("pubmed_year_from", "Year From (optional):", value=None)
+                            ui.input_numeric("pubmed_year_to", "Year To (optional):", value=None)
+                            ui.input_action_button("pubmed_fetch", "Fetch from PubMed", icon=ICONS["play"], class_="btn-primary")
+                            ui.p("Fetches records via NCBI E-utilities (Two-Phase Pagination).", style="color: gray; font-size: 11px;")
+
+                        @render.express()
+                        @reactive.event(input.pubmed_fetch)  # re-run only when the "Fetch from PubMed" button is clicked
+                        def handle_pubmed():  # fetch PubMed records for the sidebar inputs, store them in df and render the result
+                            query = input.pubmed_query()
+                            max_res = input.pubmed_max_results()
+                            year_from = input.pubmed_year_from()
+                            year_to = input.pubmed_year_to()
+                            search_field = input.pubmed_search_field()
+
+                            ui.markdown(f"<h3 style='text-align:center; color: #5567BB;'>Retrieving from PubMed...</h3>")
+
+                            try:
+                                from www.services.api_retriever import api_etl_pipeline  # imported inside the try so an import failure is also reported by the except below
+                                from functions.get_table import get_table, init_itables
+
+                                standardised = api_etl_pipeline(
+                                    "PUBMED",
+                                    query,
+                                    max_results=max_res,
+                                    from_year=year_from,
+                                    to_year=year_to,
+                                    search_field=search_field,
+                                )
+                                if len(standardised) > 0:  # at least one record came back
+                                    df.set(standardised)  # store the fetched records in df
+                                    reset_all_analyses()
+                                    ui.p(f"✅ Successfully retrieved and standardized {len(standardised)} records from PubMed.", style="color: green; text-align:center; font-weight: bold;")
+
+                                    ui.HTML(init_itables())
+                                    table_ui, _, _ = get_table("PubMed", df)  # the other two return values are not used here
+                                    table_ui  # bare expression: rendered as output by Shiny Express
+
+                                    sidebar_needs_update.set(sidebar_needs_update.get() + 1)  # bump the counter that toggle_sidebar reacts to
+
+                                    ui.notification_show("Data loaded! Check the left sidebar for analysis tools.", type="message", duration=10)
+                                    ui.div(
+                                        ui.h5("Ready for Analysis!", style="color: #5567BB;"),
+                                        ui.p("You can now click on the side menu options (e.g. 'Dataset' -> 'Main Information') to start exploring."),
+                                        style="text-align:center; margin-top: 30px; padding: 20px; border: 2px dashed #5567BB; border-radius: 10px;"
+                                    )
+                                else:
+                                    ui.p(f"⚠️ No results found for query: '{query}'.", style="color: orange; text-align:center;")
+                            except Exception as e:  # broad catch: any failure above is shown in the page instead of propagating
+                                ui.div(
+                                    ui.h5("Error during API retrieval:", style="color: red;"),
+                                    ui.p(str(e), style="color: red;")
+                                )
 
         with ui.nav_panel("None", value="collections"):
             ui.h3("🚧 Warning: Merge Collection is under construction 🚧")
@@ -8184,9 +8312,13 @@ def update_plot_settings():
 
 
 # --- Sidebar Management ---
+sidebar_needs_update = reactive.Value(0)
+
 @render.express()
-@reactive.event(input.start_button)
+@reactive.event(sidebar_needs_update)
 def toggle_sidebar():
+    if sidebar_needs_update.get() == 0:
+        return
     with ui.tags.div(id="sidebar_2", class_="custom-sidebar"):
         with ui.accordion(id="sidebar_accordion_data", multiple=False, open=False):
             # Info Section
@@ -8203,7 +8335,7 @@ def toggle_sidebar():
                 ui.input_action_button("go_filters", "Filters", class_="sidebar-button", icon=ICONS["filters"])
 
             # Analysis Section
-            with ui.accordion_panel("Overview", icon=ICONS["play_colored"]):
+            with ui.accordion_panel("Dataset", icon=ICONS["play_colored"]):
                 ui.input_action_button("go_main", "Main Information", class_="sidebar-button", icon=ICONS["overview"])
                 ui.input_action_button("go_annual_scientific_production", "Annual Scientific Production", class_="sidebar-button", icon=ICONS["annual_growth_rate"])
                 ui.input_action_button("go_average_citations_per_year", "Average Citations per Year", class_="sidebar-button", icon=ICONS["average_citations_per_doc"])
@@ -8344,9 +8476,9 @@ def toggle_sidebar():
     });
     observer.observe(document.body, { childList: true, subtree: true });
 
-    // Show both sidebars when 'start_button' is clicked
+    // Show both sidebars when 'start_button' or API fetch buttons are clicked
     document.addEventListener("click", function(e) {
-        if (e.target && e.target.id === "start_button") {
+        if (e.target && e.target.closest("#start_button, #openalex_fetch, #pubmed_fetch")) {
             setSidebarState(true);
         }
     });

diff --git a/functions/get_affiliationproductionovertime.py b/functions/get_affiliationproductionovertime.py
@@ -3,59 +3,152 @@
 
 def get_affiliation_production_over_time(df, top_k_affiliations):
     """
-    Generate a plot of affiliation's production over time.
+    Generate a cumulative production line chart of the top affiliations over time,
+    aligned perfectly with the "Most Relevant Affiliations" metric.
 
     Args:
         df: A DataFrame object containing the data.
         top_k_affiliations: The number of top affiliations to display.  
 
     Returns:
-        A Plotly figure object representing the affiliation's production over time.
+        fig: A Plotly FigureWidget showing cumulative articles per affiliation over time.
+        aff_top_out: DataFrame with "Affiliation", "Year" and cumulative "Articles" columns for the top affiliations.
     """
     data = df.get()
 
-    AFF = data["AU_UN"].dropna().apply(lambda x: [aff for aff in x if aff.strip() != ""])
+    # Force metaTagExtraction to run with aff_disamb=True to utilize the enhanced AU_UN columns
+    metaTagExtraction(df, "AU_UN", aff_disamb=True)
+    data = df.get()
+
+    # Ensure "PY" is numeric and valid (ignore years <= 1800)
+    data["PY"] = pd.to_numeric(data["PY"], errors="coerce")
+    data = data[data["PY"] > 1800]
+    data = data.dropna(subset=["PY", "AU_UN"])
+
+    import unicodedata
+
+    def strip_accents(s):
+        if not isinstance(s, str):
+            return s
+        return "".join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
+
+    def split_affiliations(x):
+        if isinstance(x, list):
+            res = []
+            for item in x:
+                if isinstance(item, str):
+                    res.extend([a.strip() for a in item.split(";") if a.strip()])
+                else:
+                    res.append(str(item).strip())
+            return res
+        elif isinstance(x, str):
+            return [a.strip() for a in x.split(";") if a.strip()]
+        return []
+
+    AFF = data["AU_UN"].dropna().apply(split_affiliations)
+    # Filter out rows with empty affiliation lists
+    AFF = AFF[AFF.apply(len) > 0]
     nAFF = [len(aff) for aff in AFF]
 
+    if len(AFF) == 0:
+        fig = go.Figure()
+        fig.update_layout(
+            annotations=[dict(text="No affiliation data available for this dataset",
+                            x=0.5, y=0.5, showarrow=False, font=dict(size=16))],
+            plot_bgcolor='white', height=300
+        )
+        fig = go.FigureWidget(fig)
+        return fig, pd.DataFrame(columns=["Affiliation", "Year", "Articles"])
+
     affiliations = [aff for sublist in AFF for aff in sublist]
-    years = data["PY"].repeat(nAFF).values[:len(affiliations)]
+    # Use only the PY values from the matching AFF index rows
+    years = data.loc[AFF.index, "PY"].repeat(nAFF).values[:len(affiliations)]
+
     AFFY = pd.DataFrame({
         "Affiliation": affiliations,
         "Year": years
-    }).query('Affiliation != "NA"').dropna(subset=["Affiliation", "Year"])
+    })
+    AFFY["Affiliation"] = AFFY["Affiliation"].apply(strip_accents).str.strip().str.upper()
+
+    # Filter out non-reporting placeholder values to align with Most Relevant Affiliations
+    invalid_vals = ["", "NA", "NAN", "NONE", "NOTREPORTED", "NOTDECLARED"]
+    AFFY = AFFY[~AFFY["Affiliation"].isin(invalid_vals)].dropna(subset=["Affiliation", "Year"])
+
+    if len(AFFY) == 0:
+        fig = go.Figure()
+        fig.update_layout(
+            annotations=[dict(text="No affiliation data available for this dataset",
+                            x=0.5, y=0.5, showarrow=False, font=dict(size=16))],
+            plot_bgcolor='white', height=300
+        )
+        fig = go.FigureWidget(fig)
+        return fig, pd.DataFrame(columns=["Affiliation", "Year", "Articles"])
 
-    AFFY = AFFY.groupby(["Affiliation", "Year"]).size().reset_index(name="Articles")
-    AFFY = AFFY.pivot(index="Affiliation", columns="Year", values="Articles").fillna(0)
-    AFFY = AFFY.stack().reset_index(name="Articles")
-    AFFY["Articles"] = AFFY.groupby("Affiliation")["Articles"].cumsum()
+    # Group by Affiliation and Year to calculate annual counts
+    AFFY_grouped = AFFY.groupby(["Affiliation", "Year"]).size().reset_index(name="Articles")
 
-    Affselected = AFFY[AFFY["Year"] == AFFY["Year"].max()].nlargest(top_k_affiliations, "Articles")
+    # Pivot to fill gaps in years with 0
+    AFFY_pivot = AFFY_grouped.pivot(index="Affiliation", columns="Year", values="Articles").fillna(0)
+    AFFY_stacked = AFFY_pivot.stack().reset_index(name="Articles")
+    AFFY_stacked["Year"] = AFFY_stacked["Year"].astype(int)
 
-    AffOverTime = AFFY[AFFY["Affiliation"].isin(Affselected["Affiliation"])]
-    AffOverTime["Year"] = AffOverTime["Year"].astype(int)
+    # Calculate Cumulative Sum of articles per affiliation over time
+    AFFY_stacked = AFFY_stacked.sort_values(by=["Affiliation", "Year"])
+    AFFY_stacked["Articles"] = AFFY_stacked.groupby("Affiliation")["Articles"].cumsum()
 
-    # Create the plot
+    # Select the top affiliations using the total cumulative sum in the final year (Most Relevant)
+    final_year = AFFY_stacked["Year"].max()
+    top_affs = AFFY_stacked[AFFY_stacked["Year"] == final_year].nlargest(top_k_affiliations, "Articles")["Affiliation"].tolist()
+
+    AffOverTime = AFFY_stacked[AFFY_stacked["Affiliation"].isin(top_affs)]
+
+    # Sort by affiliation, then year, so each line is drawn in chronological order (unsorted years make Plotly lines zig-zag).
+    AffOverTime = AffOverTime.sort_values(by=["Affiliation", "Year"])
+
+    # Convert Affiliation to Categorical to maintain ranking order in legend
+    AffOverTime["Affiliation"] = pd.Categorical(
+        AffOverTime["Affiliation"],
+        categories=top_affs,
+        ordered=True
+    )
+    AffOverTime = AffOverTime.sort_values(by=["Affiliation", "Year"])
+
+    # Create the cumulative line chart with markers
     fig = px.line(
         AffOverTime,
         x="Year",
         y="Articles",
         color="Affiliation",
+        markers=True,
         labels={"Year": "Year", "Articles": "Cumulative Articles", "Affiliation": "Affiliation"},
+        template="simple_white",
     )
 
-    # Customize the layout
+    # Customize layout with clean gridlines and legend
+    unique_years = sorted(AffOverTime["Year"].unique())
+    dtick = 1
+    if len(unique_years) > 1:
+        year_range = unique_years[-1] - unique_years[0]
+        if year_range > 15:
+            dtick = 2
+
     fig.update_layout(
+        height=600,
         xaxis=dict(
-            tickmode='array',
-            tickvals=AffOverTime["Year"].unique()[::max(1, len(AffOverTime["Year"].unique()) // 20)]
+            title="Year", 
+            showgrid=True, 
+            gridcolor="#EFEFEF",
+            tickmode="linear",
+            dtick=dtick
+        ),
+        yaxis=dict(
+            title="Cumulative N. of Articles", 
+            showgrid=True, 
+            gridcolor="#EFEFEF",
+            zeroline=False
         ),
-        yaxis_title="Cumulative Articles",
-        xaxis_title="Year",
         plot_bgcolor='white',
-        title_font_size=24,
-        font=dict(color="#444444"),
-        margin=dict(l=40, r=40, t=40, b=40),
-        height=600,
+        margin=dict(l=50, r=50, t=50, b=50),
         legend=dict(
             title="Affiliation",
             orientation="h",
@@ -67,11 +160,11 @@ def get_affiliation_production_over_time(df, top_k_affiliations):
         )
     )
 
-    # Customize the grid
-    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#EFEFEF')
-    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#EFEFEF')
     fig = go.FigureWidget(fig)
     fig._config = fig._config | {'modeBarButtonsToRemove': ['pan', 'select', 'lasso2d', 'toImage'],
                                  'displaylogo': False}
 
-    return fig, AffOverTime
+    # Sort final dataframe for clean return/display
+    aff_top_out = AffOverTime.sort_values(by=["Year", "Affiliation"])
+
+    return fig, aff_top_out