Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
__pycache__/
bibliovenv/
Bibenv/
.idea/
.idea/
env/
142 changes: 137 additions & 5 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,6 +743,7 @@ def select_db():
def mostra():
database = get_database(input)
ui.update_sidebar("sidebar_load_data", show=False)
sidebar_needs_update.set(sidebar_needs_update.get() + 1)
ui.update_action_button("export_button", disabled=False)
ui.markdown(f"<h3 style='text-align:center; color: #5567BB;'>Data of {database}</h3>")

Expand Down Expand Up @@ -853,8 +854,135 @@ def indicator_types_ui_all():
"""
),


with ui.nav_panel("None", value="API"):
ui.h3("🚧 Warning: API is under construction 🚧")
ui.h3("🔎 API Data Retrieval", style="color: #5567BB;")
ui.p("Fetch bibliographic data directly from public APIs without downloading files.")

with ui.navset_card_tab():
# OpenAlex Sub-Tab
with ui.nav_panel("OpenAlex Data Collection"):
with ui.layout_sidebar(fillable=False, fill=False):
with ui.sidebar(position="right"):
ui.h5("OpenAlex Options", style="color: #5567BB;")
ui.input_select("oa_search_field", "Search Field:", {"title_abstract": "Title and Abstract", "title": "Title", "author": "Author"})
ui.input_text("oa_query", "Search Query:", value='machine learning')
ui.input_numeric("oa_max_records", "Max Records:", value=100, min=10, max=5000)
ui.input_numeric("oa_year_from", "Year From (optional):", value=None)
ui.input_numeric("oa_year_to", "Year To (optional):", value=None)
ui.input_action_button("openalex_fetch", "Fetch from OpenAlex", icon=ICONS["play"], class_="btn-primary")
ui.p("Fetches records via OpenAlex REST API with pagination.", style="color: gray; font-size: 11px;")

@render.express()
@reactive.event(input.openalex_fetch)
def handle_openalex():
    # Runs when the "Fetch from OpenAlex" button is clicked: reads the
    # OpenAlex sidebar inputs, runs the API ETL pipeline, stores the result
    # in the shared `df` reactive value and renders a status/quality report.
    # Documented with comments (not a docstring) to avoid adding an extra
    # expression statement to an express render body.
    query = (input.oa_query() or "").strip()
    max_res = input.oa_max_records()
    year_from = input.oa_year_from()
    year_to = input.oa_year_to()
    search_field = input.oa_search_field()

    # Guard: do not hit the remote API with an empty search string.
    if not query:
        ui.p("⚠️ Please enter a search query.", style="color: orange; text-align:center;")
        return

    # Guard: an inverted year range can never match anything.
    if year_from is not None and year_to is not None and year_from > year_to:
        ui.p("⚠️ 'Year From' must not be greater than 'Year To'.", style="color: orange; text-align:center;")
        return

    # The widget declares value=100, min=10, max=5000, but nothing in this
    # handler re-checks those bounds and a cleared field can arrive as None,
    # so normalise the value here before passing it on.
    max_res = 100 if max_res is None else min(max(int(max_res), 10), 5000)

    ui.markdown("<h3 style='text-align:center; color: #5567BB;'>Retrieving from OpenAlex...</h3>")

    try:
        # Imported inside the try so that an import failure is reported in
        # the UI like any other retrieval error.
        from www.services.api_retriever import api_etl_pipeline
        from functions.get_table import get_table, init_itables

        standardised = api_etl_pipeline(
            "OPENALEX",
            query,
            max_results=max_res,
            from_year=year_from,
            to_year=year_to,
            search_field=search_field,
        )
        # Explicit length check: presumably a DataFrame, whose truth value
        # would be ambiguous -- TODO confirm the pipeline's return type.
        if len(standardised) > 0:
            df.set(standardised)
            reset_all_analyses()
            ui.p(f"✅ Successfully retrieved and standardized {len(standardised)} records from OpenAlex.", style="color: green; text-align:center; font-weight: bold;")
            ui.p("Your data is ready for analysis. The quality report is shown below:", style="text-align:center;")

            # Render the completeness table, as announced in the line above.
            ui.HTML(init_itables())
            table_ui, _, _ = get_table("OpenAlex", df)
            table_ui

            # Bump the counter that toggle_sidebar listens to so the
            # analysis sidebar is (re)rendered.
            sidebar_needs_update.set(sidebar_needs_update.get() + 1)

            ui.notification_show("Data loaded! Check the left sidebar for analysis tools.", type="message", duration=10)
            ui.div(
                ui.h5("Ready for Analysis!", style="color: #5567BB;"),
                ui.p("You can now click on the side menu options (e.g. 'Dataset' -> 'Main Information') to start exploring."),
                style="text-align:center; margin-top: 30px; padding: 20px; border: 2px dashed #5567BB; border-radius: 10px;"
            )
        else:
            ui.p(f"⚠️ No results found for query: '{query}'.", style="color: orange; text-align:center;")
    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing
        # the render.
        ui.div(
            ui.h5("Error during API retrieval:", style="color: red;"),
            ui.p(str(e), style="color: red;")
        )

# PubMed Sub-Tab
with ui.nav_panel("PubMed Data Collection"):
with ui.layout_sidebar(fillable=False, fill=False):
with ui.sidebar(position="right"):
ui.h5("PubMed Options", style="color: #5567BB;")
ui.input_select("pubmed_search_field", "Search Field:", {"title_abstract": "Title and Abstract", "title": "Title", "author": "Author"})
ui.input_text("pubmed_query", "Search Query:", value="machine learning")
ui.input_numeric("pubmed_max_results", "Max Records:", value=100, min=10, max=5000)
ui.input_numeric("pubmed_year_from", "Year From (optional):", value=None)
ui.input_numeric("pubmed_year_to", "Year To (optional):", value=None)
ui.input_action_button("pubmed_fetch", "Fetch from PubMed", icon=ICONS["play"], class_="btn-primary")
ui.p("Fetches records via NCBI E-utilities (Two-Phase Pagination).", style="color: gray; font-size: 11px;")

@render.express()
@reactive.event(input.pubmed_fetch)
def handle_pubmed():
    # Runs when the "Fetch from PubMed" button is clicked: reads the PubMed
    # sidebar inputs, runs the API ETL pipeline, stores the result in the
    # shared `df` reactive value and renders a status/quality report.
    # Documented with comments (not a docstring) to avoid adding an extra
    # expression statement to an express render body.
    query = (input.pubmed_query() or "").strip()
    max_res = input.pubmed_max_results()
    year_from = input.pubmed_year_from()
    year_to = input.pubmed_year_to()
    search_field = input.pubmed_search_field()

    # Guard: do not hit the remote API with an empty search string.
    if not query:
        ui.p("⚠️ Please enter a search query.", style="color: orange; text-align:center;")
        return

    # Guard: an inverted year range can never match anything.
    if year_from is not None and year_to is not None and year_from > year_to:
        ui.p("⚠️ 'Year From' must not be greater than 'Year To'.", style="color: orange; text-align:center;")
        return

    # The widget declares value=100, min=10, max=5000, but nothing in this
    # handler re-checks those bounds and a cleared field can arrive as None,
    # so normalise the value here before passing it on.
    max_res = 100 if max_res is None else min(max(int(max_res), 10), 5000)

    ui.markdown("<h3 style='text-align:center; color: #5567BB;'>Retrieving from PubMed...</h3>")

    try:
        # Imported inside the try so that an import failure is reported in
        # the UI like any other retrieval error.
        from www.services.api_retriever import api_etl_pipeline
        from functions.get_table import get_table, init_itables

        standardised = api_etl_pipeline(
            "PUBMED",
            query,
            max_results=max_res,
            from_year=year_from,
            to_year=year_to,
            search_field=search_field,
        )
        # Explicit length check: presumably a DataFrame, whose truth value
        # would be ambiguous -- TODO confirm the pipeline's return type.
        if len(standardised) > 0:
            df.set(standardised)
            reset_all_analyses()
            ui.p(f"✅ Successfully retrieved and standardized {len(standardised)} records from PubMed.", style="color: green; text-align:center; font-weight: bold;")
            # Same introductory line the OpenAlex handler shows before its table.
            ui.p("Your data is ready for analysis. The quality report is shown below:", style="text-align:center;")

            ui.HTML(init_itables())
            table_ui, _, _ = get_table("PubMed", df)
            table_ui

            # Bump the counter that toggle_sidebar listens to so the
            # analysis sidebar is (re)rendered.
            sidebar_needs_update.set(sidebar_needs_update.get() + 1)

            ui.notification_show("Data loaded! Check the left sidebar for analysis tools.", type="message", duration=10)
            ui.div(
                ui.h5("Ready for Analysis!", style="color: #5567BB;"),
                ui.p("You can now click on the side menu options (e.g. 'Dataset' -> 'Main Information') to start exploring."),
                style="text-align:center; margin-top: 30px; padding: 20px; border: 2px dashed #5567BB; border-radius: 10px;"
            )
        else:
            ui.p(f"⚠️ No results found for query: '{query}'.", style="color: orange; text-align:center;")
    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing
        # the render.
        ui.div(
            ui.h5("Error during API retrieval:", style="color: red;"),
            ui.p(str(e), style="color: red;")
        )

with ui.nav_panel("None", value="collections"):
ui.h3("🚧 Warning: Merge Collection is under construction 🚧")
Expand Down Expand Up @@ -8184,9 +8312,13 @@ def update_plot_settings():


# --- Sidebar Management ---
sidebar_needs_update = reactive.Value(0)

@render.express()
@reactive.event(input.start_button)
@reactive.event(sidebar_needs_update)
def toggle_sidebar():
if sidebar_needs_update.get() == 0:
return
with ui.tags.div(id="sidebar_2", class_="custom-sidebar"):
with ui.accordion(id="sidebar_accordion_data", multiple=False, open=False):
# Info Section
Expand All @@ -8203,7 +8335,7 @@ def toggle_sidebar():
ui.input_action_button("go_filters", "Filters", class_="sidebar-button", icon=ICONS["filters"])

# Analysis Section
with ui.accordion_panel("Overview", icon=ICONS["play_colored"]):
with ui.accordion_panel("Dataset", icon=ICONS["play_colored"]):
ui.input_action_button("go_main", "Main Information", class_="sidebar-button", icon=ICONS["overview"])
ui.input_action_button("go_annual_scientific_production", "Annual Scientific Production", class_="sidebar-button", icon=ICONS["annual_growth_rate"])
ui.input_action_button("go_average_citations_per_year", "Average Citations per Year", class_="sidebar-button", icon=ICONS["average_citations_per_doc"])
Expand Down Expand Up @@ -8344,9 +8476,9 @@ def toggle_sidebar():
});
observer.observe(document.body, { childList: true, subtree: true });

// Show both sidebars when 'start_button' is clicked
// Show both sidebars when 'start_button' or API fetch buttons are clicked
document.addEventListener("click", function(e) {
if (e.target && e.target.id === "start_button") {
if (e.target && e.target.closest("#start_button, #openalex_fetch, #pubmed_fetch")) {
setSidebarState(true);
}
});
Expand Down
145 changes: 119 additions & 26 deletions functions/get_affiliationproductionovertime.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,59 +3,152 @@

def get_affiliation_production_over_time(df, top_k_affiliations):
"""
Generate a plot of affiliation's production over time.
Generate a cumulative production line chart of the top affiliations over time,
aligned perfectly with the "Most Relevant Affiliations" metric.

Args:
df: A DataFrame object containing the data.
top_k_affiliations: The number of top affiliations to display.

Returns:
A Plotly figure object representing the affiliation's production over time.
fig: A Plotly figure object representing the affiliation's production over time.
aff_top_out: Table summarizing cumulative articles published per affiliation.
"""
data = df.get()

AFF = data["AU_UN"].dropna().apply(lambda x: [aff for aff in x if aff.strip() != ""])
# Force metaTagExtraction to run with aff_disamb=True to utilize the enhanced AU_UN columns
metaTagExtraction(df, "AU_UN", aff_disamb=True)
data = df.get()

# Ensure "PY" is numeric and valid (ignore years <= 1800)
data["PY"] = pd.to_numeric(data["PY"], errors="coerce")
data = data[data["PY"] > 1800]
data = data.dropna(subset=["PY", "AU_UN"])

import unicodedata

def strip_accents(s):
if not isinstance(s, str):
return s
return "".join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def split_affiliations(x):
if isinstance(x, list):
res = []
for item in x:
if isinstance(item, str):
res.extend([a.strip() for a in item.split(";") if a.strip()])
else:
res.append(str(item).strip())
return res
elif isinstance(x, str):
return [a.strip() for a in x.split(";") if a.strip()]
return []

AFF = data["AU_UN"].dropna().apply(split_affiliations)
# Filter out rows with empty affiliation lists
AFF = AFF[AFF.apply(len) > 0]
nAFF = [len(aff) for aff in AFF]

if len(AFF) == 0:
fig = go.Figure()
fig.update_layout(
annotations=[dict(text="No affiliation data available for this dataset",
x=0.5, y=0.5, showarrow=False, font=dict(size=16))],
plot_bgcolor='white', height=300
)
fig = go.FigureWidget(fig)
return fig, pd.DataFrame(columns=["Affiliation", "Year", "Articles"])

affiliations = [aff for sublist in AFF for aff in sublist]
years = data["PY"].repeat(nAFF).values[:len(affiliations)]
# Use only the PY values from the matching AFF index rows
years = data.loc[AFF.index, "PY"].repeat(nAFF).values[:len(affiliations)]

AFFY = pd.DataFrame({
"Affiliation": affiliations,
"Year": years
}).query('Affiliation != "NA"').dropna(subset=["Affiliation", "Year"])
})
AFFY["Affiliation"] = AFFY["Affiliation"].apply(strip_accents).str.strip().str.upper()

# Filter out non-reporting placeholder values to align with Most Relevant Affiliations
invalid_vals = ["", "NA", "NAN", "NONE", "NOTREPORTED", "NOTDECLARED"]
AFFY = AFFY[~AFFY["Affiliation"].isin(invalid_vals)].dropna(subset=["Affiliation", "Year"])

if len(AFFY) == 0:
fig = go.Figure()
fig.update_layout(
annotations=[dict(text="No affiliation data available for this dataset",
x=0.5, y=0.5, showarrow=False, font=dict(size=16))],
plot_bgcolor='white', height=300
)
fig = go.FigureWidget(fig)
return fig, pd.DataFrame(columns=["Affiliation", "Year", "Articles"])

AFFY = AFFY.groupby(["Affiliation", "Year"]).size().reset_index(name="Articles")
AFFY = AFFY.pivot(index="Affiliation", columns="Year", values="Articles").fillna(0)
AFFY = AFFY.stack().reset_index(name="Articles")
AFFY["Articles"] = AFFY.groupby("Affiliation")["Articles"].cumsum()
# Group by Affiliation and Year to calculate annual counts
AFFY_grouped = AFFY.groupby(["Affiliation", "Year"]).size().reset_index(name="Articles")

Affselected = AFFY[AFFY["Year"] == AFFY["Year"].max()].nlargest(top_k_affiliations, "Articles")
# Pivot to fill gaps in years with 0
AFFY_pivot = AFFY_grouped.pivot(index="Affiliation", columns="Year", values="Articles").fillna(0)
AFFY_stacked = AFFY_pivot.stack().reset_index(name="Articles")
AFFY_stacked["Year"] = AFFY_stacked["Year"].astype(int)

AffOverTime = AFFY[AFFY["Affiliation"].isin(Affselected["Affiliation"])]
AffOverTime["Year"] = AffOverTime["Year"].astype(int)
# Calculate Cumulative Sum of articles per affiliation over time
AFFY_stacked = AFFY_stacked.sort_values(by=["Affiliation", "Year"])
AFFY_stacked["Articles"] = AFFY_stacked.groupby("Affiliation")["Articles"].cumsum()

# Create the plot
# Select the top affiliations using the total cumulative sum in the final year (Most Relevant)
final_year = AFFY_stacked["Year"].max()
top_affs = AFFY_stacked[AFFY_stacked["Year"] == final_year].nlargest(top_k_affiliations, "Articles")["Affiliation"].tolist()

AffOverTime = AFFY_stacked[AFFY_stacked["Affiliation"].isin(top_affs)]

# CRITICAL FIX: Sort by BOTH Affiliation and Year chronologically to prevent Plotly line zig-zags!
AffOverTime = AffOverTime.sort_values(by=["Affiliation", "Year"])

# Convert Affiliation to Categorical to maintain ranking order in legend
AffOverTime["Affiliation"] = pd.Categorical(
AffOverTime["Affiliation"],
categories=top_affs,
ordered=True
)
AffOverTime = AffOverTime.sort_values(by=["Affiliation", "Year"])

# Create the beautiful cumulative line chart with markers
fig = px.line(
AffOverTime,
x="Year",
y="Articles",
color="Affiliation",
markers=True,
labels={"Year": "Year", "Articles": "Cumulative Articles", "Affiliation": "Affiliation"},
template="simple_white",
)

# Customize the layout
# Customize layout with clean gridlines and legend
unique_years = sorted(AffOverTime["Year"].unique())
dtick = 1
if len(unique_years) > 1:
year_range = unique_years[-1] - unique_years[0]
if year_range > 15:
dtick = 2

fig.update_layout(
height=600,
xaxis=dict(
tickmode='array',
tickvals=AffOverTime["Year"].unique()[::max(1, len(AffOverTime["Year"].unique()) // 20)]
title="Year",
showgrid=True,
gridcolor="#EFEFEF",
tickmode="linear",
dtick=dtick
),
yaxis=dict(
title="Cumulative N. of Articles",
showgrid=True,
gridcolor="#EFEFEF",
zeroline=False
),
yaxis_title="Cumulative Articles",
xaxis_title="Year",
plot_bgcolor='white',
title_font_size=24,
font=dict(color="#444444"),
margin=dict(l=40, r=40, t=40, b=40),
height=600,
margin=dict(l=50, r=50, t=50, b=50),
legend=dict(
title="Affiliation",
orientation="h",
Expand All @@ -67,11 +160,11 @@ def get_affiliation_production_over_time(df, top_k_affiliations):
)
)

# Customize the grid
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#EFEFEF')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#EFEFEF')
fig = go.FigureWidget(fig)
fig._config = fig._config | {'modeBarButtonsToRemove': ['pan', 'select', 'lasso2d', 'toImage'],
'displaylogo': False}

return fig, AffOverTime
# Sort final dataframe for clean return/display
aff_top_out = AffOverTime.sort_values(by=["Year", "Affiliation"])

return fig, aff_top_out
Loading