-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
2219 lines (1875 loc) · 96.3 KB
/
utils.py
File metadata and controls
2219 lines (1875 loc) · 96.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import os
import json
import re
import hashlib
import time
from functools import wraps
from getpass import getpass
from concurrent.futures import ThreadPoolExecutor, as_completed
# Core libraries
import boto3 # For AWS Comprehend Medical
import psycopg2 # For connecting to the local UMLS PostgreSQL DB
import requests # For making API calls to the LLM
from neo4j import GraphDatabase # The official Neo4j driver
from sentence_transformers import SentenceTransformer # For creating vector embeddings
# Recommended embedding models (in order of quality):
# 1. 'all-mpnet-base-v2' - Best balanced (768d, 420MB) ⭐ RECOMMENDED
# 2. 'BAAI/bge-large-en-v1.5' - Highest quality (1024d, 1.34GB)
# 3. 'BAAI/bge-small-en-v1.5' - Fast and good (384d, 133MB)
# 4. 'nomic-ai/nomic-embed-text-v1.5' - Great for scientific text (768d, 548MB)
# 5. 'all-MiniLM-L6-v2' - Fastest but lower quality (384d, 80MB) - CURRENT
# LlamaIndex components for text splitting
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
print("All libraries imported successfully.")

# =============================================================================
# STEP 2: CONFIGURATION & SECRETS
#
# Best practice: keep secrets in environment variables or a .env file, not in
# the code. Every setting below is read with os.getenv(); the second argument
# is only a local-development fallback.
# =============================================================================

# --- LLM Configuration ---
# Replace with your chosen LLM API endpoint and key.
LLM_API_URL = os.getenv("LLM_API_URL", "YOUR_LLM_API_ENDPOINT_HERE")
# NOTE: LLM_API_KEY is intentionally not defined here. If it is reinstated, do
# not write os.getenv("LLM_API_KEY", getpass(...)): the default argument is
# evaluated eagerly, so the prompt would appear even when the variable is set.

# --- AWS Configuration ---
# The Boto3 client automatically uses credentials from the environment
# (e.g., from `aws configure` or an IAM role).
AWS_REGION = os.getenv("AWS_REGION", "us-east-1")  # e.g., 'us-east-1'

# --- Neo4j Configuration ---
# Defaults target a local Neo4j Desktop instance for development.
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j")
# Read from the environment like every other secret; the literal is kept only
# as the previous development fallback and must be overridden outside dev.
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "qwerty123")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE", "neo4j")  # Default database

# --- PostgreSQL (UMLS) Configuration ---
UMLS_DB_NAME = os.getenv("UMLS_DB_NAME", "umls")
UMLS_DB_USER = os.getenv("UMLS_DB_USER", "postgres")
UMLS_DB_PASSWORD = os.getenv("UMLS_DB_PASSWORD", "qwerty123")  # UPDATE THIS
UMLS_DB_HOST = os.getenv("UMLS_DB_HOST", "localhost")
UMLS_DB_PORT = os.getenv("UMLS_DB_PORT", "5432")

# --- Source Data ---
SOURCE_DOCUMENT_PATH = "Biomedical_Knowledgebase.txt"
SOURCE_DOCUMENT_NAME = "Biomedical_Knowledgebase.txt"

print("Configuration loaded.")
# =============================================================================
# DOCUMENT CONTEXT & SPECIES CONFIGURATION
# =============================================================================
# Node types for which species is part of the entity's identity.
# apply_species_logic_to_node() keeps (or inherits) a `species` field for
# these types and removes it from nodes of every other type.
SPECIES_SPECIFIC_NODE_TYPES = ['Gene', 'Protein', 'Anatomy', 'Cell_Type']
# Prompt used by extract_document_context(). It is rendered with
# str.format(header_text=...), which is why the literal braces of the JSON
# skeleton at the end are doubled ({{ }}).
DOCUMENT_CONTEXT_EXTRACTION_PROMPT = """
Analyze the beginning of this research paper and extract metadata in JSON format.
**TEXT (first 75 lines):**
{header_text}
**INSTRUCTIONS:**
Extract the following information:
1. **Bibliographic metadata:**
- title: Full paper title
- authors: Author list (format: "FirstAuthor, SecondAuthor, et al." - max 3 names)
- journal: Journal or publication name
- publication_year: Year only (YYYY format)
- doi: DOI if present, otherwise null
2. **Species information:**
- primary_species: Scientific name of PRIMARY organism studied
* Look in Abstract and Methods sections
* Examples: "Homo sapiens", "Mus musculus", "Rattus norvegicus"
* If human clinical/medical context with no explicit mention: "Homo sapiens (implied)"
* If computational/review with no specific organism: "not specified"
- species_confidence: "high" (explicitly stated), "medium" (implied from context), "low" (unclear)
- species_evidence: Brief quote showing where species was found (max 100 chars)
3. **Study type:**
- study_type: "clinical trial" | "animal study" | "in vitro" | "computational" | "review" | "case report" | "other"
**IMPORTANT:**
- Return ONLY valid JSON, no other text
- If a field cannot be determined, use "Unknown" for strings or null for optional fields
- Use exact scientific names for species (capitalize genus, lowercase species epithet)
Return JSON:
{{
"title": "string",
"authors": "string",
"journal": "string",
"publication_year": "YYYY",
"doi": "string or null",
"primary_species": "string",
"species_confidence": "high|medium|low",
"species_evidence": "string",
"study_type": "string"
}}
"""
print("Document context configuration loaded.")
# =============================================================================
# RETRY DECORATOR FOR TRANSIENT FAILURES
# =============================================================================
def retry_on_failure(max_retries=3, initial_delay=1.0, backoff_factor=2.0, exceptions=(Exception,)):
    """
    Decorator factory: retry a function on failure with exponential backoff.

    The wrapped function is called up to ``max_retries + 1`` times. After each
    failed attempt except the last, the wrapper sleeps for the current delay
    and then multiplies the delay by ``backoff_factor``.

    Args:
        max_retries: Maximum number of retry attempts (default: 3). Negative
            values are treated as 0, i.e. the function is called exactly once.
        initial_delay: Delay in seconds before the first retry (default: 1.0)
        backoff_factor: Multiplier applied to the delay after each retry (default: 2.0)
        exceptions: Tuple of exception types to catch and retry (default: all exceptions)

    Returns:
        A decorator; the decorated function returns the wrapped function's
        result.

    Raises:
        The exception of the final failed attempt once the retry budget is
        spent. Exception types not listed in ``exceptions`` propagate
        immediately without a retry.
    """
    # A negative budget used to skip the loop entirely and end in `raise None`
    # (a TypeError) without the function ever being called.
    retries = max(0, max_retries)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            delay = initial_delay
            for attempt in range(retries + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    if attempt >= retries:
                        print(f" - ❌ All {retries + 1} attempts failed")
                        # Bare raise re-raises the active exception unchanged.
                        raise
                    print(f" - ⚠️ Attempt {attempt + 1}/{retries + 1} failed: {str(e)[:100]}")
                    print(f" - 🔄 Retrying in {delay:.1f}s...")
                    time.sleep(delay)
                    delay *= backoff_factor

        return wrapper

    return decorator
# =============================================================================
# STEP 2.5: LLM INITIALIZATION
# Initialize AWS Bedrock LLM for entity extraction
# =============================================================================
def initialize_llm():
    """
    Initialize AWS Bedrock LLM (Claude 3.7 Sonnet) for medical entity extraction.

    Uses an inference profile ID (not a direct model ID) for on-demand access,
    in the region given by the module-level AWS_REGION setting.

    Returns:
        A LlamaIndex Bedrock LLM object.

    Raises:
        RuntimeError: If the Bedrock integration cannot be imported or the
            client cannot be constructed. The original exception is chained
            as ``__cause__``.
    """
    try:
        # Imported lazily so the module can be loaded without the Bedrock
        # integration installed.
        from llama_index.llms.bedrock import Bedrock

        llm = Bedrock(
            model="us.anthropic.claude-3-7-sonnet-20250219-v1:0",  # Inference profile
            region_name=AWS_REGION,
            temperature=0.1,
            max_tokens=8192,  # Increased to handle large entity/relationship lists
            context_size=200000,  # Claude 3.7 Sonnet has 200K context window
            additional_kwargs={
                "top_p": 0.9,
            },
        )
        print("✅ AWS Bedrock LLM (Claude 3.7 Sonnet) initialized successfully")
        return llm
    except Exception as e:
        print(f"❌ Error initializing Bedrock LLM: {e}")
        print("\n💡 Troubleshooting:")
        print(" 1. Check AWS credentials: aws sts get-caller-identity")
        # Report the region the client was actually configured with, not a
        # fixed one, so the hints stay correct when AWS_REGION is overridden.
        print(f" 2. Check Bedrock access in {AWS_REGION} region")
        print(" 3. Verify inference profile access:")
        print(f" aws bedrock list-inference-profiles --region {AWS_REGION}")
        print(" 4. Model access may need to be enabled in Bedrock console")
        raise RuntimeError(f"Could not initialize AWS Bedrock LLM: {e}") from e


print("LLM initialization function defined.")
def initialize_llm_lmstudio(base_url="http://127.0.0.1:1234/v1", model_name="qwen3-30b-a3b-2507"):
    """
    Initialize LM Studio local LLM server for medical entity extraction.

    Args:
        base_url: LM Studio server URL (default: http://127.0.0.1:1234/v1)
        model_name: Model identifier sent with every completion request and
            shown in log output

    Returns:
        A wrapper object exposing ``complete(prompt, **kwargs)`` whose result
        has a ``.text`` attribute (the LlamaIndex-style interface used by the
        rest of this module).

    Raises:
        ImportError: If the ``openai`` package is not installed.
        Exception: Any other error raised while building the client is
            logged and re-raised unchanged.
    """
    try:
        from openai import OpenAI

        # OpenAI client pointed at the LM Studio server
        client = OpenAI(
            base_url=base_url,
            api_key="lm-studio",  # Dummy key for local server
        )

        # Connection test is best-effort: a failure is only reported, so a
        # busy server does not abort initialization.
        try:
            models = client.models.list()
            available_models = [m.id for m in models.data]
            print(f"✅ LM Studio server connected: {base_url}")
            print(f" Available models: {', '.join(available_models)}")
        except Exception as e:
            print(f"⚠️ Could not list models (server may be busy): {e}")

        class Response:
            """Minimal completion result exposing the ``.text`` attribute."""

            def __init__(self, text):
                self.text = text

        class LMStudioLLM:
            """Adapter giving the OpenAI-style client a ``complete()`` method."""

            def __init__(self, client, model_name):
                self.client = client
                self.model_name = model_name
                self.temperature = 0.7
                self.max_tokens = 8192  # Match Claude's setting - prevents JSON truncation (Qwen3 supports up to 32K)

            def complete(self, prompt, **kwargs):
                """Complete a prompt (LlamaIndex-compatible interface)"""
                # Per-call kwargs override the instance defaults
                temperature = kwargs.get('temperature', self.temperature)
                max_tokens = kwargs.get('max_tokens', self.max_tokens)
                try:
                    response = self.client.chat.completions.create(
                        model=self.model_name,
                        messages=[{"role": "user", "content": prompt}],
                        temperature=temperature,
                        max_tokens=max_tokens,
                        top_p=0.8,
                        # Note: top_p only; top_k not supported by LM Studio's OpenAI-compatible API
                    )
                    content = response.choices[0].message.content
                    # The API may return no content; normalize to "" so
                    # callers can always treat .text as a string.
                    return Response(content if content is not None else "")
                except Exception as e:
                    print(f" - ❌ LM Studio API error: {e}")
                    raise

        llm_wrapper = LMStudioLLM(client, model_name)
        print(f"✅ LLM initialized successfully (LM Studio - {model_name})")
        print(f" Server: {base_url}")
        print(f" Settings: temp={llm_wrapper.temperature}, max_tokens={llm_wrapper.max_tokens}")
        return llm_wrapper
    except ImportError:
        print("❌ OpenAI package not installed. Run: pip install openai")
        raise
    except Exception as e:
        print(f"❌ Error initializing LM Studio LLM: {e}")
        print(f" Make sure LM Studio server is running at {base_url}")
        raise


print("LM Studio LLM initialization function defined.")
# =============================================================================
# DOCUMENT CONTEXT EXTRACTION FUNCTIONS
# =============================================================================
@retry_on_failure(max_retries=3, initial_delay=2.0)
def extract_document_context(file_path, source_id, llm):
    """
    Extract complete document context (metadata + species) in single LLM call.

    The first 75 lines of the file are sent to the LLM with
    DOCUMENT_CONTEXT_EXTRACTION_PROMPT and the reply is parsed as JSON. Keys
    the reply omits are filled from safe defaults, and the pipeline-owned
    fields (source_id, source_type, source_platform, processing_date,
    document_path) always overwrite whatever the LLM returned.

    Args:
        file_path: Path to the document text file
        source_id: Unique identifier for the source (e.g., "PMC8675309")
        llm: Initialized LLM instance exposing ``complete(prompt)``

    Returns:
        dict: Complete document context with metadata and species information.
        If the reply is not a JSON object, the safe defaults are returned.

    Raises:
        Exception: File-read errors and LLM call errors propagate (after the
            retries applied by the decorator).
    """
    from datetime import datetime
    from itertools import islice
    from pathlib import Path

    # Read first 75 lines from document (fewer if the file is shorter)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            header_text = ''.join(islice(f, 75))
    except Exception as e:
        print(f" ⚠️ Error reading file {file_path}: {e}")
        raise

    # Extract metadata via LLM
    prompt = DOCUMENT_CONTEXT_EXTRACTION_PROMPT.format(header_text=header_text)
    response = llm.complete(prompt)
    response_text = response.text if hasattr(response, 'text') else str(response)

    # Clean response (remove markdown code blocks if present)
    response_text = response_text.strip()
    if response_text.startswith('```json'):
        response_text = response_text[7:]
    if response_text.startswith('```'):
        response_text = response_text[3:]
    if response_text.endswith('```'):
        response_text = response_text[:-3]
    response_text = response_text.strip()

    try:
        metadata = json.loads(response_text)
        # Valid JSON that is not an object (list, string, number) cannot
        # carry the metadata fields; handle it like a parse failure instead
        # of failing on item assignment and burning retries.
        if not isinstance(metadata, dict):
            raise ValueError(f"expected a JSON object, got {type(metadata).__name__}")
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        print(f" ⚠️ Failed to parse document context JSON: {e}")
        print(f" Using safe defaults for {source_id}")
        metadata = {}

    # Used for any key the LLM reply does not contain, so downstream code can
    # rely on e.g. 'primary_species' always being present.
    defaults = {
        'title': 'Unknown',
        'authors': 'Unknown',
        'journal': 'Unknown',
        'publication_year': 'Unknown',
        'doi': None,
        'primary_species': 'not specified',
        'species_confidence': 'low',
        'species_evidence': 'Unable to extract',
        'study_type': 'other',
    }
    # Pipeline-owned fields; these take precedence over LLM output.
    pipeline_fields = {
        'source_id': source_id,
        'source_type': 'research_article',
        'source_platform': 'PubMed Central',
        'processing_date': datetime.now().isoformat(),
        'document_path': str(Path(file_path).absolute()),
    }
    return {**defaults, **metadata, **pipeline_fields}
def apply_species_logic_to_node(node, document_context):
    """
    Apply species handling rules to extracted nodes.

    Node types listed in SPECIES_SPECIFIC_NODE_TYPES carry species as part of
    their identity; every other type has its species fields removed. The node
    is modified in place.

    Args:
        node: Dict representing an extracted entity
        document_context: Document metadata dict with species info

    Returns:
        dict: The same node object, with species handling applied and the
        ``_needs_species_suffix`` flag set.
    """
    if node.get('entity_type', '') not in SPECIES_SPECIFIC_NODE_TYPES:
        # Species is NOT part of identity - remove from node if present
        node.pop('species', None)
        node.pop('species_confidence', None)
        node['_needs_species_suffix'] = False
        return node

    # Species IS part of identity: inherit the document default when the
    # extractor left it out or empty.
    if not node.get('species'):
        # Fall back to the same 'not specified' default that
        # extract_document_context uses, rather than raising KeyError or
        # storing None when the context has no usable primary_species.
        node['species'] = document_context.get('primary_species') or 'not specified'
        node['species_confidence'] = 'inherited'
    # Mark that this node needs species in ontology_id (will be added after standardization)
    node['_needs_species_suffix'] = True
    return node
def apply_species_logic_to_relationship(rel, document_context):
    """
    Ensure all relationships have species metadata.

    The relationship is modified in place. A missing or empty ``species`` is
    replaced by the document's primary species and marked 'inherited'; a
    missing or empty ``species_confidence`` is also set to 'inherited'.

    Args:
        rel: Dict representing an extracted relationship
        document_context: Document metadata dict with species info

    Returns:
        dict: The same relationship object with species fields populated
    """
    # Always require species in relationships
    if not rel.get('species'):
        # Fall back to the pipeline's 'not specified' default instead of
        # raising KeyError (or storing None) when the document context has
        # no usable primary_species.
        rel['species'] = document_context.get('primary_species') or 'not specified'
        rel['species_confidence'] = 'inherited'
    # Fill in a missing confidence; existing values are kept as-is
    if not rel.get('species_confidence'):
        rel['species_confidence'] = 'inherited'
    return rel


print("Document context extraction functions defined.")
# =============================================================================
# STEP 3: DEFINE THE COMPREHENSIVE SCHEMA (v1.5)
# This schema will be used in our LLM prompt.
# =============================================================================
# Entity labels the extractor may assign, grouped by domain. Order is preserved
# because the list is joined into the LLM prompt.
_SCHEMA_NODE_TYPES = [
    # --- Clinical Concepts ---
    "Disease",  # A specific illness, disorder, or abnormal medical condition (e.g., "Hypertension", "ADHD"). Includes general conditions.
    "Pathological_Finding",  # An objective, structural or functional abnormality found via examination or testing (e.g., "Aortic aneurysm", "Gallstones").
    "Symptom",  # A subjective experience reported by a patient (e.g., "headache", "dizziness", "low self-esteem").
    "Clinical_Finding",  # An objective sign observed or measured by a clinician (e.g., "aggressive behavior", "high white blood cell count").
    "Side_Effect",  # An adverse reaction or unintended consequence of a medication or treatment (e.g., "insomnia", "nausea"). Includes complications.
    # --- Interventions ---
    "Medication",  # A specific drug or pharmaceutical substance (e.g., "Ritalin", "Lisinopril").
    "Treatment",  # A therapeutic regimen or procedure that is not a drug (e.g., "psychiatric hospital", "acupuncture").
    "Diagnostic_Procedure",  # A test, scan, or method used to diagnose a condition (e.g., "Abdominal ultrasound", "biopsy").
    "Medical_Device",  # A physical tool or instrument used in a medical procedure (e.g., "ultrasound probe", "pacemaker").
    # --- Biological & Genetic Concepts ---
    "Anatomy",  # A specific body part, organ, or physiological system (e.g., "liver", "abdominal aorta").
    "Cell_Type",  # A specific type of cell (e.g., "T cells", "Beta cells", "Neurons", "Hepatocytes").
    "Pathogen",  # An infectious agent that causes disease (e.g., "Bacillus anthracis").
    "Gene",  # A specific gene involved in biological processes or diseases (e.g., "NF2 gene").
    "Protein",  # A specific protein molecule.
    "Genetic_Disorder",  # A disease specifically caused by genetic abnormalities (e.g., "Achondroplasia").
    "Biological_Process",  # A physiological or cellular mechanism (e.g., "inflammation", "liver metabolism").
    # --- Contextual & Epidemiological Concepts ---
    "Clinical_Study",  # A formal research investigation, trial, or study (e.g., "randomized controlled trial").
    "Age_Group",  # A specific patient population defined by age (e.g., "children", "elderly").
    "Lifestyle_Factor",  # A behavioral factor that influences health (e.g., "Smoking", "Alcohol consumption").
    "Environmental_Factor",  # An external, non-behavioral factor that influences health (e.g., "home environment", "asbestos exposure").
    # --- Technology & Systems ---
    "Technology",  # Computational, AI, or technological methods used in healthcare (e.g., "Machine Learning", "Artificial Intelligence").
    "Healthcare_System",  # Healthcare delivery institutions or organizational units (e.g., "Emergency Department", "Basic Health Units").
    "Health_Policy",  # Government regulations, policies, or health programs (e.g., "National Emergency Care Policy").
    # --- Social & Demographic ---
    "Gender",  # Biological sex or gender identity (e.g., "Male", "Female", "Non-binary"). Separated for clinical precision.
    "Ethnicity",  # Ethnic or racial background (e.g., "Hispanic", "Caucasian", "Asian", "African American").
    "Demographic_Factor",  # Other population characteristics not covered by Gender/Ethnicity (e.g., "Married", "Urban resident").
    "Social_Program",  # Government or institutional support programs (e.g., "Family Allowance", "Medicare").
    "Social_Determinant",  # Socioeconomic factors affecting health outcomes (e.g., "Poverty", "Education Level", "Income Inequality").
    "Geographic_Location",  # Specific places or areas relevant to health (e.g., "Urban Area", "Rural Setting", "Pelotas City").
    # --- Measurement & Quantification ---
    "Biomarker",  # Measurable biological indicators (e.g., "HbA1c", "Blood pressure", "PSA", "CD4 count", "Troponin").
    "Clinical_Outcome",  # Study endpoints or clinical results (e.g., "Mortality", "Disease-free survival", "Remission", "Quality of life").
    "Dosage",  # Medication amounts and administration schedules (e.g., "10mg daily", "500mg twice daily", "Loading dose").
    "Statistical_Measure",  # Research metrics and statistical concepts (e.g., "Odds Ratio", "P-value", "Hazard Ratio", "Sensitivity").
]

# Edge labels the extractor may assign between the node types above.
_SCHEMA_RELATIONSHIP_TYPES = [
    # --- Hierarchical & Definitional ---
    "IS_A_TYPE_OF",  # Creates a hierarchy between concepts (e.g., (:B-mode_ultrasound)-[:IS_A_TYPE_OF]->(:Ultrasound)).
    "PRESENTS_AS",  # Links a disease to its objective manifestation (e.g., (:Gallbladder_Disease)-[:PRESENTS_AS]->(:Gallstones)).
    # --- Clinical Relationships ---
    "HAS_SYMPTOM",  # Connects a disease to a subjective symptom (e.g., (:Migraine)-[:HAS_SYMPTOM]->(:Headache)).
    "HAS_FINDING",  # Connects a disease to an objective clinical sign (e.g., (:Jaundice)-[:HAS_FINDING]->(:Yellow_Skin)).
    "TREATED_BY",  # Connects a disease to a medication or treatment that manages or cures it.
    "PREVENTS",  # Connects an intervention to a disease it can prevent (e.g., (:Vaccination)-[:PREVENTS]->(:Measles)).
    "DIAGNOSED_BY",  # Connects a disease to a procedure used to identify it.
    "HAS_INDICATION",  # Connects a procedure/treatment to the condition it is used for (e.g., (:Ultrasound)-[:HAS_INDICATION]->(:Abdominal_Pain)).
    "HAS_CONTRAINDICATION",  # Connects a procedure/treatment to a condition where it would be harmful.
    "HAS_COMPLICATION",  # Connects a treatment/procedure to a potential negative outcome.
    "HAS_SIDE_EFFECT",  # Connects a medication to a known adverse reaction.
    "USES_MEDICATION",  # Connects a treatment regimen to a specific drug it involves (e.g., (:Chemotherapy)-[:USES_MEDICATION]->(:Cisplatin)).
    "USES_DEVICE",  # Connects a procedure to a medical device it requires.
    # --- Biological & Causal Relationships ---
    "AFFECTS",  # Connects a disease or process to the anatomy it impacts (e.g., (:Hepatitis)-[:AFFECTS]->(:Liver)).
    "CAUSED_BY",  # Connects a disease to its direct etiological cause (e.g., (:Anthrax)-[:CAUSED_BY]->(:Bacillus_anthracis)).
    "INCREASES_RISK_FOR",  # Connects a risk factor to a disease (e.g., (:Smoking)-[:INCREASES_RISK_FOR]->(:Lung_Cancer)).
    "METABOLIZED_BY",  # Connects a medication to a biological process (e.g., (:Acetaminophen)-[:METABOLIZED_BY]->(:Liver_Enzyme_Activity)).
    "ASSOCIATED_WITH",  # Connects a gene to a genetic disorder.
    "CODES_FOR",  # Connects a gene to the protein it creates.
    # --- Contextual Relationships ---
    "STUDIED_IN",  # Connects a concept (like a drug or treatment) to the clinical study that investigated it.
    "OCCURS_IN_AGE_GROUP",  # Connects a disease to a specific age population.
    # --- Technology & System Relationships ---
    "UTILIZES",  # Connects a study/system to the technology it employs (e.g., (:Study)-[:UTILIZES]->(:Machine_Learning)).
    "PROVIDED_BY",  # Connects a healthcare service to the system that delivers it (e.g., (:Emergency_Care)-[:PROVIDED_BY]->(:Hospital)).
    "REGULATED_BY",  # Connects a healthcare entity to the policy that governs it (e.g., (:Emergency_Services)-[:REGULATED_BY]->(:Health_Policy)).
    # --- Social & Demographic Relationships ---
    "ELIGIBLE_FOR",  # Connects a demographic group to a program they can access (e.g., (:Elderly)-[:ELIGIBLE_FOR]->(:Medicare)).
    "INFLUENCED_BY",  # Connects a health outcome to a social determinant (e.g., (:Diabetes_Prevalence)-[:INFLUENCED_BY]->(:Poverty)).
    "LOCATED_IN",  # Connects an entity to its geographic location (e.g., (:Hospital)-[:LOCATED_IN]->(:Urban_Area)).
    "OCCURS_MORE_IN",  # Connects a disease to a gender/ethnicity with higher prevalence (e.g., (:Breast_Cancer)-[:OCCURS_MORE_IN]->(:Female)).
    # --- Measurement & Quantification Relationships ---
    "MEASURED_BY",  # Connects a disease/condition to a biomarker used to measure it (e.g., (:Diabetes)-[:MEASURED_BY]->(:HbA1c)).
    "INDICATES",  # Connects a biomarker to what it signifies (e.g., (:Elevated_PSA)-[:INDICATES]->(:Prostate_Cancer)).
    "MONITORED_BY",  # Connects a treatment to a biomarker used to track response (e.g., (:Chemotherapy)-[:MONITORED_BY]->(:Tumor_Markers)).
    "IMPROVES_OUTCOME",  # Connects a treatment to a positive clinical outcome (e.g., (:Immunotherapy)-[:IMPROVES_OUTCOME]->(:Survival)).
    "WORSENS_OUTCOME",  # Connects a risk factor to a negative outcome (e.g., (:Smoking)-[:WORSENS_OUTCOME]->(:Mortality)).
    "ADMINISTERED_AT",  # Connects a medication to its dosage regimen (e.g., (:Aspirin)-[:ADMINISTERED_AT]->(:81mg_daily)).
    "QUANTIFIED_BY",  # Connects a relationship/finding to its statistical measure (e.g., (:Risk_Association)-[:QUANTIFIED_BY]->(:Odds_Ratio)).
]

# The schema handed to the LLM prompt: allowed node labels and edge labels.
COMPREHENSIVE_SCHEMA = {
    "node_types": _SCHEMA_NODE_TYPES,
    "relationship_types": _SCHEMA_RELATIONSHIP_TYPES,
}

print("Comprehensive schema defined.")
# =============================================================================
# STEP 4: DEFINE THE LLM EXTRACTION PROMPT
# =============================================================================
# Current prompt template revision: v1.3 (supersedes v1.2 and earlier drafts).
EXTRACTION_PROMPT_TEMPLATE = """
-GOAL-
You are a world-class biomedical informatics expert. Your task is to act as a precision knowledge extraction engine from a given medical text document. Identify all relevant medical entities and their relationships according to the provided schema.
-DOCUMENT CONTEXT-
**Source:** {source_title}
**Journal:** {source_journal} ({source_year})
**Primary Species Studied:** {document_species}
**Study Type:** {study_type}
-SCHEMA DEFINITION-
Node Types: {node_types}
Relationship Types: {relationship_types}
-CRITICAL INSTRUCTIONS-
**ALWAYS expand medical abbreviations to their full terms:**
- Use complete medical terminology, never abbreviations
- Example: "MI" → "Myocardial Infarction"
- Example: "COPD" → "Chronic Obstructive Pulmonary Disease"
- Example: "BP" → "Blood Pressure"
- Example: "CT" → "Computed Tomography"
- Example: "MRI" → "Magnetic Resonance Imaging"
- If unsure about an abbreviation, use your best medical knowledge based on context
**SPECIES HANDLING RULES:**
For ENTITIES:
- For node types: Gene, Protein, Anatomy, Cell_Type → INCLUDE "species" field
Example: {{"entity_name": "TP53", "entity_type": "Gene", "species": "Homo sapiens", ...}}
- For node types: Drug, Disease, Treatment, Symptom, Medication, Biological_Process, Pathogen, etc. → DO NOT include "species" field
Example: {{"entity_name": "Aspirin", "entity_type": "Drug", ...}} ← No species field
For RELATIONSHIPS:
- ALWAYS include "species" and "species_confidence" fields
- Default species: {document_species}
- **species_confidence** options:
* "explicit": Species is directly mentioned in the text chunk
* "inherited": Species not mentioned in chunk, using document default ({document_species})
* "speculative": Discussing hypothetical cross-species implications
* "unknown": Cannot determine species
Examples:
- Chunk says "In mice, drug X reduced tumors" → species: "Mus musculus", species_confidence: "explicit"
- Chunk says "Drug X reduced tumors" (no species mentioned) → species: "{document_species}", species_confidence: "inherited"
- Chunk says "This may be applicable to humans" → species: "Homo sapiens", species_confidence: "speculative"
-EXTRACTION STEPS-
1. **Identify Entities:** Carefully read the text and identify all terms that match one of the node types in the schema. For each entity, you must extract:
- `entity_name`: **ALWAYS use the fully expanded medical term, never abbreviations**
- `entity_type`: The corresponding type from the schema's Node Types list.
- `entity_description`: A concise, one-sentence description of the entity based on its context in the text.
- `species`: ONLY for Gene, Protein, Anatomy, Cell_Type entities (see SPECIES HANDLING RULES above)
2. **Identify Relationships:** Identify all relationships between the entities you found. The relationship must match one of the types in the schema. For each relationship, you must extract:
- `source_entity_name`: The name of the source entity.
- `source_entity_type`: The type of the source entity.
- `target_entity_name`: The name of the target entity.
- `target_entity_type`: The type of the target entity.
- `relation_type`: The corresponding type from the schema's Relationship Types list.
- `relationship_description`: A concise, one-sentence explanation of the relationship based on the text.
- `species`: The species this relationship applies to (REQUIRED - see SPECIES HANDLING RULES)
- `species_confidence`: How certain the species assignment is (REQUIRED - see SPECIES HANDLING RULES)
-OUTPUT FORMATTING-
1. **CRITICAL:** Your entire response must be ONLY a single, valid JSON object. Do not include any introductory text, greetings, or markdown formatting like ```json.
2. The JSON object must have two primary keys: "entities" and "relationships".
3. The value for each key must be a list of JSON objects, where each object follows the structure defined in the EXTRACTION STEPS.
4. If no entities or relationships are found, return an empty list for the corresponding key.
-EXAMPLE OF THE EXACT JSON STRUCTURE REQUIRED-
```json
{{
"entities": [
{{"entity_name": "Aspirin", "entity_type": "Drug", "entity_description": "A common pain reliever and anti-inflammatory medication."}},
{{"entity_name": "TP53", "entity_type": "Gene", "species": "Homo sapiens", "entity_description": "A tumor suppressor gene that regulates cell division."}}
],
"relationships": [
{{"source_entity_name": "Aspirin", "source_entity_type": "Drug", "target_entity_name": "Inflammation", "target_entity_type": "Pathological_Finding", "relation_type": "TREATS", "relationship_description": "Aspirin reduces inflammation by inhibiting prostaglandin synthesis.", "species": "Homo sapiens", "species_confidence": "inherited"}}
]
}}
```
-MEDICAL TEXT TO ANALYZE-
{text_chunk}
-FINAL JSON OUTPUT-
"""
# We format the prompt with the schema details when we use it
# PROMPT_WITH_SCHEMA = EXTRACTION_PROMPT_TEMPLATE.format(
# node_types=', '.join(COMPREHENSIVE_SCHEMA['node_types']),
# relationship_types=', '.join(COMPREHENSIVE_SCHEMA['relationship_types'])
# )
# And in your process_text_chunk function, you'll now use it like this:
# prompt = PROMPT_WITH_SCHEMA.format(text_chunk=text_chunk)
print("LLM prompt template created.")
# =============================================================================
# STEP 5: THE ENRICHMENT PIPELINE
# These functions will perform the Standardization, Synonym, and Embedding steps.
# =============================================================================
# --- 5a. Standardization ---
# =============================================================================
# REVISED AND IMPROVED STANDARDIZATION FUNCTION
# =============================================================================
# Define which entity types should be processed by which AWS API.
# Now includes ALL entity types with educated guesses + fallback mechanism
# Identifiers of the two standardization back-ends an entity type can be routed to.
_SNOMED = "snomed"
_RXNORM = "rxnorm"

# Routing table: schema entity type -> standardization API to try first.
# Entries marked "unlikely AWS match" are expected to end up with a fallback ID.
ENTITY_TYPE_TO_API_MAP = dict(
    # Clinical Concepts - SNOMED CT
    Disease=_SNOMED,
    Pathological_Finding=_SNOMED,
    Symptom=_SNOMED,
    Clinical_Finding=_SNOMED,
    Side_Effect=_SNOMED,
    # Interventions - Mixed
    Medication=_RXNORM,  # Definitely RxNorm
    Treatment=_SNOMED,  # Medical procedures/therapies
    Diagnostic_Procedure=_SNOMED,  # Tests, scans, etc.
    Medical_Device=_SNOMED,  # Instruments, tools
    # Biological & Genetic Concepts - SNOMED CT (best guess)
    Anatomy=_SNOMED,
    Cell_Type=_SNOMED,  # Cellular biology (partial AWS match)
    Pathogen=_SNOMED,  # Infectious organisms
    Gene=_SNOMED,  # Best guess, might fallback
    Protein=_SNOMED,  # Best guess, might fallback
    Genetic_Disorder=_SNOMED,  # Hereditary conditions
    Biological_Process=_SNOMED,  # Physiological processes
    # Contextual Concepts - SNOMED CT (educated guesses)
    Clinical_Study=_SNOMED,  # Research terminology
    Age_Group=_SNOMED,  # Demographics in SNOMED
    Lifestyle_Factor=_SNOMED,  # Behavioral factors
    Environmental_Factor=_SNOMED,  # External factors
    # Technology & Systems - Likely not in medical ontologies (will fallback to BIOGRAPH)
    Technology=_SNOMED,  # Computational methods (unlikely AWS match)
    Healthcare_System=_SNOMED,  # Organizational entities (unlikely AWS match)
    Health_Policy=_SNOMED,  # Government regulations (unlikely AWS match)
    # Social & Demographic - SNOMED CT (some may fallback)
    Gender=_SNOMED,  # Biological sex (likely AWS match)
    Ethnicity=_SNOMED,  # Ethnic groups (partial AWS match)
    Demographic_Factor=_SNOMED,  # Other demographics
    Social_Program=_SNOMED,  # Welfare programs (unlikely AWS match)
    Social_Determinant=_SNOMED,  # Socioeconomic factors (partial match)
    Geographic_Location=_SNOMED,  # Places (unlikely AWS match)
    # Measurement & Quantification - Mixed
    Biomarker=_SNOMED,  # Lab tests, vital signs (likely AWS match)
    Clinical_Outcome=_SNOMED,  # Clinical results (partial AWS match)
    Dosage=_RXNORM,  # Medication dosing (RxNorm likely better)
    Statistical_Measure=_SNOMED,  # Research metrics (unlikely AWS match)
)

# Confidence threshold for standardization matches.
# Lowered to capture borderline cases like "Tumors" (0.69) after format optimization.
MIN_CONFIDENCE_SCORE = 0.68
def generate_fallback_id(entity_name: str, entity_type: str) -> str:
    """Create a deterministic, project-specific ID for an unlinked entity.

    The name is NFKC-normalized, case-folded and reduced to its alphanumeric
    characters, so variants that differ only in case, spacing or punctuation
    ("Heart Attack", "heart-attack") share one ID.  Non-ASCII letters and
    digits are kept: dropping them made names that differ only in such
    characters (e.g. "TNF-α" vs. "TNF-β") collapse into the same ID.  For
    pure-ASCII names the result is identical to the earlier ASCII-only
    normalization; IDs of names containing non-ASCII letters or digits differ
    from the ones produced before this fix.

    Args:
        entity_name: Surface form of the entity.
        entity_type: Entity category; upper-cased and embedded in the ID.

    Returns:
        A string of the form ``BIOGRAPH:<ENTITY_TYPE>:<12 hex chars>``, where
        the hex part is the start of the SHA-1 digest of the normalized name.
    """
    import unicodedata  # local import keeps this block self-contained

    # NFKC first so composed/decomposed and compatibility forms of the same
    # character normalize to one spelling before hashing.
    folded = unicodedata.normalize("NFKC", entity_name).casefold()
    normalized_name = "".join(ch for ch in folded if ch.isalnum())
    if not normalized_name:
        # Names without any alphanumeric character (e.g. "+", "-") would all
        # hash the empty string; fall back to the trimmed, folded name.
        normalized_name = folded.strip()
    # SHA-1 serves only as a stable fingerprint here, not as a security measure.
    hashed_id = hashlib.sha1(normalized_name.encode("utf-8")).hexdigest()[:12]
    return f"BIOGRAPH:{entity_type.upper()}:{hashed_id}"
# =============================================================================
# COMPREHENSIVE MEDICAL ABBREVIATION DICTIONARY
# 200+ common medical abbreviations for fallback expansion
# Note: LLM does primary expansion (context-aware), this is backup
# =============================================================================
# Each abbreviation appears exactly once below. A dict literal silently keeps
# only the LAST value for a repeated key, so earlier revisions that listed an
# abbreviation in two sections lost one of the meanings without warning.
# Where an abbreviation is genuinely ambiguous, the expansion that was in
# effect is kept and the alternative is recorded in a comment.
ABBREVIATION_MAP = {
    # ========== CARDIOVASCULAR ==========
    "MI": "Myocardial Infarction",
    "CHF": "Congestive Heart Failure",
    "AF": "Atrial Fibrillation",
    "AFib": "Atrial Fibrillation",
    "CAD": "Coronary Artery Disease",
    "PVD": "Peripheral Vascular Disease",
    "DVT": "Deep Vein Thrombosis",
    "PE": "Pulmonary Embolism",
    "HTN": "Hypertension",
    "HBP": "High Blood Pressure",
    "CABG": "Coronary Artery Bypass Graft",
    "PCI": "Percutaneous Coronary Intervention",
    "STEMI": "ST-Elevation Myocardial Infarction",
    "NSTEMI": "Non-ST-Elevation Myocardial Infarction",
    "SVT": "Supraventricular Tachycardia",
    "VT": "Ventricular Tachycardia",
    "VF": "Ventricular Fibrillation",
    # ========== RESPIRATORY ==========
    "COPD": "Chronic Obstructive Pulmonary Disease",
    "ARDS": "Acute Respiratory Distress Syndrome",
    "URI": "Upper Respiratory Infection",
    "URTI": "Upper Respiratory Tract Infection",
    "LRTI": "Lower Respiratory Tract Infection",
    "OSA": "Obstructive Sleep Apnea",
    "TB": "Tuberculosis",
    "CF": "Cystic Fibrosis",
    "IPF": "Idiopathic Pulmonary Fibrosis",
    "SOB": "Shortness of Breath",
    "DOE": "Dyspnea on Exertion",
    # ========== NEUROLOGICAL ==========
    "CVA": "Cerebrovascular Accident",
    "TIA": "Transient Ischemic Attack",
    "ICH": "Intracranial Hemorrhage",
    "SAH": "Subarachnoid Hemorrhage",
    "MS": "Multiple Sclerosis",
    "ALS": "Amyotrophic Lateral Sclerosis",
    "PD": "Parkinson's Disease",
    "AD": "Alzheimer's Disease",
    "SCI": "Spinal Cord Injury",
    "TBI": "Traumatic Brain Injury",
    # "CP" can also mean Cerebral Palsy; the "Chest Pain" entry under
    # SYMPTOMS/FINDINGS is the expansion in effect.
    "LOC": "Loss of Consciousness",
    "AMS": "Altered Mental Status",
    # ========== PSYCHIATRIC ==========
    "ADHD": "Attention-Deficit Hyperactivity Disorder",
    "OCD": "Obsessive-Compulsive Disorder",
    "PTSD": "Post-Traumatic Stress Disorder",
    "GAD": "Generalized Anxiety Disorder",
    "MDD": "Major Depressive Disorder",
    "BPD": "Borderline Personality Disorder",
    "SAD": "Seasonal Affective Disorder",
    "BD": "Bipolar Disorder",
    "CD": "Conduct Disorder",
    "ODD": "Oppositional Defiant Disorder",
    # ========== GASTROINTESTINAL ==========
    "GERD": "Gastroesophageal Reflux Disease",
    "IBD": "Inflammatory Bowel Disease",
    "IBS": "Irritable Bowel Syndrome",
    "UC": "Ulcerative Colitis",
    "PUD": "Peptic Ulcer Disease",
    "NASH": "Non-Alcoholic Steatohepatitis",
    "NAFLD": "Non-Alcoholic Fatty Liver Disease",
    "GI": "Gastrointestinal",
    "N/V": "Nausea/Vomiting",
    "LFT": "Liver Function Test",
    # ========== ENDOCRINE/METABOLIC ==========
    "DM": "Diabetes Mellitus",
    "T1DM": "Type 1 Diabetes Mellitus",
    "T2DM": "Type 2 Diabetes Mellitus",
    "DKA": "Diabetic Ketoacidosis",
    "HHS": "Hyperosmolar Hyperglycemic State",
    "HbA1c": "Glycated Hemoglobin",
    "TSH": "Thyroid Stimulating Hormone",
    "BMI": "Body Mass Index",
    "MetS": "Metabolic Syndrome",
    # ========== RENAL/URINARY ==========
    "CKD": "Chronic Kidney Disease",
    "AKI": "Acute Kidney Injury",
    "ESRD": "End-Stage Renal Disease",
    "UTI": "Urinary Tract Infection",
    "BPH": "Benign Prostatic Hyperplasia",
    "PKD": "Polycystic Kidney Disease",
    "ARF": "Acute Renal Failure",
    "CRF": "Chronic Renal Failure",
    # ========== HEMATOLOGY/ONCOLOGY ==========
    "ALL": "Acute Lymphoblastic Leukemia",
    "AML": "Acute Myeloid Leukemia",
    "CLL": "Chronic Lymphocytic Leukemia",
    "CML": "Chronic Myeloid Leukemia",
    "NHL": "Non-Hodgkin Lymphoma",
    "HL": "Hodgkin Lymphoma",
    "MM": "Multiple Myeloma",
    "MDS": "Myelodysplastic Syndrome",
    "ITP": "Immune Thrombocytopenic Purpura",
    "DIC": "Disseminated Intravascular Coagulation",
    "HIT": "Heparin-Induced Thrombocytopenia",
    # ========== INFECTIOUS DISEASE ==========
    "HIV": "Human Immunodeficiency Virus",
    "AIDS": "Acquired Immunodeficiency Syndrome",
    "HCV": "Hepatitis C Virus",
    "HBV": "Hepatitis B Virus",
    "HAV": "Hepatitis A Virus",
    "HSV": "Herpes Simplex Virus",
    "CMV": "Cytomegalovirus",
    "EBV": "Epstein-Barr Virus",
    "MRSA": "Methicillin-Resistant Staphylococcus Aureus",
    "VRE": "Vancomycin-Resistant Enterococcus",
    "C diff": "Clostridioides difficile",
    "STI": "Sexually Transmitted Infection",
    "STD": "Sexually Transmitted Disease",
    # ========== RHEUMATOLOGY/IMMUNOLOGY ==========
    "RA": "Rheumatoid Arthritis",
    "OA": "Osteoarthritis",
    "SLE": "Systemic Lupus Erythematosus",
    "AS": "Ankylosing Spondylitis",
    # Psoriatic Arthritis is conventionally written "PsA"; the all-caps "PSA"
    # is reserved for Prostate-Specific Antigen (see LAB VALUES).
    "PsA": "Psoriatic Arthritis",
    "SS": "Sjogren's Syndrome",
    "MCTD": "Mixed Connective Tissue Disease",
    "GCA": "Giant Cell Arteritis",
    "PMR": "Polymyalgia Rheumatica",
    # ========== DIAGNOSTIC IMAGING ==========
    "CT": "Computed Tomography",
    "MRI": "Magnetic Resonance Imaging",
    "PET": "Positron Emission Tomography",
    "US": "Ultrasound",
    "CXR": "Chest X-Ray",
    "KUB": "Kidneys Ureters Bladder",
    "ERCP": "Endoscopic Retrograde Cholangiopancreatography",
    "EGD": "Esophagogastroduodenoscopy",
    "MRCP": "Magnetic Resonance Cholangiopancreatography",
    # ========== PROCEDURES ==========
    "CPR": "Cardiopulmonary Resuscitation",
    "EKG": "Electrocardiogram",
    "ECG": "Electrocardiogram",
    "EEG": "Electroencephalogram",
    "EMG": "Electromyography",
    "LP": "Lumbar Puncture",
    "I&D": "Incision and Drainage",
    "D&C": "Dilation and Curettage",
    "TURP": "Transurethral Resection of Prostate",
    # ========== MEDICATIONS/DRUG CLASSES ==========
    "NSAID": "Non-Steroidal Anti-Inflammatory Drug",
    "ACE": "Angiotensin-Converting Enzyme",
    "ARB": "Angiotensin Receptor Blocker",
    "CCB": "Calcium Channel Blocker",
    "BB": "Beta Blocker",
    "SSRI": "Selective Serotonin Reuptake Inhibitor",
    "SNRI": "Serotonin-Norepinephrine Reuptake Inhibitor",
    "TCA": "Tricyclic Antidepressant",
    "MAOI": "Monoamine Oxidase Inhibitor",
    "PPI": "Proton Pump Inhibitor",
    "H2RA": "Histamine-2 Receptor Antagonist",
    "LMWH": "Low Molecular Weight Heparin",
    "DOAC": "Direct Oral Anticoagulant",
    "DMARD": "Disease-Modifying Antirheumatic Drug",
    "TNF": "Tumor Necrosis Factor",
    "IV": "Intravenous",
    "IM": "Intramuscular",
    "SQ": "Subcutaneous",
    "PO": "Per Os (by mouth)",
    "PR": "Per Rectum",
    "SL": "Sublingual",
    # ========== VITAL SIGNS/MEASUREMENTS ==========
    "BP": "Blood Pressure",
    "HR": "Heart Rate",
    "RR": "Respiratory Rate",
    "SpO2": "Oxygen Saturation",
    "Temp": "Temperature",
    "BS": "Blood Sugar",
    "BG": "Blood Glucose",
    "ABG": "Arterial Blood Gas",
    "VBG": "Venous Blood Gas",
    # ========== LAB VALUES ==========
    # "LFT" and "TSH" are defined once, in the GASTROINTESTINAL and
    # ENDOCRINE/METABOLIC sections above.
    "CBC": "Complete Blood Count",
    "CMP": "Comprehensive Metabolic Panel",
    "BMP": "Basic Metabolic Panel",
    # "PT" can also mean Prothrombin Time; the "Physical Therapy" entry under
    # OTHER COMMON is the expansion in effect.
    "PTT": "Partial Thromboplastin Time",
    "INR": "International Normalized Ratio",
    "ESR": "Erythrocyte Sedimentation Rate",
    "CRP": "C-Reactive Protein",
    "BNP": "B-type Natriuretic Peptide",
    "Trop": "Troponin",
    "PSA": "Prostate-Specific Antigen",
    "T3": "Triiodothyronine",
    "T4": "Thyroxine",
    "WBC": "White Blood Cell",
    "RBC": "Red Blood Cell",
    "Hgb": "Hemoglobin",
    "Hct": "Hematocrit",
    "PLT": "Platelet",
    # ========== SYMPTOMS/FINDINGS ==========
    # "SOB" and "N/V" are defined once, in the RESPIRATORY and
    # GASTROINTESTINAL sections above.
    "CP": "Chest Pain",
    "HA": "Headache",
    "D/C": "Discontinue",
    "C/O": "Complains Of",
    "R/O": "Rule Out",
    "H/O": "History Of",
    "S/P": "Status Post",
    # ========== SPECIALTIES ==========
    "ED": "Emergency Department",
    "ER": "Emergency Room",
    "ICU": "Intensive Care Unit",
    "CCU": "Cardiac Care Unit",
    "NICU": "Neonatal Intensive Care Unit",
    "PICU": "Pediatric Intensive Care Unit",
    "OR": "Operating Room",
    "PACU": "Post-Anesthesia Care Unit",
    "OB": "Obstetrics",
    "GYN": "Gynecology",
    "ENT": "Ear Nose Throat",
    # ========== OTHER COMMON ==========
    "PRN": "As Needed",
    "QD": "Once Daily",
    "BID": "Twice Daily",
    "TID": "Three Times Daily",
    "QID": "Four Times Daily",
    "HS": "At Bedtime",
    "AC": "Before Meals",
    "PC": "After Meals",
    "NPO": "Nothing By Mouth",
    "DNR": "Do Not Resuscitate",
    "DNI": "Do Not Intubate",
    "AMA": "Against Medical Advice",
    "ADL": "Activities of Daily Living",
    "ROM": "Range of Motion",
    "PT": "Physical Therapy",
    "OT": "Occupational Therapy",
    "HPI": "History of Present Illness",
    "PMH": "Past Medical History",
    "PSH": "Past Surgical History",
    "FH": "Family History",
    "SH": "Social History",
}
def clean_description(description: str) -> str:
    """Remove a trailing semantic tag such as "(finding)" from a description.

    Only a parenthesized group that ends the text and is preceded by
    whitespace is removed; parentheses elsewhere in the text are left
    untouched.  Whitespace after the tag is tolerated, so "Fever (finding) "
    also yields "Fever".

    Args:
        description: Concept description, e.g. "Fever (finding)".

    Returns:
        The description without the trailing tag, stripped of surrounding
        whitespace.
    """
    # The optional whitespace before the end anchor lets the tag match when
    # the input carries trailing blanks; without it such input kept its tag
    # and only the blanks were stripped.
    return re.sub(r'\s\([^)]+\)\s*$', '', description).strip()
# =============================================================================
# FINAL, MOST ROBUST STANDARDIZATION FUNCTION (v1.4 - Context-Aware)
# =============================================================================
def batch_standardize_entities(entities: list, aws_client, max_workers: int = 3) -> dict:
"""
Standardize multiple entities using PARALLEL AWS Comprehend calls.
Uses ThreadPoolExecutor for concurrent API calls (4-8x speedup).