Skip to content

Commit cee20ba

Browse files
committed
mapping file and tsv intermediate changes
1 parent 195706d commit cee20ba

10 files changed

Lines changed: 1035 additions & 58 deletions

File tree

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ In `full` mode with multiple VCF inputs, failures are isolated per input:
8181
- `-r, --rules` mapping rules file (`.ttl`)
8282
- default: `rules/default_rules.ttl`
8383
- `-l, --rdf-layout {aggregate,batch}` required in full mode
84+
- `-P, --spark-partitions` optional Spark partition hint (positive integer)
85+
- low-cost way to reduce output part count by setting `spark.default.parallelism` and `spark.sql.shuffle.partitions`
8486
- `-k, --keep-tsv` keep hidden TSV intermediates
8587
- `-R, --keep-rdf` keep raw `.nt` after compression
8688
- `-e, --estimate-size` preflight size estimate
@@ -121,6 +123,18 @@ vcf-rdfizer \
121123
--rdf-layout batch \
122124
--compression hdt \
123125
--out ./results
126+
127+
Full pipeline with low-cost partition cap (helps avoid too many tiny batch files):
128+
129+
```bash
130+
vcf-rdfizer \
131+
--mode full \
132+
--input ./vcf_files \
133+
--rdf-layout batch \
134+
--spark-partitions 8 \
135+
--compression hdt \
136+
--out ./results
137+
```
124138
```
125139

126140
Full pipeline with custom rules + keep RDF:

rules/README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,17 @@ This directory contains RML mappings used by the conversion pipeline.
1111
- `/data/tsv/file_metadata.tsv`
1212
- `/data/tsv/header_lines.tsv`
1313
- `/data/tsv/records.tsv`
14+
- `/data/tsv/sample_calls.tsv`
15+
- `/data/tsv/sample_format_values.tsv`
16+
- `sample_calls.tsv` and `sample_format_values.tsv` are derived by the Python wrapper
17+
from `records.tsv` at runtime so FORMAT fields (e.g., `GT:DP:AD`) can be
18+
mapped to per-sample values consistently.
1419
- The Python wrapper rewrites these template paths per input VCF to:
1520
- `/data/tsv/<sample>.file_metadata.tsv`
1621
- `/data/tsv/<sample>.header_lines.tsv`
1722
- `/data/tsv/<sample>.records.tsv`
23+
- `/data/tsv/<sample>.sample_calls.tsv`
24+
- `/data/tsv/<sample>.sample_format_values.tsv`
1825

1926
## How To Create A Custom Mapping
2027

rules/default_rules.ttl

Lines changed: 150 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
# - /data/tsv/<sample>.file_metadata.tsv
1212
# - /data/tsv/<sample>.header_lines.tsv
1313
# - /data/tsv/<sample>.records.tsv
14+
# - /data/tsv/<sample>.sample_calls.tsv (derived by wrapper)
15+
# - /data/tsv/<sample>.sample_format_values.tsv (derived by wrapper)
1416
# The wrapper rewrites the template paths below for each input sample.
1517
#
1618
# Output format:
@@ -31,19 +33,12 @@
3133
rml:referenceFormulation ql:CSV
3234
] ;
3335
rr:subjectMap [
34-
rr:template "https://w3id.org/vcf-rdfizer/resource/vcf-file/{SOURCE_FILE}" ;
36+
rr:template "https://w3id.org/vcf-rdfizer/vcf/{SOURCE_FILE}" ;
3537
rr:class vcfr:VCFFile
3638
] ;
3739
rr:predicateObjectMap [
3840
rr:predicate vcfr:hasHeader ;
39-
rr:objectMap [ rr:template "https://w3id.org/vcf-rdfizer/resource/vcf-header/{SOURCE_FILE}" ]
40-
] ;
41-
rr:predicateObjectMap [
42-
rr:predicate vcfr:hasRecord ;
43-
rr:objectMap [
44-
rr:parentTriplesMap <#VCFRecordMap> ;
45-
rr:joinCondition [ rr:child "SOURCE_FILE" ; rr:parent "SOURCE_FILE" ]
46-
]
41+
rr:objectMap [ rr:template "https://w3id.org/vcf-rdfizer/vcf/{SOURCE_FILE}/header" ]
4742
] ;
4843
rr:predicateObjectMap [
4944
rr:predicate vcfr:fileFormat ;
@@ -73,14 +68,59 @@
7368
rml:referenceFormulation ql:CSV
7469
] ;
7570
rr:subjectMap [
76-
rr:template "https://w3id.org/vcf-rdfizer/resource/vcf-header/{SOURCE_FILE}" ;
71+
rr:template "https://w3id.org/vcf-rdfizer/vcf/{SOURCE_FILE}/header" ;
7772
rr:class vcfr:VCFHeader
7873
] ;
74+
.
75+
76+
# NOTE:
77+
# Use explicit link maps (template-based) instead of parentTriplesMap joins.
78+
# In distributed RMLStreamer execution, joins can duplicate parent map triples.
79+
<#VCFFileToRecordLinkMap>
80+
a rr:TriplesMap ;
81+
rml:logicalSource [
82+
rml:source [
83+
a csvw:Table ;
84+
csvw:url "/data/tsv/records.tsv" ;
85+
csvw:dialect [
86+
a csvw:Dialect ;
87+
csvw:delimiter "\t" ;
88+
csvw:headerRowCount 1
89+
]
90+
] ;
91+
rml:referenceFormulation ql:CSV
92+
] ;
93+
rr:subjectMap [
94+
rr:template "https://w3id.org/vcf-rdfizer/vcf/{SOURCE_FILE}"
95+
] ;
96+
rr:predicateObjectMap [
97+
rr:predicate vcfr:hasRecord ;
98+
rr:objectMap [
99+
rr:template "https://w3id.org/vcf-rdfizer/vcf/{SOURCE_FILE}/record/var{ROW_ID}"
100+
]
101+
] .
102+
103+
<#VCFHeaderToHeaderLineLinkMap>
104+
a rr:TriplesMap ;
105+
rml:logicalSource [
106+
rml:source [
107+
a csvw:Table ;
108+
csvw:url "/data/tsv/header_lines.tsv" ;
109+
csvw:dialect [
110+
a csvw:Dialect ;
111+
csvw:delimiter "\t" ;
112+
csvw:headerRowCount 1
113+
]
114+
] ;
115+
rml:referenceFormulation ql:CSV
116+
] ;
117+
rr:subjectMap [
118+
rr:template "https://w3id.org/vcf-rdfizer/vcf/{SOURCE_FILE}/header"
119+
] ;
79120
rr:predicateObjectMap [
80121
rr:predicate vcfr:hasHeaderLine ;
81122
rr:objectMap [
82-
rr:parentTriplesMap <#HeaderLineMap> ;
83-
rr:joinCondition [ rr:child "SOURCE_FILE" ; rr:parent "SOURCE_FILE" ]
123+
rr:template "https://w3id.org/vcf-rdfizer/vcf/{SOURCE_FILE}/header/line/{HEADER_INDEX}"
84124
]
85125
] .
86126

@@ -99,7 +139,7 @@
99139
rml:referenceFormulation ql:CSV
100140
] ;
101141
rr:subjectMap [
102-
rr:template "https://w3id.org/vcf-rdfizer/resource/header-line/{SOURCE_FILE}/{HEADER_INDEX}" ;
142+
rr:template "https://w3id.org/vcf-rdfizer/vcf/{SOURCE_FILE}/header/line/{HEADER_INDEX}" ;
103143
rr:class vcfr:HeaderLine
104144
] ;
105145
rr:predicateObjectMap [
@@ -126,7 +166,7 @@
126166
rml:referenceFormulation ql:CSV
127167
] ;
128168
rr:subjectMap [
129-
rr:template "https://w3id.org/vcf-rdfizer/resource/record/{SOURCE_FILE}/{ROW_ID}" ;
169+
rr:template "https://w3id.org/vcf-rdfizer/vcf/{SOURCE_FILE}/record/var{ROW_ID}" ;
130170
rr:class vcfr:VCFRecord
131171
] ;
132172
rr:predicateObjectMap [
@@ -151,7 +191,7 @@
151191
] ;
152192
rr:predicateObjectMap [
153193
rr:predicate vcfr:hasCall ;
154-
rr:objectMap [ rr:template "https://w3id.org/vcf-rdfizer/resource/call/{SOURCE_FILE}/{ROW_ID}" ]
194+
rr:objectMap [ rr:template "https://w3id.org/vcf-rdfizer/vcf/{SOURCE_FILE}/call/var{ROW_ID}" ]
155195
] .
156196

157197
<#VariantCallMap>
@@ -169,7 +209,7 @@
169209
rml:referenceFormulation ql:CSV
170210
] ;
171211
rr:subjectMap [
172-
rr:template "https://w3id.org/vcf-rdfizer/resource/call/{SOURCE_FILE}/{ROW_ID}" ;
212+
rr:template "https://w3id.org/vcf-rdfizer/vcf/{SOURCE_FILE}/call/var{ROW_ID}" ;
173213
rr:class vcfr:VariantCall
174214
] ;
175215
rr:predicateObjectMap [
@@ -184,3 +224,97 @@
184224
rr:predicate vcfr:formatRaw ;
185225
rr:objectMap [ rml:reference "FORMAT" ]
186226
] .
227+
228+
<#VariantCallToSampleLinkMap>
229+
a rr:TriplesMap ;
230+
rml:logicalSource [
231+
rml:source [
232+
a csvw:Table ;
233+
csvw:url "/data/tsv/sample_calls.tsv" ;
234+
csvw:dialect [
235+
a csvw:Dialect ;
236+
csvw:delimiter "\t" ;
237+
csvw:headerRowCount 1
238+
]
239+
] ;
240+
rml:referenceFormulation ql:CSV
241+
] ;
242+
rr:subjectMap [
243+
rr:template "https://w3id.org/vcf-rdfizer/vcf/{SOURCE_FILE}/call/var{ROW_ID}"
244+
] ;
245+
rr:predicateObjectMap [
246+
rr:predicate vcfr:hasSampleCall ;
247+
rr:objectMap [
248+
rr:template "https://w3id.org/vcf-rdfizer/vcf/{SOURCE_FILE}/sample/var{ROW_ID}/{SAMPLE_URI_ID}"
249+
]
250+
] .
251+
252+
<#SampleCallMap>
253+
a rr:TriplesMap ;
254+
rml:logicalSource [
255+
rml:source [
256+
a csvw:Table ;
257+
csvw:url "/data/tsv/sample_calls.tsv" ;
258+
csvw:dialect [
259+
a csvw:Dialect ;
260+
csvw:delimiter "\t" ;
261+
csvw:headerRowCount 1
262+
]
263+
] ;
264+
rml:referenceFormulation ql:CSV
265+
] ;
266+
rr:subjectMap [
267+
rr:template "https://w3id.org/vcf-rdfizer/vcf/{SOURCE_FILE}/sample/var{ROW_ID}/{SAMPLE_URI_ID}" ;
268+
rr:class vcfr:SampleCall
269+
] ;
270+
rr:predicateObjectMap [
271+
rr:predicate vcfr:sampleId ;
272+
rr:objectMap [ rml:reference "SAMPLE_ID" ]
273+
] .
274+
275+
<#SampleCallToFormatValueLinkMap>
276+
a rr:TriplesMap ;
277+
rml:logicalSource [
278+
rml:source [
279+
a csvw:Table ;
280+
csvw:url "/data/tsv/sample_format_values.tsv" ;
281+
csvw:dialect [
282+
a csvw:Dialect ;
283+
csvw:delimiter "\t" ;
284+
csvw:headerRowCount 1
285+
]
286+
] ;
287+
rml:referenceFormulation ql:CSV
288+
] ;
289+
rr:subjectMap [
290+
rr:template "https://w3id.org/vcf-rdfizer/vcf/{SOURCE_FILE}/sample/var{ROW_ID}/{SAMPLE_URI_ID}"
291+
] ;
292+
rr:predicateObjectMap [
293+
rr:predicate vcfr:hasFormatValue ;
294+
rr:objectMap [
295+
rr:template "https://w3id.org/vcf-rdfizer/vcf/{SOURCE_FILE}/sample/var{ROW_ID}/{SAMPLE_URI_ID}/fmt/{FORMAT_KEY}"
296+
]
297+
] .
298+
299+
<#FormatFieldValueMap>
300+
a rr:TriplesMap ;
301+
rml:logicalSource [
302+
rml:source [
303+
a csvw:Table ;
304+
csvw:url "/data/tsv/sample_format_values.tsv" ;
305+
csvw:dialect [
306+
a csvw:Dialect ;
307+
csvw:delimiter "\t" ;
308+
csvw:headerRowCount 1
309+
]
310+
] ;
311+
rml:referenceFormulation ql:CSV
312+
] ;
313+
rr:subjectMap [
314+
rr:template "https://w3id.org/vcf-rdfizer/vcf/{SOURCE_FILE}/sample/var{ROW_ID}/{SAMPLE_URI_ID}/fmt/{FORMAT_KEY}" ;
315+
rr:class vcfr:FormatFieldValue
316+
] ;
317+
rr:predicateObjectMap [
318+
rr:predicate vcfr:fieldValue ;
319+
rr:objectMap [ rml:reference "FORMAT_VALUE" ]
320+
] .

src/run_conversion.sh

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,42 @@ stat_size() {
7272
fi
7373
return
7474
fi
75+
76+
echo 0
77+
}
78+
79+
# Report comparable input VCF bytes.
80+
# - .vcf -> on-disk bytes
81+
# - .vcf.gz -> decompressed bytes
82+
# - dir -> sum of normalized sizes for contained .vcf/.vcf.gz files
83+
normalized_vcf_size() {
84+
local path="$1"
85+
local total=0
86+
87+
if [[ -f "$path" ]]; then
88+
if [[ "$path" == *.vcf.gz ]]; then
89+
gzip -dc "$path" | wc -c | tr -d ' '
90+
return
91+
fi
92+
stat_size "$path"
93+
return
94+
fi
95+
96+
if [[ -d "$path" ]]; then
97+
shopt -s nullglob
98+
for file in "$path"/*.vcf "$path"/*.vcf.gz; do
99+
if [[ ! -f "$file" ]]; then
100+
continue
101+
fi
102+
size=$(normalized_vcf_size "$file")
103+
total=$((total + size))
104+
done
105+
shopt -u nullglob
106+
echo "$total"
107+
return
108+
fi
109+
110+
echo 0
75111
}
76112

77113
have_gnu_time() { [[ -x /usr/bin/time ]] && /usr/bin/time --version >/dev/null 2>&1; }
@@ -140,7 +176,22 @@ JAVA_VERSION=$(java -version 2>&1 | head -n1 | sed 's/"/\\"/g')
140176
# or for Java 8: GC_OPTS="-Xloggc:$LOGDIR/gc-$RUN_ID.log -XX:+PrintGCDetails -XX:+PrintGCDateStamps"
141177
GC_OPTS=${GC_OPTS:-}
142178

143-
JAVA_CMD=(java -jar "$JAR" toFile -m "$IN" -o "$OUT_DIR/$OUT_NAME")
179+
# Optional low-cost Spark partition hint for RMLStreamer execution.
180+
# When set, this caps Spark default parallelism and shuffle partitions to
181+
# reduce tiny output-part overproduction without introducing expensive
182+
# repartition/shuffle stages in the pipeline.
183+
SPARK_PARTITIONS=${SPARK_PARTITIONS:-}
184+
JAVA_SPARK_OPTS=()
185+
if [[ -n "$SPARK_PARTITIONS" ]]; then
186+
if [[ "$SPARK_PARTITIONS" =~ ^[1-9][0-9]*$ ]]; then
187+
JAVA_SPARK_OPTS+=("-Dspark.default.parallelism=$SPARK_PARTITIONS")
188+
JAVA_SPARK_OPTS+=("-Dspark.sql.shuffle.partitions=$SPARK_PARTITIONS")
189+
else
190+
echo "WARNING: ignoring invalid SPARK_PARTITIONS='$SPARK_PARTITIONS' (expected positive integer)." >&2
191+
fi
192+
fi
193+
194+
JAVA_CMD=(java "${JAVA_SPARK_OPTS[@]}" -jar "$JAR" toFile -m "$IN" -o "$OUT_DIR/$OUT_NAME")
144195

145196
# Ensure repeated runs with the same OUT_NAME do not accumulate old artifacts.
146197
if [[ -d "$OUT_DIR/$OUT_NAME" ]]; then
@@ -149,7 +200,7 @@ fi
149200

150201
# ---------- Pre-run ----------
151202
IN_SIZE=$(stat_size "$IN")
152-
VCF_SIZE=$(stat_size "$IN_VCF")
203+
VCF_SIZE=$(normalized_vcf_size "$IN_VCF")
153204

154205
# ---------- Run RMLStreamer with timing ----------
155206
EXIT_CODE=0

0 commit comments

Comments (0)