#!/usr/bin/env python3
"""Small Polars recipe for Generated Gallery JSONL.GZ exports.

Usage:
  python examples/polars_recipe.py
  python examples/polars_recipe.py https://generatedgallery.com/index/generated-gallery.prompts.jsonl.gz cinematic 5 10000

Positional arguments (all optional, in this order):
  location     URL or local path of a JSONL / JSONL.GZ export
  label        style, subject, or model family to print sample rows for
  sample_size  number of sample rows to print
  limit        maximum number of records to load
"""
import gzip
import io
import json
import sys
import urllib.request
from pathlib import Path

import polars as pl

DEFAULT_URL = "https://generatedgallery.com/index/generated-gallery.prompts.jsonl.gz"
DEFAULT_LABEL = "cinematic"
DEFAULT_SAMPLE_SIZE = 5
DEFAULT_LIMIT = 10000

# First two bytes of every gzip stream; used to recognise compressed payloads
# served from URLs that do not end in ".gz".
GZIP_MAGIC = b"\x1f\x8b"

# Dtypes pinned for every column except "id" (whose JSON type is left to
# inference). Pinning keeps list columns usable with `.list.contains(<str>)`
# even when every loaded list is empty or every scalar is null.
COLUMN_DTYPES = {
    "styles": pl.List(pl.Utf8),
    "subjects": pl.List(pl.Utf8),
    "model_family": pl.Utf8,
    "prompt": pl.Utf8,
    "source_url": pl.Utf8,
}


def open_stream(location: str) -> io.TextIOWrapper:
    """Open *location* as a UTF-8 text stream, decompressing gzip when needed.

    Args:
        location: An ``http://`` / ``https://`` URL, or a local file path.

    Returns:
        A text-mode file object; the caller is responsible for closing it
        (it works as a context manager).

    Remote payloads are read fully into memory before being wrapped. They are
    treated as gzip when the URL ends in ``.gz`` or when the body starts with
    the gzip magic bytes. Local files are treated as gzip when their suffix is
    ``.gz``.
    """
    if location.startswith(("http://", "https://")):
        request = urllib.request.Request(
            location,
            headers={"User-Agent": "generatedgallery-polars-recipe/1.0"},
        )
        with urllib.request.urlopen(request, timeout=60) as response:
            raw = response.read()
        buffer = io.BytesIO(raw)
        # The suffix check alone misses compressed bodies behind URLs with a
        # query string or a rewritten path, so also sniff the payload itself.
        if location.endswith(".gz") or raw.startswith(GZIP_MAGIC):
            return io.TextIOWrapper(gzip.GzipFile(fileobj=buffer), encoding="utf-8")
        return io.TextIOWrapper(buffer, encoding="utf-8")

    path = Path(location)
    if path.suffix == ".gz":
        return gzip.open(path, "rt", encoding="utf-8")
    return path.open("r", encoding="utf-8")


def _section(record: dict, key: str) -> dict:
    """Return ``record[key]`` when it is a JSON object, otherwise an empty dict."""
    value = record.get(key)
    return value if isinstance(value, dict) else {}


def _empty_frame() -> pl.DataFrame:
    """Return a zero-row frame that still has every column the printers read."""
    return pl.DataFrame(schema={"id": pl.Utf8, **COLUMN_DTYPES})


def load_rows(location: str, limit: int) -> pl.DataFrame:
    """Load up to *limit* JSONL records from *location* into a flat DataFrame.

    Args:
        location: URL or local path understood by ``open_stream``.
        limit: Maximum number of records to load; values <= 0 load nothing.

    Returns:
        A DataFrame with the columns ``id``, ``styles``, ``subjects``,
        ``model_family``, ``prompt`` and ``source_url``. The columns are
        present even when no record was loaded.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if limit <= 0:
        return _empty_frame()

    rows = []
    with open_stream(location) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            # A key that is present but null must behave like a missing key.
            labels = _section(record, "labels")
            generation = _section(record, "generation")
            source = _section(record, "source")
            rows.append({
                "id": record.get("id"),
                "styles": labels.get("styles") or [],
                "subjects": labels.get("subjects") or [],
                "model_family": labels.get("model_family"),
                "prompt": generation.get("prompt") or "",
                "source_url": source.get("url") or record.get("url"),
            })
            if len(rows) >= limit:
                break

    if not rows:
        return _empty_frame()
    # Scan every row for inference: by default only a leading window of rows
    # is inspected, which can mis-type columns whose first values are null.
    return pl.DataFrame(
        rows,
        schema_overrides=COLUMN_DTYPES,
        infer_schema_length=None,
    )


def print_list_top(df: pl.DataFrame, field: str, label: str) -> None:
    """Print the ten most frequent values found in the list column *field*.

    Args:
        df: Frame produced by ``load_rows``.
        field: Name of a list-typed column to explode and count.
        label: Human-readable name used in the printed heading.
    """
    top = (
        df.select(pl.col(field).explode().drop_nulls().alias("value"))
        .group_by("value")
        .len()
        # Secondary key makes the order of equally frequent values stable.
        .sort(["len", "value"], descending=[True, False])
        .head(10)
    )
    print(f"\nTop {label}:")
    for row in top.iter_rows(named=True):
        print(f"- {row['value']}: {row['len']}")


def print_model_top(df: pl.DataFrame) -> None:
    """Print the ten most frequent non-null ``model_family`` values in *df*."""
    top = (
        df.select(pl.col("model_family"))
        .drop_nulls()
        .group_by("model_family")
        .len()
        # Secondary key makes the order of equally frequent values stable.
        .sort(["len", "model_family"], descending=[True, False])
        .head(10)
    )
    print("\nTop model families:")
    for row in top.iter_rows(named=True):
        print(f"- {row['model_family']}: {row['len']}")


def print_samples(df: pl.DataFrame, label: str, sample_size: int) -> None:
    """Print up to *sample_size* rows whose style, subject or model matches *label*.

    Args:
        df: Frame produced by ``load_rows``.
        label: Value to look for in ``styles``, ``subjects`` or ``model_family``.
        sample_size: Maximum number of rows to print; values <= 0 print none.
    """
    # `head` interprets a negative count relative to the end of the frame,
    # so clamp to zero to keep "no samples" meaning exactly that.
    wanted = max(sample_size, 0)
    sample = (
        df.filter(
            pl.col("styles").list.contains(label)
            | pl.col("subjects").list.contains(label)
            | (pl.col("model_family") == label)
        )
        .select("prompt", "source_url")
        .head(wanted)
    )
    print(f"\nSample rows for label '{label}':")
    if sample.height == 0:
        print("- no rows matched")
        return
    for row in sample.iter_rows(named=True):
        prompt = row["prompt"].replace("\n", " ").strip()
        # Keep each sample on one reasonably short console line.
        if len(prompt) > 180:
            prompt = f"{prompt[:177]}..."
        print(f"- prompt: {prompt}")
        print(f"  source: {row['source_url']}")


def main() -> None:
    """Read the optional positional arguments, load the export, print summaries."""
    location = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_URL
    label = sys.argv[2] if len(sys.argv) > 2 else DEFAULT_LABEL
    sample_size = int(sys.argv[3]) if len(sys.argv) > 3 else DEFAULT_SAMPLE_SIZE
    limit = int(sys.argv[4]) if len(sys.argv) > 4 else DEFAULT_LIMIT

    df = load_rows(location, limit=limit)
    print(f"Loaded {df.height} rows from: {location}")
    print_list_top(df, "styles", "styles")
    print_list_top(df, "subjects", "subjects")
    print_model_top(df)
    print_samples(df, label=label, sample_size=sample_size)
    print("\nMedia rights note: upstream creators/platforms keep rights to source media. Verify rights before reuse.")


if __name__ == "__main__":
    main()