From 7512e158ec1bec80b9f3b11b6fe27653f13a5dcd Mon Sep 17 00:00:00 2001 From: kelle Date: Thu, 22 Feb 2024 13:11:36 -0500 Subject: [PATCH 1/4] good start! --- practice.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 practice.py diff --git a/practice.py b/practice.py new file mode 100644 index 000000000..2c96181d9 --- /dev/null +++ b/practice.py @@ -0,0 +1,48 @@ +import logging +from pathlib import Path +from astropy.io import fits +from astrodb_scripts import load_astrodb, find_source_in_db +from simple.schema import * +from simple.utils.spectra import ingest_spectrum + +SAVE_DB = False # save the data files in addition to modifying the .db file +RECREATE_DB = False # recreates the .db file from the data files + +logger = logging.getLogger("AstroDB") +logger.setLevel(logging.INFO) + +db = load_astrodb("SIMPLE.sqlite", recreatedb=RECREATE_DB) + +data_directory = "/Users/kelle/Desktop/processed data set" +logging.info(f"Data directory: {data_directory}") + +fits_files = Path(data_directory).glob("*.fits") + +skipped = [] + +for file in fits_files: + msg = f"\n Processing {file}" + print(msg) + # logging.info(f"Processing {file}") + hdr = fits.getheader(file) + + # Check if source is in the database + matches = find_source_in_db(db, hdr["OBJECT"], ra=hdr["RA"], dec=hdr["DEC"]) + if len(matches) == 0: + skipped.append(file.name) + msg = f"Source {hdr['OBJECT']} not found in the database. Skipping." + print(msg) + # logging.info(f"Source {hdr['OBJECT']} not found in the database. Skipping.") + continue + elif len(matches) > 1: + skipped.append(file) + msg = f"Multiple matches found for {hdr['OBJECT']}. Skipping." + print(msg) + continue + elif len(matches) == 1: + source = matches[0] + msg = f"Source {hdr['OBJECT']} found in the database as {source}." + print(msg) + # logging.info(f"Source {hdr['OBJECT']} found in the database as {source}.") + +print(f"Skipped {len(skipped)} files: \n {skipped}") From 7a33b7aa5a797f55ae78f517d87bdb019fd05e76 Mon Sep 17 00:00:00 2001 From: kelle Date: Thu, 22 Feb 2024 16:43:29 -0500 Subject: [PATCH 2/4] Logger fixed. file renamed. --- practice.py | 48 -------- scripts/ingests/pyspextool/practice_ingest.py | 114 ++++++++++++++++++ 2 files changed, 114 insertions(+), 48 deletions(-) delete mode 100644 practice.py create mode 100644 scripts/ingests/pyspextool/practice_ingest.py diff --git a/practice.py b/practice.py deleted file mode 100644 index 2c96181d9..000000000 --- a/practice.py +++ /dev/null @@ -1,48 +0,0 @@ -import logging -from pathlib import Path -from astropy.io import fits -from astrodb_scripts import load_astrodb, find_source_in_db -from simple.schema import * -from simple.utils.spectra import ingest_spectrum - -SAVE_DB = False # save the data files in addition to modifying the .db file -RECREATE_DB = False # recreates the .db file from the data files - -logger = logging.getLogger("AstroDB") -logger.setLevel(logging.INFO) - -db = load_astrodb("SIMPLE.sqlite", recreatedb=RECREATE_DB) - -data_directory = "/Users/kelle/Desktop/processed data set" -logging.info(f"Data directory: {data_directory}") - -fits_files = Path(data_directory).glob("*.fits") - -skipped = [] - -for file in fits_files: - msg = f"\n Processing {file}" - print(msg) - # logging.info(f"Processing {file}") - hdr = fits.getheader(file) - - # Check if source is in the database - matches = find_source_in_db(db, hdr["OBJECT"], ra=hdr["RA"], dec=hdr["DEC"]) - if len(matches) == 0: - skipped.append(file.name) - msg = f"Source {hdr['OBJECT']} not found in the database. Skipping." - print(msg) - # logging.info(f"Source {hdr['OBJECT']} not found in the database. Skipping.") - continue - elif len(matches) > 1: - skipped.append(file) - msg = f"Multiple matches found for {hdr['OBJECT']}. Skipping." - print(msg) - continue - elif len(matches) == 1: - source = matches[0] - msg = f"Source {hdr['OBJECT']} found in the database as {source}." - print(msg) - # logging.info(f"Source {hdr['OBJECT']} found in the database as {source}.") - -print(f"Skipped {len(skipped)} files: \n {skipped}") diff --git a/scripts/ingests/pyspextool/practice_ingest.py b/scripts/ingests/pyspextool/practice_ingest.py new file mode 100644 index 000000000..391273638 --- /dev/null +++ b/scripts/ingests/pyspextool/practice_ingest.py @@ -0,0 +1,114 @@ +""" +Script for ingesting IRTF SpeX spectra processed by pyspextool into the SIMPLE database. + +Expects spectra files to be uploaded to the S3 bucket in the following structure: +https://bdnyc.s3.amazonaws.com/SpeX/pyspextool/{filename} +""" + +import logging +from pathlib import Path +from astropy.io import fits +from astrodb_scripts import load_astrodb, find_source_in_db, AstroDBError +from simple.schema import * +from simple.utils.spectra import ingest_spectrum, spectrum_plottable + +SAVE_DB = False # save the data files in addition to modifying the .db file +RECREATE_DB = False # recreates the .db file from the data files + +logger = logging.getLogger("AstroDB") +logger.setLevel(logging.INFO) + +db = load_astrodb("SIMPLE.sqlite", recreatedb=RECREATE_DB) + +data_directory = "/Users/kelle/Desktop/processed data set" +logger.info(f"Data directory: {data_directory}") + +fits_files = Path(data_directory).glob("*.fits") + +total_files = 0 +skipped = [] + +for file in fits_files: + total_files += 1 + msg = f"\n Processing {file}" + logger.info(f"Processing {file}") + hdr = fits.getheader(file) + + # Check if source is in the database + matches = find_source_in_db(db, hdr["OBJECT"], ra=hdr["RA"], dec=hdr["DEC"]) + if len(matches) == 0: + skipped.append(file.name) + msg = f"Source {hdr['OBJECT']} not found in the database. Skipping." + logger.info(msg) + continue # exit loop and go to next file + elif len(matches) > 1: + skipped.append(file.name) + msg = f"Multiple matches found for {hdr['OBJECT']}. Skipping." + print(msg) + continue # exit loop and go to next file + elif len(matches) == 1: + source = matches[0] + msg = f"Source {hdr['OBJECT']} found in the database as {source}." + print(msg) + logging.info(msg) + + # Source is in the database, get other needed keywords from the header + regime = "nir" + + if hdr["TELESCOP"] == "NASA IRTF": + telescope = "IRTF" + else: + skipped.append(file.name) + msg = f"Telescope {hdr['TELESCOP']} not expected. Skipping." + print(msg) + continue + + if hdr["INSTR"] == "SpeX": + instrument = "SpeX" + else: + skipped.append(file.name) + msg = f"Instrument {hdr['INSTRUME']} not expected. Skipping." + print(msg) + continue + + if hdr["MODE"] == "Prism": + mode = "Prism" + elif hdr["MODE"] == "SXD": + mode = "SXD" + else: + skipped.append(file.name) + msg = f"Mode {hdr['MODE']} not expected. Skipping." + print(msg) + + obs_date = hdr["AVE_DATE"] + + reference = "Missing" + + other_references = f"{hdr['PROG_ID']}: {hdr['OBSERVER']}" + + spectrum = f"https://bdnyc.s3.amazonaws.com/SpeX/pyspextool/{file.name}" + + # check if the spectrum is plottable + try: + plottable = spectrum_plottable(spectrum, raise_error=True) + except AstroDBError as e: + skipped.append(file.name) + logger.info(f"Spectrum not plottable. Skipping {file.name}") + logger.debug(e) + continue + + # Ingest the spectrum + ingest_spectrum( + db, + source=source, + spectrum=spectrum, + regime=regime, + obs_date=obs_date, + telescope=telescope, + instrument=instrument, + mode=mode, + reference=reference, + other_references=other_references, + ) + +print(f"Skipped {len(skipped)} out of {total_files} files: \n {skipped}") From 3d97fd9bdc42a5c3dacafa959f544d59184937c8 Mon Sep 17 00:00:00 2001 From: kelle Date: Thu, 22 Feb 2024 16:59:18 -0500 Subject: [PATCH 3/4] polish --- scripts/ingests/pyspextool/practice_ingest.py | 64 ++++++++++++------- 1 file changed, 42 insertions(+), 22 deletions(-) diff --git a/scripts/ingests/pyspextool/practice_ingest.py b/scripts/ingests/pyspextool/practice_ingest.py index 391273638..ea63e1ba5 100644 --- a/scripts/ingests/pyspextool/practice_ingest.py +++ b/scripts/ingests/pyspextool/practice_ingest.py @@ -16,7 +16,7 @@ RECREATE_DB = False # recreates the .db file from the data files logger = logging.getLogger("AstroDB") -logger.setLevel(logging.INFO) +logger.setLevel(logging.WARNING) db = load_astrodb("SIMPLE.sqlite", recreatedb=RECREATE_DB) @@ -26,6 +26,7 @@ fits_files = Path(data_directory).glob("*.fits") total_files = 0 +ingested = [] skipped = [] for file in fits_files: @@ -39,12 +40,12 @@ if len(matches) == 0: skipped.append(file.name) msg = f"Source {hdr['OBJECT']} not found in the database. Skipping." - logger.info(msg) + logger.warning(msg) continue # exit loop and go to next file elif len(matches) > 1: skipped.append(file.name) msg = f"Multiple matches found for {hdr['OBJECT']}. Skipping." - print(msg) + logger.warning(msg) continue # exit loop and go to next file elif len(matches) == 1: source = matches[0] @@ -59,16 +60,16 @@ telescope = "IRTF" else: skipped.append(file.name) - msg = f"Telescope {hdr['TELESCOP']} not expected. Skipping." - print(msg) + msg = f"Telescope {hdr['TELESCOP']} not expected. Skipping. Expected NASA IRTF." + logger.warning(msg) continue if hdr["INSTR"] == "SpeX": instrument = "SpeX" else: skipped.append(file.name) - msg = f"Instrument {hdr['INSTRUME']} not expected. Skipping." - print(msg) + msg = f"Instrument {hdr['INSTRUME']} not expected. Skipping. Expected SpeX." + logger.warning(msg) continue if hdr["MODE"] == "Prism": @@ -77,8 +78,8 @@ mode = "SXD" else: skipped.append(file.name) - msg = f"Mode {hdr['MODE']} not expected. Skipping." - print(msg) + msg = f"Mode {hdr['MODE']} not expected. Skipping. Expected Prism or SXD." + logger.warning(msg) obs_date = hdr["AVE_DATE"] @@ -93,22 +94,41 @@ plottable = spectrum_plottable(spectrum, raise_error=True) except AstroDBError as e: skipped.append(file.name) - logger.info(f"Spectrum not plottable. Skipping {file.name}") + logger.warning(f"Spectrum not plottable. Skipping {file.name}") logger.debug(e) continue # Ingest the spectrum - ingest_spectrum( - db, - source=source, - spectrum=spectrum, - regime=regime, - obs_date=obs_date, - telescope=telescope, - instrument=instrument, - mode=mode, - reference=reference, - other_references=other_references, + try: + ingest_spectrum( + db, + source=source, + spectrum=spectrum, + regime=regime, + obs_date=obs_date, + telescope=telescope, + instrument=instrument, + mode=mode, + reference=reference, + other_references=other_references, + ) + ingested.append(file.name) + except AstroDBError as e: + skipped.append(file.name) + logger.warning(f"Error ingesting {file.name}. Skipping.") + logger.debug(e) + continue + +if len(ingested) + len(skipped) != total_files: + msg = ( + f"Some files were not ingested or skipped. \n" + f"n_ingested = {len(ingested)}, n_skipped = {len(skipped)}, total_files = {total_files}" ) + logger.error(msg) + raise AstroDBError + +if len(skipped) == 0: + logger.info(f"Ingested {len(ingested)} out of {total_files} files.") -print(f"Skipped {len(skipped)} out of {total_files} files: \n {skipped}") +if len(skipped) > 0: + logger.warning(f"Skipped {len(skipped)} out of {total_files} files: \n {skipped}") From b903bcb29bda394613541a5cfb5b05bb3cbacc2c Mon Sep 17 00:00:00 2001 From: kelle Date: Mon, 13 May 2024 14:13:17 -0400 Subject: [PATCH 4/4] added TODOs to the script. --- scripts/ingests/pyspextool/practice_ingest.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/ingests/pyspextool/practice_ingest.py b/scripts/ingests/pyspextool/practice_ingest.py index ea63e1ba5..a4b34cee0 100644 --- a/scripts/ingests/pyspextool/practice_ingest.py +++ b/scripts/ingests/pyspextool/practice_ingest.py @@ -10,6 +10,7 @@ from astropy.io import fits from astrodb_scripts import load_astrodb, find_source_in_db, AstroDBError from simple.schema import * +from simple.schema import REFERENCE_TABLES from simple.utils.spectra import ingest_spectrum, spectrum_plottable SAVE_DB = False # save the data files in addition to modifying the .db file @@ -18,12 +19,13 @@ logger = logging.getLogger("AstroDB") logger.setLevel(logging.WARNING) -db = load_astrodb("SIMPLE.sqlite", recreatedb=RECREATE_DB) +db = load_astrodb("SIMPLE.sqlite", recreatedb=RECREATE_DB, reference_tables=REFERENCE_TABLES) -data_directory = "/Users/kelle/Desktop/processed data set" +data_directory = "/Users/kelle/Desktop/processed data set" #proc directory logger.info(f"Data directory: {data_directory}") -fits_files = Path(data_directory).glob("*.fits") +#TODO - only ingest files renamed and converted to Spectrum1D fits +fits_files = Path(data_directory).glob("calspec*.fits") total_files = 0 ingested = [] @@ -83,7 +85,8 @@ obs_date = hdr["AVE_DATE"] - reference = "Missing" + #TODO: make a reference for the pyspextool team + reference = "Missing" other_references = f"{hdr['PROG_ID']}: {hdr['OBSERVER']}" @@ -99,6 +102,7 @@ continue # Ingest the spectrum + # TODO - ingest the new way. try: ingest_spectrum( db,