diff --git a/modelseedpy/core/msgenome.py b/modelseedpy/core/msgenome.py index 7b75173..c423427 100644 --- a/modelseedpy/core/msgenome.py +++ b/modelseedpy/core/msgenome.py @@ -82,6 +82,27 @@ def parse_fasta_str(faa_str, split=DEFAULT_SPLIT, h_func=None): return features +def read_gbff_records_from_file(filename: str): + if filename.endswith(".gbff"): + with open(filename, "r") as fh: + return read_gbff_records(fh) + elif filename.endswith(".gz"): + import gzip + from io import StringIO + + with gzip.open(filename, "rb") as fh: + return read_gbff_records(StringIO(fh.read().decode("utf-8"))) + + +def read_gbff_records(handler): + from Bio import SeqIO + + gbff_records = [] + for record in SeqIO.parse(handler, "gb"): + gbff_records.append(record) + return gbff_records + + def extract_features(faa_str, split=DEFAULT_SPLIT, h_func=None): features = [] active_seq = None @@ -168,6 +189,45 @@ def from_fasta(filename, split=" ", h_func=None): genome.features += read_fasta2(filename, split, h_func) return genome + @staticmethod + def from_gbff_sequence(filename): + gbff_records = read_gbff_records_from_file(filename) + genome = MSGenome() + features = [] + for rec in gbff_records: + feature = MSFeature(rec.id, str(rec.seq), description=rec.description) + features.append(feature) + genome.features += features + return genome + + @staticmethod + def from_gbff_features( + filename, feature_id_qualifier="protein_id", description_qualifier="product" + ): + gbff_records = read_gbff_records_from_file(filename) + genome = MSGenome() + features = [] + for rec in gbff_records: + for f in rec.features: + if f.type == "CDS": + translations = f.qualifiers.get("translation", []) + if len(translations) == 1: + feature_id = f.qualifiers.get(feature_id_qualifier, [None])[0] + description = f.qualifiers.get(description_qualifier, [None])[0] + if feature_id: + feature = MSFeature( + feature_id, translations[0], description=description + ) + features.append(feature) + else: + logger.warning( + f"skip feature: unable to fetch id from qualifier {feature_id_qualifier}" + ) + elif len(translations) > 1: + logger.warning(f"skip feature: with multiple sequences {f}") + genome.features += features + return genome + def to_fasta(self, filename, l=80, fn_header=None): to_fasta(self.features, filename, l, fn_header) return filename @@ -203,3 +263,85 @@ def _repr_html_(self): {len(self.features)} """ + + +class GenomeGff(MSGenome): + def __init__(self, contigs): + self.contigs = contigs + super().__init__() + + @staticmethod + def read_sequence(feature_id, gff_record, expected_sequence, contigs): + from Bio.Seq import Seq + from Bio import Align + + protein_seq_cds = expected_sequence + feature_contig = contigs.features.get_by_id(gff_record.contig_id) + seq = Seq(feature_contig.seq[gff_record.start - 1 : gff_record.end]) + if gff_record.strand == "-": + seq = seq.reverse_complement() + seq_from_dna = str(seq.translate()) + if len(seq_from_dna) > 0 and seq_from_dna[-1] == "*": + seq_from_dna = seq_from_dna[:-1] + if len(protein_seq_cds) > 0 and protein_seq_cds[-1] == "*": + protein_seq_cds = protein_seq_cds[:-1] + eq = protein_seq_cds == seq_from_dna + + score = None + if not eq and len(seq_from_dna) > 0: + try: + aligner = Align.PairwiseAligner() + res = aligner.align(protein_seq_cds, seq_from_dna) + score = res.score + except ValueError as ex: + print("error", gff_record) + raise ex + + feature = MSFeature(feature_id, protein_seq_cds) + feature.description = f"score: {score}" + feature.gff = gff_record + return feature + + @staticmethod + def from_fna_faa_gff( + filename_fna, filename_faa, filename_gff, _fn_get_id, prodigal=False + ): + genome_gff_features = _read_gff_features(filename_gff) + genome_faa = MSGenome.from_fasta(filename_faa) + contigs = MSGenome.from_fasta(filename_fna) + + feature_lookup = {} + if prodigal: + for feature in genome_faa.features: + attr = dict( + x.split("=") + for x in feature.description.split(" # ")[-1].split(";") + ) + if attr["ID"] not in feature_lookup: + feature_lookup[attr["ID"]] = feature + else: + raise ValueError("") + else: + feature_lookup = {feature.id: feature for feature in genome_faa.features} + + features = [] + for gff_record in genome_gff_features: + if gff_record.feature_type == "CDS": + feature_id = gff_record.attr.get("ID") + if _fn_get_id: + feature_id = _fn_get_id(gff_record) + + feature_cds = feature_lookup.get(feature_id) + + if feature_cds: + protein_seq_cds = feature_cds.seq + f = GenomeGff.read_sequence( + feature_id, gff_record, protein_seq_cds, contigs + ) + features.append(f) + else: + print(f"not found {feature_id}") + + genome = GenomeGff(contigs) + genome.features += features + return genome diff --git a/modelseedpy/core/mstemplate.py b/modelseedpy/core/mstemplate.py index 6bdc27a..cffc610 100644 --- a/modelseedpy/core/mstemplate.py +++ b/modelseedpy/core/mstemplate.py @@ -646,7 +646,7 @@ def from_dict(d, template): d.get("pigment", 0), d.get("carbohydrate", 0), d.get("energy", 0), - d.get("other", 0) + d.get("other", 0), ) for item in d["templateBiomassComponents"]: biocomp = MSTemplateBiomassComponent.from_dict(item, template) diff --git a/modelseedpy/core/rast_client.py b/modelseedpy/core/rast_client.py index 97211ad..cc36b7b 100644 --- a/modelseedpy/core/rast_client.py +++ b/modelseedpy/core/rast_client.py @@ -60,7 +60,7 @@ def __init__(self): }, ] - def annotate_genome(self, genome): + def annotate_genome(self, genome, split_terms=True): p_features = [] for f in genome.features: if f.seq and len(f.seq) > 0: @@ -70,9 +70,13 @@ def annotate_genome(self, genome): for o in res[0]["features"]: feature = genome.features.get_by_id(o["id"]) if "function" in o: - functions = re.split("; | / | @", o["function"]) - for function in functions: - feature.add_ontology_term("RAST", function) + rast_function = o["function"] + if split_terms: + functions = re.split("; | / | @", rast_function) + for function in functions: + feature.add_ontology_term("RAST", function) + else: + feature.add_ontology_term("RAST", rast_function) return res[0]["analysis_events"] diff --git a/pyproject.toml b/pyproject.toml index 0ed5854..8e0e52d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta" [tool.black] line-length = 88 -python-version = ['py36'] +python-version = ['py38'] include = '\.pyi?$' exclude = ''' (