diff --git a/modelseedpy/core/msgenome.py b/modelseedpy/core/msgenome.py
index 7b75173..c423427 100644
--- a/modelseedpy/core/msgenome.py
+++ b/modelseedpy/core/msgenome.py
@@ -82,6 +82,27 @@ def parse_fasta_str(faa_str, split=DEFAULT_SPLIT, h_func=None):
return features
+def read_gbff_records_from_file(filename: str):
+ if filename.endswith(".gbff"):
+ with open(filename, "r") as fh:
+ return read_gbff_records(fh)
+ elif filename.endswith(".gz"):
+ import gzip
+ from io import StringIO
+
+ with gzip.open(filename, "rb") as fh:
+ return read_gbff_records(StringIO(fh.read().decode("utf-8")))
+
+
+def read_gbff_records(handler):
+ from Bio import SeqIO
+
+ gbff_records = []
+ for record in SeqIO.parse(handler, "gb"):
+ gbff_records.append(record)
+ return gbff_records
+
+
def extract_features(faa_str, split=DEFAULT_SPLIT, h_func=None):
features = []
active_seq = None
@@ -168,6 +189,45 @@ def from_fasta(filename, split=" ", h_func=None):
genome.features += read_fasta2(filename, split, h_func)
return genome
+ @staticmethod
+ def from_gbff_sequence(filename):
+ gbff_records = read_gbff_records_from_file(filename)
+ genome = MSGenome()
+ features = []
+ for rec in gbff_records:
+ feature = MSFeature(rec.id, str(rec.seq), description=rec.description)
+ features.append(feature)
+ genome.features += features
+ return genome
+
+ @staticmethod
+ def from_gbff_features(
+ filename, feature_id_qualifier="protein_id", description_qualifier="product"
+ ):
+ gbff_records = read_gbff_records_from_file(filename)
+ genome = MSGenome()
+ features = []
+ for rec in gbff_records:
+ for f in rec.features:
+ if f.type == "CDS":
+ translations = f.qualifiers.get("translation", [])
+ if len(translations) == 1:
+ feature_id = f.qualifiers.get(feature_id_qualifier, [None])[0]
+ description = f.qualifiers.get(description_qualifier, [None])[0]
+ if feature_id:
+ feature = MSFeature(
+ feature_id, translations[0], description=description
+ )
+ features.append(feature)
+ else:
+ logger.warning(
+ f"skip feature: unable to fetch id from qualifier {feature_id_qualifier}"
+ )
+ elif len(translations) > 1:
+ logger.warning(f"skip feature: with multiple sequences {f}")
+ genome.features += features
+ return genome
+
def to_fasta(self, filename, l=80, fn_header=None):
to_fasta(self.features, filename, l, fn_header)
return filename
@@ -203,3 +263,85 @@ def _repr_html_(self):
{len(self.features)} |
"""
+
+
+class GenomeGff(MSGenome):
+ def __init__(self, contigs):
+ self.contigs = contigs
+ super().__init__()
+
+ @staticmethod
+ def read_sequence(feature_id, gff_record, expected_sequence, contigs):
+ from Bio.Seq import Seq
+ from Bio import Align
+
+ protein_seq_cds = expected_sequence
+ feature_contig = contigs.features.get_by_id(gff_record.contig_id)
+ seq = Seq(feature_contig.seq[gff_record.start - 1 : gff_record.end])
+ if gff_record.strand == "-":
+ seq = seq.reverse_complement()
+ seq_from_dna = str(seq.translate())
+ if len(seq_from_dna) > 0 and seq_from_dna[-1] == "*":
+ seq_from_dna = seq_from_dna[:-1]
+ if len(protein_seq_cds) > 0 and protein_seq_cds[-1] == "*":
+ protein_seq_cds = protein_seq_cds[:-1]
+ eq = protein_seq_cds == seq_from_dna
+
+ score = None
+ if not eq and len(seq_from_dna) > 0:
+ try:
+ aligner = Align.PairwiseAligner()
+ res = aligner.align(protein_seq_cds, seq_from_dna)
+ score = res.score
+ except ValueError as ex:
+ print("error", gff_record)
+ raise ex
+
+ feature = MSFeature(feature_id, protein_seq_cds)
+ feature.description = f"score: {score}"
+ feature.gff = gff_record
+ return feature
+
+ @staticmethod
+ def from_fna_faa_gff(
+ filename_fna, filename_faa, filename_gff, _fn_get_id, prodigal=False
+ ):
+ genome_gff_features = _read_gff_features(filename_gff)
+ genome_faa = MSGenome.from_fasta(filename_faa)
+ contigs = MSGenome.from_fasta(filename_fna)
+
+ feature_lookup = {}
+ if prodigal:
+ for feature in genome_faa.features:
+ attr = dict(
+ x.split("=")
+ for x in feature.description.split(" # ")[-1].split(";")
+ )
+ if attr["ID"] not in feature_lookup:
+ feature_lookup[attr["ID"]] = feature
+ else:
+ raise ValueError("")
+ else:
+ feature_lookup = {feature.id: feature for feature in genome_faa.features}
+
+ features = []
+ for gff_record in genome_gff_features:
+ if gff_record.feature_type == "CDS":
+ feature_id = gff_record.attr.get("ID")
+ if _fn_get_id:
+ feature_id = _fn_get_id(gff_record)
+
+ feature_cds = feature_lookup.get(feature_id)
+
+ if feature_cds:
+ protein_seq_cds = feature_cds.seq
+ f = GenomeGff.read_sequence(
+ feature_id, gff_record, protein_seq_cds, contigs
+ )
+ features.append(f)
+ else:
+ print(f"not found {feature_id}")
+
+ genome = GenomeGff(contigs)
+ genome.features += features
+ return genome
diff --git a/modelseedpy/core/mstemplate.py b/modelseedpy/core/mstemplate.py
index 6bdc27a..cffc610 100644
--- a/modelseedpy/core/mstemplate.py
+++ b/modelseedpy/core/mstemplate.py
@@ -646,7 +646,7 @@ def from_dict(d, template):
d.get("pigment", 0),
d.get("carbohydrate", 0),
d.get("energy", 0),
- d.get("other", 0)
+ d.get("other", 0),
)
for item in d["templateBiomassComponents"]:
biocomp = MSTemplateBiomassComponent.from_dict(item, template)
diff --git a/modelseedpy/core/rast_client.py b/modelseedpy/core/rast_client.py
index 97211ad..cc36b7b 100644
--- a/modelseedpy/core/rast_client.py
+++ b/modelseedpy/core/rast_client.py
@@ -60,7 +60,7 @@ def __init__(self):
},
]
- def annotate_genome(self, genome):
+ def annotate_genome(self, genome, split_terms=True):
p_features = []
for f in genome.features:
if f.seq and len(f.seq) > 0:
@@ -70,9 +70,13 @@ def annotate_genome(self, genome):
for o in res[0]["features"]:
feature = genome.features.get_by_id(o["id"])
if "function" in o:
- functions = re.split("; | / | @", o["function"])
- for function in functions:
- feature.add_ontology_term("RAST", function)
+ rast_function = o["function"]
+ if split_terms:
+ functions = re.split("; | / | @", rast_function)
+ for function in functions:
+ feature.add_ontology_term("RAST", function)
+ else:
+ feature.add_ontology_term("RAST", rast_function)
return res[0]["analysis_events"]
diff --git a/pyproject.toml b/pyproject.toml
index 0ed5854..8e0e52d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
[tool.black]
line-length = 88
-python-version = ['py36']
+python-version = ['py38']
include = '\.pyi?$'
exclude = '''
(