Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 142 additions & 0 deletions modelseedpy/core/msgenome.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,27 @@ def parse_fasta_str(faa_str, split=DEFAULT_SPLIT, h_func=None):
return features


def read_gbff_records_from_file(filename: str):
if filename.endswith(".gbff"):
with open(filename, "r") as fh:
return read_gbff_records(fh)
elif filename.endswith(".gz"):
import gzip
from io import StringIO

with gzip.open(filename, "rb") as fh:
return read_gbff_records(StringIO(fh.read().decode("utf-8")))


def read_gbff_records(handler):
from Bio import SeqIO

gbff_records = []
for record in SeqIO.parse(handler, "gb"):
gbff_records.append(record)
return gbff_records


def extract_features(faa_str, split=DEFAULT_SPLIT, h_func=None):
features = []
active_seq = None
Expand Down Expand Up @@ -168,6 +189,45 @@ def from_fasta(filename, split=" ", h_func=None):
genome.features += read_fasta2(filename, split, h_func)
return genome

@staticmethod
def from_gbff_sequence(filename):
gbff_records = read_gbff_records_from_file(filename)
genome = MSGenome()
features = []
for rec in gbff_records:
feature = MSFeature(rec.id, str(rec.seq), description=rec.description)
features.append(feature)
genome.features += features
return genome

@staticmethod
def from_gbff_features(
filename, feature_id_qualifier="protein_id", description_qualifier="product"
):
gbff_records = read_gbff_records_from_file(filename)
genome = MSGenome()
features = []
for rec in gbff_records:
for f in rec.features:
if f.type == "CDS":
translations = f.qualifiers.get("translation", [])
if len(translations) == 1:
feature_id = f.qualifiers.get(feature_id_qualifier, [None])[0]
description = f.qualifiers.get(description_qualifier, [None])[0]
if feature_id:
feature = MSFeature(
feature_id, translations[0], description=description
)
features.append(feature)
else:
logger.warning(
f"skip feature: unable to fetch id from qualifier {feature_id_qualifier}"
)
elif len(translations) > 1:
logger.warning(f"skip feature: with multiple sequences {f}")
genome.features += features
return genome

def to_fasta(self, filename, l=80, fn_header=None):
to_fasta(self.features, filename, l, fn_header)
return filename
Expand Down Expand Up @@ -203,3 +263,85 @@ def _repr_html_(self):
<td>{len(self.features)}</td>
</tr>
</table>"""


class GenomeGff(MSGenome):
def __init__(self, contigs):
self.contigs = contigs
super().__init__()

@staticmethod
def read_sequence(feature_id, gff_record, expected_sequence, contigs):
from Bio.Seq import Seq
from Bio import Align

protein_seq_cds = expected_sequence
feature_contig = contigs.features.get_by_id(gff_record.contig_id)
seq = Seq(feature_contig.seq[gff_record.start - 1 : gff_record.end])
if gff_record.strand == "-":
seq = seq.reverse_complement()
seq_from_dna = str(seq.translate())
if len(seq_from_dna) > 0 and seq_from_dna[-1] == "*":
seq_from_dna = seq_from_dna[:-1]
if len(protein_seq_cds) > 0 and protein_seq_cds[-1] == "*":
protein_seq_cds = protein_seq_cds[:-1]
eq = protein_seq_cds == seq_from_dna

score = None
if not eq and len(seq_from_dna) > 0:
try:
aligner = Align.PairwiseAligner()
res = aligner.align(protein_seq_cds, seq_from_dna)
score = res.score
except ValueError as ex:
print("error", gff_record)
raise ex

feature = MSFeature(feature_id, protein_seq_cds)
feature.description = f"score: {score}"
feature.gff = gff_record
return feature

@staticmethod
def from_fna_faa_gff(
filename_fna, filename_faa, filename_gff, _fn_get_id, prodigal=False
):
genome_gff_features = _read_gff_features(filename_gff)
genome_faa = MSGenome.from_fasta(filename_faa)
contigs = MSGenome.from_fasta(filename_fna)

feature_lookup = {}
if prodigal:
for feature in genome_faa.features:
attr = dict(
x.split("=")
for x in feature.description.split(" # ")[-1].split(";")
)
if attr["ID"] not in feature_lookup:
feature_lookup[attr["ID"]] = feature
else:
raise ValueError("")
else:
feature_lookup = {feature.id: feature for feature in genome_faa.features}

features = []
for gff_record in genome_gff_features:
if gff_record.feature_type == "CDS":
feature_id = gff_record.attr.get("ID")
if _fn_get_id:
feature_id = _fn_get_id(gff_record)

feature_cds = feature_lookup.get(feature_id)

if feature_cds:
protein_seq_cds = feature_cds.seq
f = GenomeGff.read_sequence(
feature_id, gff_record, protein_seq_cds, contigs
)
features.append(f)
else:
print(f"not found {feature_id}")

genome = GenomeGff(contigs)
genome.features += features
return genome
2 changes: 1 addition & 1 deletion modelseedpy/core/mstemplate.py
Original file line number Diff line number Diff line change
Expand Up @@ -646,7 +646,7 @@ def from_dict(d, template):
d.get("pigment", 0),
d.get("carbohydrate", 0),
d.get("energy", 0),
d.get("other", 0)
d.get("other", 0),
)
for item in d["templateBiomassComponents"]:
biocomp = MSTemplateBiomassComponent.from_dict(item, template)
Expand Down
12 changes: 8 additions & 4 deletions modelseedpy/core/rast_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def __init__(self):
},
]

def annotate_genome(self, genome):
def annotate_genome(self, genome, split_terms=True):
p_features = []
for f in genome.features:
if f.seq and len(f.seq) > 0:
Expand All @@ -70,9 +70,13 @@ def annotate_genome(self, genome):
for o in res[0]["features"]:
feature = genome.features.get_by_id(o["id"])
if "function" in o:
functions = re.split("; | / | @", o["function"])
for function in functions:
feature.add_ontology_term("RAST", function)
rast_function = o["function"]
if split_terms:
functions = re.split("; | / | @", rast_function)
for function in functions:
feature.add_ontology_term("RAST", function)
else:
feature.add_ontology_term("RAST", rast_function)

return res[0]["analysis_events"]

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"

[tool.black]
line-length = 88
python-version = ['py36']
python-version = ['py38']
include = '\.pyi?$'
exclude = '''
(
Expand Down