diff --git a/BioExt/align/__init__.py b/BioExt/align/__init__.py index 835c6b4..b1285c7 100644 --- a/BioExt/align/__init__.py +++ b/BioExt/align/__init__.py @@ -305,6 +305,8 @@ def __call__( self.__cached_insertion_matrix ) + #print ("\n", score, "\n", "\n\n".join ([ref_, query_, str(open_insertion), str(extend_insertion), str (open_deletion), str(extend_deletion), str (miscall_cost), str (do_local), str (do_affine), ref_aligned.decode ('utf-8'), query_aligned.decode ('utf-8')])) + if sys.version_info >= (3, 0): ref_aligned = ref_aligned.decode('utf-8') query_aligned = query_aligned.decode('utf-8') diff --git a/BioExt/graphing/__init__.py b/BioExt/graphing/__init__.py index f2d9dc0..faa14ab 100644 --- a/BioExt/graphing/__init__.py +++ b/BioExt/graphing/__init__.py @@ -155,8 +155,11 @@ def _magic_ticks(lwr, upr, div=5): return ticks -def count_alignment(alignment, columns='all', refidx=None, limit=100): +def count_alignment(alignment, columns='all', refidx=None, limit=100, embedded_counts = None): records = [] + + if embedded_counts is not None: + import re if columns is None or columns == 'all': r = next(iter(alignment)) @@ -217,6 +220,7 @@ def count_alignment(alignment, columns='all', refidx=None, limit=100): counts = np.zeros((s, N), dtype=float) def allrecords(): + i = 0 for i, r in records: yield r for i, r in enumerate(alignment, start=i): @@ -226,17 +230,25 @@ def allrecords(): for r in allrecords(): for j, c in enumerate(columns): + if embedded_counts is not None: + m = embedded_counts.search (r.name) + if m is not None: + weight = float (m.group(1)) + else: + weight = 1. + ltr = r[c].upper() + if ltr in skips: continue elif ltr in ambigs: - frac = 1 / len(ambigs[ltr]) + frac = weight / len(ambigs[ltr]) for ltr_ in ambigs[ltr]: i = letters.index(ltr_) counts[i, j] += frac elif ltr in letters: i = letters.index(ltr) - counts[i, j] += 1 + counts[i, j] += weight else: raise ValueError('unknown letter: {0}'.format(ltr)) diff --git a/BioExt/misc/__init__.py b/BioExt/misc/__init__.py index b4765c0..4f27bfb 100644 --- a/BioExt/misc/__init__.py +++ b/BioExt/misc/__init__.py @@ -309,7 +309,7 @@ def compute_cigar(reference, record, reference_name=None, new_style=False): # inject the annotations and yield record.annotations['CIGAR'] = cigar record.annotations['edit_distance'] = edit_distance - record.annotations['position'] = start + record.annotations['position'] = start record.annotations['length'] = end - start record.annotations['reference_name'] = reference_name diff --git a/scripts/msa2json b/scripts/msa2json new file mode 100755 index 0000000..a8a20f7 --- /dev/null +++ b/scripts/msa2json @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 + +import sys, json, signal + +from argparse import ArgumentParser, FileType + +from Bio import AlignIO + +from BioExt.graphing import count_alignment + +def main(args=None): + if args is None: + args = sys.argv[1:] + + try: + signal.signal(signal.SIGPIPE, signal.SIG_DFL) + except ValueError: + pass + + parser = ArgumentParser(description='translate a FASTA nucleotide file') + parser.add_argument('-f', '--frame', type=int, choices=range(3), default=0) + parser.add_argument('-c', '--counts', action='store_true') + parser.add_argument('input', nargs='?', type=FileType('r'), default=sys.stdin) + parser.add_argument('output', nargs='?', type=FileType('w'), default=sys.stdout) + + ns = parser.parse_args(args) + + alignment = AlignIO.read(ns.input, 'fasta') + r = next(iter(alignment)) + + matcher = None + if ns.counts: + import re + matcher = re.compile ('\:([0-9]+)$') + + counts, alphabet = count_alignment (alignment, columns = list(range(len(r.seq))), limit = 1000000, embedded_counts = matcher) + + amino_acid_counts = {} + alphabet_letters = alphabet[0] + + for k in range (len(counts[0])): + amino_acid_counts [k + 1] = {} + for a_letter in range (len(alphabet_letters)): + if counts[a_letter][k] > 0.0: + amino_acid_counts [k + 1][alphabet_letters[a_letter]] = counts[a_letter][k] + if len (amino_acid_counts [k + 1]) == 0: + amino_acid_counts.pop (k+1) + + json.dump (amino_acid_counts, ns.output) + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/setup.py b/setup.py index b97bef4..32582e7 100644 --- a/setup.py +++ b/setup.py @@ -120,7 +120,8 @@ 'scripts/bealign', 'scripts/begraph', 'scripts/consensus', - 'scripts/translate' + 'scripts/translate', + 'scripts/msa2json', # 'scripts/variants' ], ext_modules=ext_modules,