class CompressedGene:
    """Store a DNA sequence as a packed integer, two bits per nucleotide.

    The sequence lives in ``bit_string``: a leading sentinel ``1`` bit
    followed by one two-bit code per nucleotide (A=00, C=01, G=10, T=11),
    with the first nucleotide in the most significant position. The sentinel
    marks where the sequence starts, so leading ``A`` nucleotides (all-zero
    bits) are not lost.
    """

    # Two-bit code assigned to each nucleotide letter.
    _NUCLEOTIDE_BITS = {"A": 0b00, "C": 0b01, "G": 0b10, "T": 0b11}
    # Nucleotide letters indexed by their two-bit code (inverse of the above).
    _BITS_NUCLEOTIDE = "ACGT"

    def __init__(self, gene: str):
        """Compress *gene* (case-insensitive string of A/C/G/T).

        Raises:
            ValueError: if *gene* contains any other character.
        """
        self._compress(gene)

    def _compress(self, gene: str):
        """Pack *gene* into ``self.bit_string``.

        The value is built in a local and stored only on success, so an
        invalid nucleotide never leaves a half-built ``bit_string`` behind.

        Raises:
            ValueError: if *gene* contains a character other than A/C/G/T
                (after upper-casing).
        """
        packed: int = 1  # start with sentinel
        for nucleotide in gene.upper():
            try:
                code = self._NUCLEOTIDE_BITS[nucleotide]
            except KeyError:
                raise ValueError("Invalid Nucleotide:{}".format(nucleotide)) from None
            # shift left two bits, then set the new low two bits
            packed = (packed << 2) | code
        self.bit_string: int = packed

    def decompress(self) -> str:
        """Return the nucleotide string encoded in ``self.bit_string``."""
        packed = self.bit_string
        # - 1 to exclude sentinel; offsets run from least significant pair up
        offsets = range(0, packed.bit_length() - 1, 2)
        # The last nucleotide sits in the lowest bits, so walk the offsets
        # from high to low to emit the sequence in its original order.
        return "".join(
            self._BITS_NUCLEOTIDE[packed >> offset & 0b11]  # get just 2 relevant bits
            for offset in reversed(offsets)
        )

    def __str__(self) -> str:  # string representation for pretty printing
        """Return the decompressed gene."""
        return self.decompress()
38+
39+
if __name__ == "__main__":
    from sys import getsizeof

    # Demo: report the size of a long gene string and of its packed integer
    # form, then confirm that decompressing gives back the original.
    original: str = "TAGGGATTAACCGTTATATATATATAGCCATGGATCGATTATATAGGGATTAACCGTTATATATATATAGCCATGGATCGATTATA" * 100
    original_size = getsizeof(original)
    print("original is {} bytes".format(original_size))

    compressed: CompressedGene = CompressedGene(original)  # compress
    packed_size = getsizeof(compressed.bit_string)
    print("compressed is {} bytes".format(packed_size))

    print(compressed)  # printing goes through __str__, which decompresses
    round_trip_matches = original == compressed.decompress()
    print("original and decompressed are the same: {}".format(round_trip_matches))
0 commit comments