11 July 2022

Partial Digest C++ implementation

https://github.com/leocfong/bio/blob/main/bio/digest.cpp
The rest of the code is in GitHub
#include "digest.h"
#include <iostream>
#include <vector>
#include "Util.h"
using namespace std;
vector<int> FindDigest(vector<int>& X)
{
    vector<int> v;
    size_t n = X.size();
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < i; j++)
        {
            int g = abs(X[i] - X[j]);

            v.push_back(g);
        }
    }
    return v;
}
void Place(vector<int>& L, vector<int>& X, const int width, std::vector<int>& Answer)
{
    if (L.size() == 0)
    {
        Answer = X;
        cout << endl << "*****************" << endl;
        cout << "Answer:" << endl;
        Render(Answer);
        cout << endl<<  "*****************" << endl;
        return;
    }
    int y = FindMax(L);
    vector<int> r1 = DeltaToVector(X, y);
    if (IsSubsetOf(r1, L))
    { 
        X.push_back(y);
        RemoveSubset(L, r1);
        cout << "A-X:"; Render(X); cout << "  L:"; Render(L); cout << "  r1:"; Render(r1); cout << endl;
        Place(L, X, width, Answer);
        cout << "B-X:"; Render(X); cout << "  L:"; Render(L); cout << "  r1:"; Render(r1); cout << endl;
        RemoveFromVector(X, y);
        Add(L, r1);
    }
    int wy = width - y;
    vector<int> r2 = DeltaToVector(X, wy);
    if (IsSubsetOf(r2, L))
    {
        X.push_back(wy);
        RemoveSubset(L, r2);
        cout << "A-X:"; Render(X); cout << "  L:"; Render(L); cout << "  r2:"; Render(r2); cout << endl;
        Place(L, X, width, Answer);
        cout << "B-X:"; Render(X); cout << "  L:"; Render(L); cout << "  r2:"; Render(r2); cout << endl;
        RemoveFromVector(X, wy);
        Add(L, r2);
    }
    return;
}
void PartialDigest(vector<int>& L, vector<int>& X, std::vector<int>& Answer)
{
   int width = FindMax(L);
   RemoveFromVector(L, width);
   X.push_back(0); X.push_back(width);
   cout << "X:";Render(X);cout << "  L:";Render(L); cout << endl;
   Place(L, X, width, Answer);
}

05 March 2022

Enumerating k-mers Lexicographically solution for ROSALIND - https://rosalind.info/problems/lexf/

 f = open("out.txt", "w")  
 input_file = open("rosalind_lexf.txt", "r")  
 lines = input_file.readlines()  
 char_array = lines[0].replace('\n', '').split(' ')  
 seq_len = int(lines[1])  
 char_array.sort()  
 char_array_len = len(char_array)  
 rep_len = []  
 rep_inc_counter = []  
 rep_char_index = []  
 rep_len_i = 1  
 for i in range(seq_len):  
   rep_inc_counter.append(0)  
   rep_char_index.append(0)  
   rep_len.append(rep_len_i)  
   rep_len_i = rep_len_i * char_array_len  
 num_combination = char_array_len ** seq_len  
 C = []  
 for i in range(num_combination):  
   s = ""  
   for l in range(seq_len):   
     s = char_array[rep_char_index[l]] + s  
     rep_inc_counter[l] = rep_inc_counter[l] + 1  
     if rep_inc_counter[l] == rep_len[l]:  
       rep_char_index[l] = rep_char_index[l] + 1  
       rep_inc_counter[l] = 0   
     if rep_char_index[l]==char_array_len:  
       rep_char_index[l] = 0  
   C.append(s)  
 for s in C:  
   print(s)  
   f.write(s+'\n')  
 f.close()     

27 February 2022

ROSALIND RNA Splicing solution - https://rosalind.info/problems/splc/

 import re  
 from asyncio.windows_events import NULL  
 f = open("out.txt", "w")  
 condon_len = 3  
 test= "TCTCGAGAGGCATATGGTCACATGATCGGTCGAGCGTGTTTCAAAGTTTGCGCCTAG"  
 dna_map = {  
   "TTT":"F", "TCT":"S", "TAT":"Y", "TGT":"C",   
 "TTC":"F", "TCC":"S", "TAC":"Y", "TGC":"C",   
 "TTA":"L", "TCA":"S", "TAA":"STOP", "TGA":"STOP",   
 "TTG":"L", "TCG":"S", "TAG":"STOP", "TGG":"W",   
 "CTT":"L", "CCT":"P", "CAT":"H", "CGT":"R",   
 "CTC":"L", "CCC":"P", "CAC":"H", "CGC":"R",   
 "CTA":"L", "CCA":"P", "CAA":"Q", "CGA":"R",   
 "CTG":"L", "CCG":"P", "CAG":"Q", "CGG":"R",   
 "ATT":"I", "ACT":"T", "AAT":"N", "AGT":"S",   
 "ATC":"I", "ACC":"T", "AAC":"N", "AGC":"S",   
 "ATA":"I", "ACA":"T", "AAA":"K", "AGA":"R",   
 "ATG":"M", "ACG":"T", "AAG":"K", "AGG":"R",   
 "GTT":"V", "GCT":"A", "GAT":"D", "GGT":"G",   
 "GTC":"V", "GCC":"A", "GAC":"D", "GGC":"G",   
 "GTA":"V", "GCA":"A", "GAA":"E", "GGA":"G",   
 "GTG":"V", "GCG":"A", "GAG":"E", "GGG":"G"  
 }  
 def GetAA(condon):  
   return dna_map[condon]  
 class Sequence:  
   def __init__(self, name, seq):  
     self.name = name  
     self.seq = seq  
     self.found = []  
   def p(self):  
     print("seq name:" + self.name + "len:" + str(len(self.seq)))   
     print("seq:" + self.seq)  
     for s in self.found:  
       print(s)   
   def pAA(self):  
     print (self.found[0])   
     f.write(self.found[0])     
   def Translate(self):  
     seq_len = len(self.seq)  
     aa_seq=""  
     start_pos = 0  
     end_pos = condon_len  
     while end_pos <= seq_len:  
       condon = self.seq[start_pos:end_pos]  
       AA = GetAA(condon)  
       if AA=="STOP":  
         break  
       aa_seq = aa_seq + AA  
       start_pos = start_pos + condon_len  
       end_pos = end_pos + condon_len  
     self.found.append(aa_seq)    
 input_file = open("rosalind_splc.txt", "r")  
 lines = input_file.readlines()  
 list = []  
 s = NULL  
 for l in lines:  
   if (l[0]==">"):  
     if s != NULL:  
       list.append(s)  
     s = Sequence(l.replace('\n', ''), '')  
   else:  
     s.seq = s.seq + l.replace('\n', '')  
 list.append(s)      
 list_len = len(list)  
 for i in range(1, list_len):  
   list[0].seq = list[0].seq.replace(list[i].seq, '')  
 list[0].Translate()  
 list[0].pAA()    
 f.close()     

26 February 2022

ROSALIND - Locating Restriction Sites solution to https://rosalind.info/problems/revp/

 c_map = {  
   "A":"T",  
   "G":"C",  
   "C":"G",  
   "T":"A"  
 }  
 def GetComplement(seq):  
   r = ''.join([c_map[x] for x in seq])  
   return r  
 def GetRevComplement(seq):  
   c = GetComplement(seq)   
   return c[::-1]    
 f = open("out.txt", "w")  
 input_file = open("rosalind_revp.txt", "r")  
 seq = ""  
 lines = input_file.readlines()  
 for l in lines:  
   if (l[0]!=">"):  
     seq = seq + l.replace("\n", "").replace("\r","")  
 seq_len = len(seq)  
 start_len = 2  
 end_len = 6  
 found_list = {}  
 for i in range(start_len, end_len+1):   
   for j in range(0, seq_len-(i*2)+1):  
     first_half = seq[j:j+i]  
     second_half =seq[j+i:j+i*2]  
     rc_second_half = GetRevComplement(second_half)  
     if (first_half == rc_second_half):  
       #print(first_half + " " + second_half + " " + rc_second_half + " " + str(j+1) + " " +str(i*2))  
       found_list[j+1]=i*2  
 sorted_found_list = sorted(found_list.items(), key=lambda item: item[0])  
 for key, value in sorted_found_list:  
   print(str(key) + " " + str(value))        
   f.write(str(key) + " " + str(value) + '\n')  
 f.close()    

25 February 2022

Enumerating Gene Orders - https://rosalind.info/problems/perm/

 from itertools import permutations  
 import math  
 f = open("out.txt", "w")  
 num=5  
 s=1   
 e=s+num  
 p=math.factorial(num)  
 print(p)  
 f.write(str(p) + '\n')  
 l = list(permutations(range(s, e)))  
 for i in range(p):  
   for j in range(num):  
     print(l[i][j], end=' ')  
     f.write(str(l[i][j]) + ' ')  
   print('')    
   f.write('\n')  
 f.close()    

Calculating Protein Mass - https://rosalind.info/problems/prtm/

 mmt = {  
 "A":71.03711,  
 "C":103.00919,  
 "D":115.02694,  
 "E":129.04259,  
 "F":147.06841,  
 "G":57.02146,  
 "H":137.05891,  
 "I":113.08406,  
 "K":128.09496,  
 "L":113.08406,  
 "M":131.04049,  
 "N":114.04293,  
 "P":97.05276,  
 "Q":128.05858,  
 "R":156.10111,  
 "S":87.03203,  
 "T":101.04768,  
 "V":99.06841,  
 "W":186.07931,  
 "Y":163.06333 }  
 input_file = open("rosalind_prtm.txt", "r")  
 AA = input_file.read().replace('\n','')  
 print(sum([mmt[x] for x in AA]))  

Answer for ROSALIND Open Reading Frames https://rosalind.info/problems/orf/

 import re  
 condon_len = 3  
 c_map = {  
   "A":"T",  
   "G":"C",  
   "C":"G",  
   "T":"A"  
 }  
 dna_map = {  
   "TTT":"F", "TCT":"S", "TAT":"Y", "TGT":"C",   
 "TTC":"F", "TCC":"S", "TAC":"Y", "TGC":"C",   
 "TTA":"L", "TCA":"S", "TAA":"STOP", "TGA":"STOP",   
 "TTG":"L", "TCG":"S", "TAG":"STOP", "TGG":"W",   
 "CTT":"L", "CCT":"P", "CAT":"H", "CGT":"R",   
 "CTC":"L", "CCC":"P", "CAC":"H", "CGC":"R",   
 "CTA":"L", "CCA":"P", "CAA":"Q", "CGA":"R",   
 "CTG":"L", "CCG":"P", "CAG":"Q", "CGG":"R",   
 "ATT":"I", "ACT":"T", "AAT":"N", "AGT":"S",   
 "ATC":"I", "ACC":"T", "AAC":"N", "AGC":"S",   
 "ATA":"I", "ACA":"T", "AAA":"K", "AGA":"R",   
 "ATG":"M", "ACG":"T", "AAG":"K", "AGG":"R",   
 "GTT":"V", "GCT":"A", "GAT":"D", "GGT":"G",   
 "GTC":"V", "GCC":"A", "GAC":"D", "GGC":"G",   
 "GTA":"V", "GCA":"A", "GAA":"E", "GGA":"G",   
 "GTG":"V", "GCG":"A", "GAG":"E", "GGG":"G"  
 }  
 found = []  
 input_file = open("rosalind_orf.txt", "r")  
 def GetComplement(seq):  
   r = ''.join([c_map[x] for x in seq])  
   return r  
 def GetAA(condon):  
   return dna_map[condon]  
 def GetAASeq(dna_seq, pos):  
   dna_seq_len = len(dna_seq)  
   seq=""  
   start_pos = pos  
   end_pos = pos + condon_len  
   condon = dna_seq[start_pos:end_pos]  
   AA = GetAA(condon)  
   while AA != "STOP":  
     seq = seq + AA  
     start_pos = start_pos + condon_len  
     end_pos = end_pos + condon_len  
     if (end_pos> (dna_seq_len-1)):  
       return "" #end of seq reached without finding stop condon  
     condon = dna_seq[start_pos:end_pos]  
     AA = GetAA(condon)  
   found.append(seq)    
   return seq    
 dna_seq = ""  
 lines = input_file.readlines()  
 for l in lines:  
   if (l[0]!=">"):  
     dna_seq = dna_seq + l.replace("\n", "").replace("\r","")  
 r_dna_seq = GetComplement(dna_seq[::-1])  
 seq_regex="ATG"  
 p = re.compile(seq_regex)  
 for m in p.finditer(dna_seq):  
   GetAASeq(dna_seq, m.start())  
 for m in p.finditer(r_dna_seq):  
   GetAASeq(r_dna_seq, m.start())    
 for s in list(set(found)):  
   print(s)     

17 February 2022

ROSALIND - Inferring mRNA from Protein

Answer to https://rosalind.info/problems/mrna/
 def GetMRna(AA):  
   l_aa = {key for key, value in aa_map.items() if value == AA}  
   return l_aa  
 input_file = open("rosalind_mrna.txt", "r")  
 aa_seq = input_file.read().replace("\n", "")  
 print(aa_seq)  
 aa_map = {  
   "UUU":"F", "UCU":"S", "UAU":"Y", "UGU":"C",   
 "UUC":"F", "UCC":"S", "UAC":"Y", "UGC":"C",   
 "UUA":"L", "UCA":"S", "UAA":"STOP", "UGA":"STOP",   
 "UUG":"L", "UCG":"S", "UAG":"STOP", "UGG":"W",   
 "CUU":"L", "CCU":"P", "CAU":"H", "CGU":"R",   
 "CUC":"L", "CCC":"P", "CAC":"H", "CGC":"R",   
 "CUA":"L", "CCA":"P", "CAA":"Q", "CGA":"R",   
 "CUG":"L", "CCG":"P", "CAG":"Q", "CGG":"R",   
 "AUU":"I", "ACU":"T", "AAU":"N", "AGU":"S",   
 "AUC":"I", "ACC":"T", "AAC":"N", "AGC":"S",   
 "AUA":"I", "ACA":"T", "AAA":"K", "AGA":"R",   
 "AUG":"M", "ACG":"T", "AAG":"K", "AGG":"R",   
 "GUU":"V", "GCU":"A", "GAU":"D", "GGU":"G",   
 "GUC":"V", "GCC":"A", "GAC":"D", "GGC":"G",   
 "GUA":"V", "GCA":"A", "GAA":"E", "GGA":"G",   
 "GUG":"V", "GCG":"A", "GAG":"E", "GGG":"G",   
 }  
 condon_len = []  
 stop_condons_len = 3  
 aa_seq_len = len(aa_seq)  
 for i in range(aa_seq_len):  
   condons = GetMRna(aa_seq[i])  
   condons_len = len(condons)  
   stop_condons_len = stop_condons_len * condons_len  
 print("Answer:" + str(stop_condons_len % 1000000) )      

06 February 2022

Rosalind - Answer to Finding a Protein Motif

Answer to Finding a Protein Motif https://rosalind.info/problems/mprt/
 import urllib.request  
 import re  
 from numpy import mat  
 #N-glycosylation motif  
 MotifRegex = "N[^P][ST][^P]"  
 file_out = open("output.txt", "w")  
 #Class to process protein sequence  
 class Sequence:  
   def __init__(self, name, seq):  
     self.name = name  
     self.seq = seq  
   def FindReg(self, reg):  
     match_string = ""  
     for match in re.finditer('(?={0})'.format(MotifRegex), self.seq):  
       index = match.start()+1  
       if len(match_string) == 0:  
         match_string = str(index)  
       else:  
         match_string = match_string + " " + str(index)  
     return match_string   
   def PrintOutcome(self):  
     match_string = self.FindReg(MotifRegex)  
     if len(match_string)>0:  
       print(self.name)   
       print(match_string)  
       file_out.write(self.name + '\n')  
       file_out.write(match_string + '\n')  
 #Get protein sequence from https://www.uniprot.org/uniprot/  
 def GetProtein(name):  
   url_base = "https://www.uniprot.org/uniprot/"  
   url_prefix = ".fasta"  
   url = url_base + name + url_prefix  
   with urllib.request.urlopen(url) as response:  
     html = response.read()  
     lines = html.decode().split("\n")    
     for line in lines:  
       if len(line)>1 and line[0] == ">":  
         protein = Sequence(name, "")  
       else:  
         protein.seq = protein.seq+line  
   return protein        
 ProteinList = []  
 input_file = open("rosalind_mprt.txt", "r")  
 #input_file = open("input.txt", "r")  
 lines = input_file.readlines()  
 for line in lines:  
   Protein = GetProtein(line.replace("\n", ""))  
   ProteinList.append(Protein)  
 for Protein in ProteinList:  
   Protein.PrintOutcome()    
 file_out.close()  

04 January 2022

Found a really useful rapidxml example

 #include <string.h>  
 #include <stdio.h>  
 #include <iostream>  
 #include <fstream>  
 #include <vector>  
 #include "rapidxml.hpp"  
 using namespace rapidxml;  
 using namespace std;  
 int main()  
 {  
           cout << "Parsing my beer journal..." << endl;  
           xml_document<> doc;  
           xml_node<>* root_node;  
           // Read the xml file into a vector  
           ifstream theFile("C:\\Work3\\ReadXml\\beerJournal.xml");  
           vector<char> buffer((istreambuf_iterator<char>(theFile)), istreambuf_iterator<char>());  
           buffer.push_back('\0');  
           // Parse the buffer using the xml file parsing library into doc   
           doc.parse<0>(&buffer[0]);  
           // Find our root node  
           root_node = doc.first_node("MyBeerJournal");  
           // Iterate over the brewerys  
           for (xml_node<>* brewery_node = root_node->first_node("Brewery"); brewery_node; brewery_node = brewery_node->next_sibling())  
           {  
                     printf("I have visited %s in %s. ",  
                               brewery_node->first_attribute("name")->value(),  
                               brewery_node->first_attribute("location")->value());  
                     // Interate over the beers  
                     for (xml_node<>* beer_node = brewery_node->first_node("Beer"); beer_node; beer_node = beer_node->next_sibling())  
                     {  
                               printf("On %s, I tried their %s which is a %s. ",  
                                         beer_node->first_attribute("dateSampled")->value(),  
                                         beer_node->first_attribute("name")->value(),  
                                         beer_node->first_attribute("description")->value());  
                               printf("I gave it the following review: %s", beer_node->value());  
                     }  
                     cout << endl;  
           }  
 }  
 <?xml version="1.0" encoding="utf-8"?>  
 <MyBeerJournal>  
   <Brewery name="Founders Brewing Company" location="Grand Rapids, MI">  
     <Beer name="Centennial" description="IPA" rating="A+" dateSampled="01/02/2011">  
       "What an excellent IPA. This is the most delicious beer I have ever tasted!"  
     </Beer>  
   </Brewery>  
   <Brewery name="Brewery Vivant" location="Grand Rapids, MI">  
     <Beer name="Farmhouse Ale" description="Belgian Ale" rating="B" dateSampled="02/07/2015">  
       This beer is not so good... but I am not that big of a fan of english style ales.  
     </Beer>  
   </Brewery>  
   <Brewery name="Bells Brewery" location="Kalamazoo, MI">  
     <Beer name="Two Hearted Ale" description="IPA" rating="A" dateSampled="03/15/2012">  
       Another execllent brew. Two Hearted gives Founders Centennial a run for it's money.  
     </Beer>  
   </Brewery>  
 </MyBeerJournal>