1   
   2  """ This script is used to evalulate the mapping results """ 
   3   
   4  import HTSeq 
   5  import cPickle as pickle 
   6  import numpy as np 
   7  import time 
   8  import sys 
   9  import re 
  10  import subprocess 
  11  import random  
  12  import os 
  13   
  14   
  15   
  16  alngts = 0 
  17  AlignedReadsdic = {} 
  18  algnedToRef = 0 
  19  algnedToArt = 0 
  20   
  21  """ Helper debug functions.... """ 
  23      pickle.dump(dictionaary, open(outputname + ".p", "wb")) 
  24      print("Saved .pickle!") 
   25   
  27      dictionary = pickle.load(open(inputname + ".p")) 
  28      print("Loaded " + inputname + ".pickle!") 
  29      return (dictionary) 
   30       
  31   
  33      """ 
  34      Class for exhaustive reading of the sam alignments. 
  35       
  36      """ 
  37       
  38 -    def __init__(self, readname, mr, nm, subs, score, mq, start, end, gaps, mism): 
   39          self.id = readname       
  40          self.mr = [mr]           
  41          self.subs =[subs]        
  42          self.nm = [nm]           
  43          self.rq = score          
  44          self.mq = [mq]           
  45          self.start = [start]     
  46          self.end = [end]         
  47                                       
  48          self.gaps = [gaps] 
  49          self.mism = [mism] 
   50           
  52          self.mr.append(read.mr[0])           
  53          self.subs.append(read.subs[0])      
  54          self.nm.append(read.nm[0])          
  55          self.mq.append(read.mq[0])           
  56          self.start.append(read.start[0])      
  57          self.end.append(read.end[0])        
  58          self.gaps.append(read.gaps[0]) 
  59          self.mism.append(read.mism[0]) 
   60           
  61 -    def toStr(self, readname):  
   62          """Converts the object to a string """ 
  63          str = "" 
  64   
  65          for i, read in enumerate(self.mr): 
  66               
  67              str += "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (readname, self.mr[i], self.subs[i], self.nm[i], self.rq, self.mq[i], self.start[i], self.end[i], self.gaps[i], self.mism[i]) 
  68          return(str) 
  69       
  71          """Converts the object to a string """ 
  72   
  73          if identifier == "art": 
  74              str = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (self.id, 0, self.subs[0], self.nm[0], self.rq, self.mq[0], self.start[0], self.end[0], self.gaps[0], self.mism[0]) 
  75          else: 
  76              str = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (self.id, 1, 0, self.nm[0], self.rq, self.mq[0], self.start[0], self.end[0], self.gaps[0], self.mism[0]) 
  77          return(str) 
  78           
  79           
  80   
  82      """ Class for exhaustive reading of the sam alignments. Only for TP """ 
  83 -    def __init__(self, nm, score, mq, start, end, gaps, mism): 
   84          self.nm = [nm]        
  85          self.rq = score                   
  86          self.mq = [mq]       
  87          self.start = [start]  
  88          self.end = [end]     
  89          self.gaps = [gaps] 
  90          self.mism = [mism] 
   91           
  93          self.nm.append( read.nm[0])          
  94          self.mq.append(read.mq[0])           
  95          self.start.append( read.start[0])      
  96          self.end.append( read.end[0])    
  97          self.gaps.append( read.gaps[0]) 
  98          self.mism.append(read.mism[0])    
   99   
 100 -    def toStr(self, readname):  
  101          str = "" 
 102          for i, read in enumerate(self.mr): 
 103              str += "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (readname, self.nm[i], self.rq, self.mq[i], self.start[i], self.end[i], self.gaps[i], self.mism[i]) 
 104          return(str) 
 105           
 106      """ returns 1 if an equal alignment is contained in the array """ 
 108          for i, position in enumerate(self.start): 
 109               
 110              if position == read.start and self.end[i] == read.end: 
 111                  return(1) 
 112          return(0) 
  113       
 114   
 116      """ Class for efficient addressing for np. arrays! """ 
 117 -    def __init__(self, internalID, quality): 
  118          self.internalID = internalID 
 119          self.quality = quality 
   120   
 121   
 122   
 123   
 124   
 126      """ 
 127      Funtion which transforms an HTSeq alignment to a TPRead class. 
 128   
 129      @type  alngt: alignment 
 130      @param alngt: alignment from the sam file. 
 131      @type  compareList: list 
 132      @param compareList: list which indicates differences between reference / artificial 
 133      @type  readdic: dictionary 
 134      @param readdic: Contains ranks and qualities for every entry in the fastq file 
 135       
 136      @rtype:   TPRead 
 137      @return:  Transformed ReadObject. 
 138      """ 
 139      cigar = alngt.cigar 
 140      gaps  = sum([i.size for i in cigar if i.type=="I" or i.type=="D"]) 
 141      mism = sum([i.size for i in cigar if i.type=="M"]) - sum([int(s) for s in alngt.optional_field("MD") if s.isdigit()]) 
 142      nm = alngt.optional_field("NM") 
 143       
 144       
 145      return (TPRead(nm, readdic[alngt.read.name], alngt.aQual, alngt.iv.start, alngt.iv.end, gaps, mism)) 
  146       
 148      """ 
 149      Funtion which transforms an HTSeq alignment to a CustomRead class. 
 150   
 151      @type  alngt: alignment 
 152      @param alngt: alignment from the sam file. 
 153      @type  compareList: list 
 154      @param compareList: list which indicates differences between reference / artificial 
 155      @type  readdic: dictionary 
 156      @param readdic: Contains ranks and qualities for every entry in the fastq file 
 157       
 158      @rtype:   CustomRead 
 159      @return:  Transformed ReadObject. 
 160   
 161      """ 
 162      cigar = alngt.cigar 
 163       
 164      gaps  = sum([i.size for i in cigar if i.type=="I" or i.type=="D"]) 
 165       
 166       
 167       
 168      mism = sum([i.size for i in cigar if i.type=="M"]) - getsum(re.findall("(\d+)", alngt.optional_field("MD"))) 
 169       
 170       
 171      nm = alngt.optional_field("NM") 
 172   
 173       
 174      if len(alngt.read.qualstr) <= 1: 
 175          
 176          return (CustomRead(alngt.read.name,0,nm,getHammingdistance(compareList, alngt.iv.start, alngt.iv.end),readdic[alngt.read.name],alngt.aQual,alngt.iv.start,alngt.iv.end,gaps,mism)) 
 177      else: 
 178          return (CustomRead(alngt.read.name,0,nm,getHammingdistance(compareList, alngt.iv.start, alngt.iv.end),(alngt.read.qual.mean() / (alngt.read.qual.max() -  alngt.read.qual.min() + 1)),alngt.aQual,alngt.iv.start,alngt.iv.end,gaps,mism)) 
  179   
 181      """ 
 182      Function to get a dictionary containing the rank of a read (given a sorted samfile by readname, samtools). 
 183   
 184      @type  referenceSAM: string 
 185      @param referenceSAM: Inputfile name for reference SAM file. 
 186      @rtype:   dictionary 
 187      @return:  Internalnaming according to the sorting of samtools. Key = ReadID, Value = rank 
 188      """ 
 189      internalDic = {} 
 190   
 191       
 192      readnames = str(subprocess.check_output("""awk < %s -F "\t" '{print  $1 }'""" % referenceSAM, shell=True)) 
 193   
 194       
 195      readnames = readnames.split("\n") 
 196   
 197      i = 0 
 198       
 199      while readnames[i][0] == ("@"): 
 200          i += 1 
 201       
 202      index = 0 
 203      for k in xrange(i, len(readnames)): 
 204          if readnames[k] in internalDic: 
 205              pass 
 206          else: 
 207              internalDic[readnames[k]] = index 
 208               
 209              index += 1 
 210               
 211               
 212               
 213      return(internalDic) 
  214   
 215   
 217      """ 
 218      Function which returns the next line from a SAMFile. 
 219       
 220      @type  textfile: fileobject stream 
 221      @param textfile: SAMfile 
 222      @type  compareList: list 
 223      @param compareList: AA sequence of the reference genome. 
 224      @type  readdic: Dictionary 
 225      @param readdic: Boolean which decides if unbalanced mutations are allowed (only initial mutation is performed) 
 226      @rtype:   Readobj 
 227      @return: Parsed read from text line. 
 228      """ 
 229           
 230       
 231       
 232      line = textfile.readline() 
 233      if not line: 
 234          return(0) 
 235      CRead, dummy = readSAMline(line, "noMem", compareList, readdic) 
 236      return(CRead) 
  237   
 238   
 239   
 241      """ 
 242      Reads the alignment tag given, the text and a tag to search for 
 243       
 244      @type  mdtag: string 
 245      @param mdtag: MDTag from alignment 
 246      @type  cigar: string 
 247      @param cigar: Cigar from alignment 
 248      @rtype:   gaps,mismatches 
 249      @return: Parsed gaps and mismatches 
 250      """ 
 251      deletions = getsum(re.findall("(\d+)D", cigar)) 
 252      insertions = getsum(re.findall("(\d+)I", cigar)) 
 253      gaps = int(insertions + deletions) 
 254   
 255       
 256       
 257       
 258   
 259      mismatch = getsum(re.findall("(\d+)M", cigar)) - getsum(re.findall("(\d+)", mdtag)) 
 260   
 261    
 262       
 263      return(gaps, mismatch) 
  264   
 265 -def getAllID(textfile, read, compareList, readdic): 
  266      """ 
 267      Reads all alignments for the current read which are in the SAM file (sorted). If a new read ID is scanned the results are returned. 
 268       
 269      @type  textfile:  fileobject stream 
 270      @param textfile: SAM file for reading. 
 271      @type  read: read obj 
 272      @param read: the last read obj which defines the current read id 
 273      @type  readdic: dictionary 
 274      @param readdic: Look up for quality values. 
 275      @rtype:   (bool,list,bool) 
 276      @return: a "triple", where 2 bools are defined as indicator variables and a list with all alignments for one read. 
 277      """ 
 278       
 279      if read == 0: 
 280           
 281          CRead = getNextLine(textfile, compareList, readdic) 
 282          return(getAllID(textfile, CRead, compareList, readdic)) 
 283      else: 
 284           
 285          templist = [] 
 286          templist.append(read) 
 287       
 288      while 1: 
 289          line = textfile.readline() 
 290           
 291          if not line: 
 292              return(0, templist, 0) 
 293           
 294          try: 
 295             CRead = getCustomRead(HTSeq.SAM_Alignment.from_SAM_line(line), compareList, readdic) 
 296              
 297          except: 
 298              CRead, dummy = readSAMline(line, "noMem", compareList, readdic) 
 299           
 300          if CRead == 0: 
 301               
 302              return(1, templist, CRead) 
 303           
 304          if CRead.id == read.id: 
 305              templist.append(CRead) 
 306          else: 
 307              return(1, templist, CRead) 
  308           
 309       
 311      """ 
 312      Compares alignments for the reference and artificial reference for a specific read id. 
 313      The goal is to identify false positives. 
 314   
 315      @type  reflist: read obj list 
 316      @param reflist: list containing alignments witht the same ID (reference) 
 317      @type  artlist: read obj list 
 318      @param artlist: list containing alignments witht the same ID (artificial) 
 319      @rtype:   np.array 
 320      @return:  indices of unique alignments 
 321      """ 
 322      
 323       
 324      artlen = len(artlist) 
 325      reflen = len(reflist) 
 326   
 327      TPcount = reflen 
 328      FPcount = 0 
 329      for RefRead in reflist: 
 330           file.write(RefRead.toStrNoMem("ref")) 
 331       
 332       
 333      for i in xrange(0, artlen): 
 334           
 335          ArtInRef = 0 
 336          for j in xrange(0,reflen): 
 337              if artlist[i].start == reflist[j].start and artlist[i].end == reflist[j].end: 
 338                  ArtInRef = 1 
 339                  break 
 340           
 341          if (ArtInRef == 0): 
 342              FPcount+=1 
 343              file.write(artlist[i].toStrNoMem("art")) 
 344       
 345       
 346      return(TPcount,FPcount) 
  347       
 348  """def WriteToFile(file, list, identifier): 
 349       
 350      Writes read obj. to a file. 
 351   
 352   
 353      @type  file: read obj list 
 354      @param file: list containing alignments witht the same ID (reference) 
 355      @type  list: read obj. list 
 356      @param list: list containing alignments witht the same ID (artificial) 
 357      @type  identifier: string 
 358      @param identifier: indicator if the results are from the reference or artificial 
 359       
 360      file.write(getBestAlignment(list).toStrNoMem(identifier))""" 
 361   
 363      """ 
 364      Skips the header from a SAM file, but reads the first line of the alignment section. 
 365   
 366      @type  file: read obj list 
 367      @param file: list containing alignments witht the same ID (reference) 
 368      @type  compareList: list 
 369      @param compareList: list for accumulation of the same read id 
 370      @type  readdic: dictionary 
 371      @param readdic: dictionary containing read ID - read quality mappings. 
 372      @rtype:   read obj. 
 373      @return:  Returns a read obj. from the SAM file. 
 374      """ 
 375      while 1: 
 376          temp = file.readline() 
 377          if temp.startswith("@"): 
 378              pass 
 379          else: 
 380              return(readSAMline(temp, "noMem", compareList, readdic)) 
 381              break 
  382   
 383   
 385      """ 
 386      Function which returns the ranks of 2 given readIDs (read from reference,read from artificial). 
 387   
 388      @type  RefRead: read obj  
 389      @param RefRead: read obj  (reference) 
 390      @type  ArtRead: read obj  
 391      @param ArtRead: read obj  (artificial) 
 392      @type  rankdic: dictionary 
 393      @param rankdic: dictionary containing ranks of the read IDs (according to the sorted SAM files). 
 394      @rtype:   int,int 
 395      @return:  returns the false positives and true positives for a SAM file pair (reference, artificial) 
 396      """ 
 397       
 398      RefReadRank = rankdic[RefRead[0].id] 
 399      if ArtRead[0].id in rankdic: 
 400          ArtReadRank = rankdic[ArtRead[0].id] 
 401      else: 
 402           
 403          ArtReadRank = RefReadRank - 1 
 404           
 405      return(RefReadRank,ArtReadRank) 
  406   
 407   
 408 -def ReadSAMnoMem(ref, art, output, compareList, readdic, rankdic): 
  409      """ 
 410      Main function for comparing to mappings. This functions takes the complete alignments for artificial and reference genome and goes through them in parallel. 
 411      Since the mappings are sorted the function alternates the parsing of the samfiles in such a way that no memory is used for comparing these functions. 
 412   
 413      @type  ref: string 
 414      @param ref: path to reference alignments (SAM file) 
 415      @type  art: string 
 416      @param art: path to artificial alignments (SAM file) 
 417      @type  output: read obj  
 418      @param output: read obj  (artificial) 
 419      @type  compareList: read obj  
 420      @param compareList: read obj  (artificial) 
 421      @type  readdic: dictionary 
 422      @param readdic: dictionary containing read ID - read quality mappings. 
 423      @type  rankdic: dictionary 
 424      @param rankdic: dictionary containing ranks of the read IDs (according to the sorted SAM files). 
 425      @rtype:   ranks 
 426      @return:  Returns the ranks for the 2 read ids. 
 427      """ 
 428       
 429      fobj = open(output, "w") 
 430      fobj.write("#ReadID\tMatchedReference\tSubstitutions\tNMtag\tReadQuality\tMappingQuality\tStart\tEnd\tGaps\tMisM\r\n") 
 431      fileref = open(ref, "r") 
 432      fileart = open(art, "r") 
 433       
 434       
 435      CurrentReadRef, dummy = SkipHeader(fileref, compareList, readdic) 
 436      CurrentReadArt, dummy = SkipHeader(fileart, compareList, readdic)  
 437       
 438       
 439      hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic) 
 440      hasNextLine, RefRead, NextReadRef = getAllID(fileref, CurrentReadRef, compareList, readdic) 
 441       
 442   
 443      tp = 0 
 444      fp = 0 
 445      MappedReads = 0 
 446      i = 0 
 447      start = time.time()            
 448       
 449      ArtBool = 0 
 450      RefBool = 0 
 451      while 1: 
 452          i += 1 
 453          if i == 100000: 
 454              print "\t\t%f" % (time.time() - start), 
 455           
 456          RefReadRank,ArtReadRank = getRanks(RefRead,ArtRead,rankdic) 
 457           
 458           
 459           
 460           
 461           
 462          while (RefReadRank > ArtReadRank and hasNextLine != 0): 
 463               
 464              for l in xrange(0, len(ArtRead)): 
 465                  ArtRead[l].mr = [0] 
 466               
 467               
 468              for hits in ArtRead: 
 469                  fobj.write(hits.toStrNoMem("art")) 
 470                  fp += 1 
 471   
 472               
 473               
 474              CurrentReadArt = NextReadArt 
 475               
 476              hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic) 
 477               
 478               
 479              ArtReadRank  = RefReadRank-1 if ArtRead[0].id not in rankdic else rankdic[ArtRead[0].id] 
 480              MappedReads+=1 
 481               
 482           
 483           
 484           
 485           
 486          while (RefReadRank < ArtReadRank and hasNextLine != 0): 
 487               
 488              for hits in RefRead: 
 489                  fobj.write(hits.toStrNoMem("ref")) 
 490                  tp += 1 
 491   
 492              CurrentReadRef = NextReadRef 
 493              hasNextLine, RefRead, NextReadRef = getAllID(fileref, CurrentReadRef, compareList, readdic) 
 494              RefReadRank = rankdic[RefRead[0].id] 
 495              MappedReads+=1 
 496               
 497               
 498           
 499           
 500          if (RefReadRank == ArtReadRank): 
 501              MappedReads+=1 
 502               
 503               
 504               
 505              tempTP,tempFP = CompareAlignments(RefRead, ArtRead,fobj) 
 506              tp +=tempTP 
 507              fp +=tempFP 
 508               
 509              CurrentReadRef = NextReadRef 
 510              CurrentReadArt = NextReadArt 
 511               
 512               
 513               
 514              hasNextLine, RefRead, NextReadRef = getAllID(fileref, CurrentReadRef, compareList, readdic) 
 515              if hasNextLine == 0: 
 516                   
 517                  RefReadRank = rankdic[RefRead[0].id] 
 518                  RefBool = 1 
 519                  break 
 520               
 521              hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic) 
 522              if hasNextLine == 0: 
 523                   
 524                  ArtReadRank  = RefReadRank-1 if ArtRead[0].id not in rankdic else rankdic[ArtRead[0].id] 
 525                  ArtBool = 1 
 526                  break 
 527   
 528       
 529      hasNextLine = 1 
 530       
 531      if RefBool == 1: 
 532           
 533           
 534          hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic) 
 535          ArtReadRank  = RefReadRank-1 if ArtRead[0].id not in rankdic else rankdic[ArtRead[0].id] 
 536           
 537          while (RefReadRank > ArtReadRank and hasNextLine != 0): 
 538              MappedReads+=1 
 539               
 540              for l in xrange(0, len(ArtRead)): 
 541                  ArtRead[l].mr = [0] 
 542       
 543                
 544                
 545              for hits in ArtRead: 
 546                  fobj.write(hits.toStrNoMem("art")) 
 547                  fp += 1 
 548               
 549              CurrentReadArt = NextReadArt 
 550               
 551              hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic) 
 552       
 553               
 554              ArtReadRank  = RefReadRank-1 if ArtRead[0].id not in rankdic else rankdic[ArtRead[0].id] 
 555               
 556           
 557           
 558          if (RefReadRank == ArtReadRank): 
 559              MappedReads+=1 
 560               
 561               
 562              tempTP,tempFP = CompareAlignments(RefRead, ArtRead,fobj) 
 563              tp +=tempTP 
 564              fp +=tempFP 
 565               
 566               
 567          while 1: 
 568               
 569              if hasNextLine == 0: 
 570                  break 
 571              else: 
 572                  
 573                  CurrentReadArt = NextReadArt 
 574                   
 575                  hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic) 
 576                   
 577               
 578              for hits in ArtRead: 
 579                  fobj.write(hits.toStrNoMem("art")) 
 580                  fp += 1 
 581              MappedReads+=1 
 582               
 583      else: 
 584           
 585          while (RefReadRank < ArtReadRank and hasNextLine != 0): 
 586              MappedReads+=1 
 587           
 588              for hits in RefRead: 
 589                  fobj.write(hits.toStrNoMem("ref")) 
 590                  tp += 1 
 591              CurrentReadRef = NextReadRef 
 592              hasNextLine, RefRead, NextReadRef = getAllID(fileref, CurrentReadRef, compareList, readdic) 
 593              RefReadRank = rankdic[RefRead[0].id] 
 594           
 595           
 596          if (RefReadRank == ArtReadRank): 
 597              MappedReads+=1 
 598               
 599               
 600              tempTP,tempFP = CompareAlignments(RefRead, ArtRead,fobj) 
 601              tp +=tempTP 
 602              fp +=tempFP 
 603   
 604           
 605          else: 
 606              MappedReads+=1 
 607              for hits in ArtRead: 
 608                  fobj.write(hits.toStrNoMem("art")) 
 609                  fp += 1 
 610           
 611          while 1: 
 612                  if hasNextLine == 0: 
 613                      MappedReads+=1 
 614                      for hits in RefRead: 
 615                          fobj.write(hits.toStrNoMem("ref")) 
 616                          tp += 1 
 617                       
 618                      break 
 619                  else: 
 620                      CurrentReadRef = NextReadRef 
 621                      MappedReads+=1 
 622                      for hits in RefRead: 
 623                          fobj.write(hits.toStrNoMem("ref")) 
 624                          tp += 1 
 625                  hasNextLine, RefRead, NextReadRef = getAllID(fileref, CurrentReadRef, compareList, readdic) 
 626   
 627      end = time.time() 
 628      print ("\t\t%f\t\t" %(end - start)), 
 629      fobj.close() 
 630      return(tp, fp,tp+fp,MappedReads) 
  631   
 632   
 633   
 635      """ 
 636      Reads the alignment tag, given the text and a tag to search for. 
 637   
 638      @type  line: string 
 639      @param line: SAM line 
 640      @type  tag: string 
 641      @param tag: SAM tag. i.e: NM,MD 
 642      @rtype:   int 
 643      @return:  number x behind desired tag tag:i:x 
 644      """ 
 645   
 646       
 647       
 648      m = re.search(tag + ":i:(\d+)", line) 
 649      besthits = m.group(1) 
 650         
 651      return(besthits) 
  652   
 654      """ 
 655      Sum of strings (as ints) 
 656   
 657      @type  strlist: list(str) 
 658      @param strlist: SAM line 
 659   
 660      @rtype:   int 
 661      @return:  MD tag calculation. 
 662      """ 
 663   
 664      return(sum([int(i) for i in strlist])) 
  665   
 667      """ 
 668      Mean of strings. 
 669   
 670      @type  strlist: list(str) 
 671      @param strlist: SAM line 
 672   
 673      @rtype:   float 
 674      @return:  mean 
 675      """ 
 676      sum = getsum(strlist) 
 677      return(sum/float(len(strlist))) 
  678   
 679   
 681      """ 
 682      Computes the ReadQualityScore given a quality string 
 683   
 684      @type  quality: string 
 685      @param quality: quality string of a read. 
 686   
 687      @rtype:   float 
 688      @return:  ReadQualityScore (RQS) 
 689      """ 
 690       
 691      qualarray = [ord(i)-33 for i in quality] 
 692       
 693      score =( sum(qualarray)/float(len(qualarray))) / (max(qualarray) -  min(qualarray) + 1) 
 694   
 695      return (score) 
  696   
 697   
 699      """ 
 700      Computes the alignment length given a cigar string. Needed for Start + End calculation. 
 701   
 702      @type  cigar: string 
 703      @param cigar: Cigar string (SAM) 
 704   
 705      @rtype:   int 
 706      @return:  alignmentlength 
 707      """ 
 708       
 709       
 710       
 711       
 712      match = re.findall("(\d+)[MDN]", cigar) 
 713      sumOfMatches = 0 
 714      for item in match: 
 715          sumOfMatches += int(item) 
 716      return (sumOfMatches) 
  717   
 718   
 720      """ 
 721      Checks alignment line for unnecessary informations to skip  
 722   
 723      @type  alignment: string 
 724      @param alignment: Line from SAM 
 725      @type  identifier: string 
 726      @param identifier: read id 
 727      @type  compareList: list 
 728      @param compareList: list of alignments with same read id. 
 729      @type  readdic: dictionary 
 730      @param readdic: Dictionary containg a read id; read quality mapping 
 731      @rtype:   readobj, readname 
 732      @return:  Returns the read and it's identifier. 
 733      """ 
 734       
 735      if alignment.startswith("@"): 
 736          return(0, "") 
 737      else: 
 738          read, readname = readSAMline(alignment, identifier, compareList, readdic) 
 739          if read == 0: 
 740              return(0, "") 
 741          else: 
 742              return (read, readname) 
  743   
 744   
 746      """ 
 747      Function for comparison of artificial alignments and reference alignments. FP are defined such that start and end position must be unique to the artificial reference 
 748      returns 0 if no same read is found (FP found) 
 749      returns 1 if an equal alignment is found 
 750   
 751      @type  readref: read obj. 
 752      @param readref: reference 
 753      @type  readart: read obj. 
 754      @param readart: artificial 
 755      @rtype:   bool 
 756      @return:  Indicator if alignment is the same (start & end equal) 
 757      """ 
 758      for i, item in enumerate(readref.start): 
 759          if (readref.start[i] == readart.start[0] and readref.end[i] == readart.end[0]): 
 760              return(1) 
 761      return(0) 
  762   
 763   
 764   
 766      """ 
 767      Computes the number of subsitutions in the artificial reference using the CompareString. 
 768   
 769      @type  CompareString: string 
 770      @param CompareString: string of 0 and 1s. 1 = hamming 1 between reference and artificial. 
 771      @type  start: int 
 772      @param start: start of alignment 
 773      @type  end: int 
 774      @param end: end ofalignment 
 775      @rtype:   int 
 776      @return:  hamming distance 
 777      """ 
 778      return (sum(CompareString[start:end])) 
  779       
 780   
 782      """ 
 783       
 784      Reads a .fastqfile and calculates a defined readscore  
 785      input: fastq file 
 786      output: fastq dictionary key = readid; value = qualstr  
 787   
 788   
 789      @type  fastqfile: string 
 790      @param fastqfile: path to fastq file 
 791   
 792      @rtype:   dictionary 
 793      @return:  dictionary containing read ids and read qualities. 
 794      """ 
 795      fastq_file = HTSeq.FastqReader(fastqfile , "phred") 
 796      readdictionary = {} 
 797       
 798      for read in fastq_file: 
 799          readdictionary[read.name] = ComputeRQScore(read.qualstr) 
 800      print("\tReading Fastq file done!") 
 801      return readdictionary 
  802   
 803   
 804   
 805   
 807      """ 
 808      Extends a given dictionary with KEY = readid and VALUE = qualstr such that an internal naming is generated which can be used to efficiently create an numpy array 
 809   
 810   
 811      @type  readdic: dictionary 
 812      @param readdic: dictionary containing read ids and read qualities. 
 813   
 814      @rtype:   dictionary 
 815      @return:  extended readdic with KEY = ID, VALUE = READID object with READID.internalid and READID.qulstr = qualstr 
 816      """ 
 817      internalnaming = 0 
 818      reverseReadDic = np.zeros(len(readdic), dtype='S50') 
 819      for id in readdic.iterkeys(): 
 820           
 821          readdic[id] = ReadID(internalnaming, readdic[id]) 
 822          reverseReadDic[internalnaming] = id 
 823          internalnaming += 1 
 824      return(readdic, reverseReadDic) 
  825   
 826            
 827   
 829      """ 
 830      Returns a sequence string from a fasta file. 
 831   
 832   
 833      @type  fasta: string 
 834      @param fasta: path to fasta file. 
 835   
 836      @rtype:   string 
 837      @return:  sequence 
 838      """ 
 839      fastafile = HTSeq.FastaReader(fasta) 
 840      for sequence in fastafile: 
 841          return(sequence.seq) 
  842       
 843       
 844      
 846      """ 
 847      Creates a list which is used for comparisons between aligned reads (exact number of mismatches)  
 848   
 849   
 850   
 851      @type  Reference: string 
 852      @param Reference: reference genome 
 853      @type  ARG: string 
 854      @param ARG: artificial reference genome. 
 855      @rtype:   list 
 856      @return:  list containt 1s, where there is a difference in the genomes and 0s where the nucleotides are equal. 
 857      """ 
 858       
 859      reference = returnSequence(Reference) 
 860      artificialreference = returnSequence(ARG) 
 861      complist = [] 
 862       
 863       
 864      if (len(reference) != len(artificialreference)): 
 865          print "first 10 letter:\t\t" + reference[0:10] + "..." + reference[-10:] 
 866          print "last 10 letter:\t\t" + artificialreference[0:10] + "..." + artificialreference[-10:] 
 867          print ("Error! Two Sequences have different length! Try to add a line break after the last nucleotide.") 
 868          sys.exit(1) 
 869      else: 
 870          return([ 0 if x == artificialreference[i] else 1 for i, x in enumerate(reference)]) 
  871       
 872   
 873   
 874 -def readSAMline(alignment, identifier, compareList, readdic): 
  875      """ 
 876      Function for reading SAM alignment text file (one line) 
 877   
 878   
 879   
 880      @type  alignment: string 
 881      @param alignment: SAM alignment 
 882      @type  identifier: string 
 883      @param identifier: read id 
 884      @type  compareList: list 
 885      @param compareList: list containt 1s, where there is a difference in the genomes and 0s where the nucleotides are equal. 
 886      @type  readdic: dictionary 
 887      @param readdic: dictionary containing read ids and read qualities. 
 888      @rtype:   read obj. 
 889      @return:  returns a customRead object 
 890      """ 
 891      k = 0 
 892   
 893       
 894      columns = alignment.split("\t") 
 895      flag = int(columns[1].strip()) 
 896       
 897       
 898      if flag == 4: 
 899          return(0, "") 
 900           
 901      else: 
 902          
 903           
 904          readname = columns[0].strip() 
 905          reference = columns[2].strip() 
 906          start = int(columns[3].strip())-1 
 907          mappingquality = columns[4].strip() 
 908          cigar = columns[5].strip() 
 909          qualstr = columns[10].strip() 
 910          tags = " ".join(columns[11:]) 
 911           
 912          mdtag = re.search("MD:Z:([^\s]+)", tags).group(1) 
 913          try: 
 914              mdtag = re.search("MD:Z:([^\s]+)", tags).group(1) 
 915              gaps, mism = getMisGap(mdtag, cigar) 
 916          except: 
 917              gaps = 0 
 918              mism = 0 
 919              print "Error Reading MDtag of %s.Setting gaps = 0,mism = 0" % readname 
 920          try: 
 921              nm = int(getNumberOf(tags, "NM")) 
 922          except: 
 923              nm = 0 
 924           
 925          leng = getAlignmentLength(cigar) 
 926           
 927          if len(qualstr) == 1 or (len(qualstr.strip()) == 0): 
 928              score = readdic[readname].quality 
 929          else: 
 930              score = ComputeRQScore(qualstr) 
 931           
 932          if identifier == "art": 
 933               
 934              tempobj = CustomRead(readname, 0, nm, getHammingdistance(compareList, start, start + leng), score, mappingquality, start, start + leng, gaps, mism) 
 935          elif identifier == "noMem": 
 936              tempobj = CustomRead(readname, 1, nm, getHammingdistance(compareList, start, start + leng), score, mappingquality, start, start + leng, gaps, mism) 
 937               
 938          else: 
 939               
 940              tempobj = TPRead(nm, score, mappingquality, start, start + leng, gaps, mism) 
 941   
 942   
 943   
 944      return(tempobj, readname) 
  945   
 946   
 947   
 948   
 949   
 951      """ 
 952      Returns the index of a read. The index is prescribed by the ordering in the sam file. 
 953      @type  readname: string 
 954      @param readname: read id 
 955      @type  readdic: dictionary 
 956      @param readdic: dictionary containing read ids and read qualities.ArtRead 
 957      @rtype: int 
 958      @return:  index 
 959      """ 
 960      return(readdic[readname].internalID) 
  961   
 962   
 964      """ 
 965      Function for reading the artificial reference genome using HTSeq.This function is mainly used. Only if no quality string is in the SAM line. The custom 
 966      SAM reading function is used. 
 967       
 968      @type  art: string 
 969      @param art: artificial file. 
 970      @type  RefArray: array 
 971      @param RefArray: Results from reading the reference SAM file. 
 972      @type  compareList: list 
 973      @param compareList: list containt 1s, where there is a difference in the genomes and 0s where the nucleotides are equal. 
 974      @type  readdic: dictionary 
 975      @param readdic: dictionary containing read ids and read qualities. 
 976      @rtype:   array 
 977      @return:  aligned read objects in an array. 
 978      """ 
 979      start = time.time() 
 980       
 981      fobj = open(art, "r") 
 982      artdic = {} 
 983      k = 0 
 984       
 985      read = SkipHeader(fobj,compareList,readdic) 
 986       
 987       
 988      for alignment in fobj: 
 989          k += 1 
 990          if k % 1000000 == 0: 
 991              print ("%d.." %(k/1000000)), 
 992          read, readname = isSaneAlignment(alignment, "art", compareList, readdic) 
 993          if read == 0: 
 994              pass 
 995          else: 
 996               
 997              index = returnIndex(readdic, readname) 
 998              if RefArray[index] != 0: 
 999                   
1000                  if (RefArray[index].isContained(read) == 0): 
1001                      if readname in artdic: 
1002                          artdic[readname].toObjself(read) 
1003                       
1004                      else: 
1005                          artdic[readname] = read 
1006                  else: 
1007                      pass 
1008                   
1009              else: 
1010                   
1011                  artdic[readname] = read 
1012              
1013    
1014      fobj.close() 
1015      end = time.time() 
1016       
1017      print ("\t %f " % (end - start)), 
1018       
1019      return(artdic) 
 1020   
1021   
1022   
1024      """ 
1025      Write function for hits ont he reference genome. Only the best alignment (least mismatches) 
1026      Header : 
1027      ReadID    MatchedReference     Substitutions     NumberOfMismatches     ReadQuality     MappingQuality 
1028       
1029   
1030      @type  outfile: string 
1031      @param outfile: Path to outfile. 
1032      @type  ReadDic: dictionary 
1033      @param ReadDic: dictionary containing read ids and read qualities. 
1034      """ 
1035   
1036      fobj = open(outfile, "a") 
1037      for read in ReadDic.keys(): 
1038         BestReadIndex = np.array(ReadDic[read].nm).argmin() 
1039         fobj.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (read, ReadDic[read].mr[BestReadIndex], ReadDic[read].subs[BestReadIndex], ReadDic[read].nm[BestReadIndex], ReadDic[read].rq, ReadDic[read].mq[BestReadIndex], ReadDic[read].start[BestReadIndex], ReadDic[read].end[BestReadIndex], ReadDic[read].gaps[BestReadIndex], ReadDic[read].mism[BestReadIndex])) 
1040      fobj.close() 
 1041   
1042   
1043   
1044   
1045 -def writeToTabRef(RefArray, outfile, reverseReadArray, ReadDic): 
 1046      """  
1047      Write function for hits ont he reference genome. Only the best alignment (least mismatches) 
1048       
1049      Header : 
1050      ReadID    MatchedReference     Substitutions     NumberOfMismatches     ReadQuality     MappingQuality 
1051      rows in the infile 
1052      @type  RefArray: string 
1053      @param RefArray: path to inputfile. 
1054      @type  outfile: int 
1055      @param outfile: rows in the infile 
1056      @type  reverseReadArray: dictionary 
1057      @param reverseReadArray: Contains reads = values and ranks = keys 
1058      @type  ReadDic: dictionary 
1059      @param ReadDic: dictionary containing read ids and read qualities. 
1060      """ 
1061   
1062      fobj = open(outfile, "a") 
1063      for i in xrange(0, len(RefArray)): 
1064          read = RefArray[i] 
1065           
1066          if read == 0: 
1067              continue 
1068          else: 
1069              readname = reverseReadArray[i] 
1070              index = returnIndex(ReadDic, readname) 
1071              BestReadIndex = np.array(RefArray[index].nm,dtype="int").argmin() 
1072              fobj.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (readname, 1, 0, RefArray[index].nm[BestReadIndex], RefArray[index].rq, RefArray[index].mq[BestReadIndex], RefArray[index].start[BestReadIndex], RefArray[index].end[BestReadIndex], RefArray[index].gaps[BestReadIndex], RefArray[index].mism[BestReadIndex])) 
1073      fobj.close() 
 1074   
1075   
1076   
1078      """ 
1079      Reads the results from writeToTab in for plotting and analysis 
1080       
1081      Header : 
1082      ReadID    MatchedReference     Substitutions     NumberOfMismatches     ReadQuality     MappingQuality 
1083       
1084      @type  infile: string 
1085      @param infile: path to inputfile. 
1086      @type  arraysize: int 
1087      @param arraysize: rows in the infile 
1088   
1089      @rtype:   np.array(obj) 
1090      @return:  array for unique classified reads. 
1091      """ 
1092   
1093       
1094      fobj = open(infile, "r") 
1095      dataArray = np.arange(arraysize, dtype=object) 
1096      for i, line in enumerate(fobj):      
1097              if line.startswith("#"): 
1098                  continue 
1099              else: 
1100                  zeile = line.split("\t")  
1101                  id = zeile[0] 
1102                  mr = int(zeile[1]) 
1103                  subs = int(zeile[2]) 
1104                  nm = int(zeile[3]) 
1105                  rq = float(zeile[4]) 
1106                  mq = int(zeile[5]) 
1107                  gaps = int(zeile[8]) 
1108                  mism = int(zeile[9]) 
1109                   
1110                   
1111                  dataArray[i - 1] = CustomRead(id, mr, nm, subs, rq, mq, 0, 0, gaps, mism) 
1112                   
1113      return (dataArray) 
 1114   
1115   
1116   
1165   
1167      return(path.replace("~",os.path.expanduser("~"))) 
 1168       
1170      """ 
1171      Since we append in the program, we need to make sure no old files remain...  
1172   
1173      @type  controlDic: dictionary  
1174      @param controlDic: dictionary containing future filenames. 
1175      @type  mapper: string  
1176      @param mapper: current identifier mapper for which the results are written 
1177      @type  outpath: string 
1178      @param outpath: existing path, where the outfiles will be written. 
1179      """ 
1180      for artificial in controlDic["mapper"][mapper][1]: 
1181               filename = artificial.split("/")[-1] 
1182               outfile = mapper + "_" + artificial.split("/")[-1][:-4] + ".esam" 
1183                
1184               fobj = open(outpath + outfile, "w") 
1185               fobj.write("#ReadID\tMatchedReference\tSubstitutions\tNumberOfMismatches\tReadQuality\tMappingQuality\tStart\tEnd\tGaps\tMisM\r\n") 
1186               fobj.close() 
 1187