#!/usr/bin/env python # Jacob Joseph # 8 Apr 2011 # Verify Panther data insertion import os from insert_fasta import insert_fasta from JJutil import crc64 from Bio import SeqIO class verify_fasta( insert_fasta): def __init__(self, source_ver_id=None, custom_name_parser=None, gb_tax_id = None, fname = None): # initialize the database, but actually skip the insert_fasta # init since it also inserts super( insert_fasta, self).__init__() if custom_name_parser == 'ppod_species': self.parse_name_descr = self.parse_name_descr_ppod elif custom_name_parser == 'ppod_species_uniprot': self.parse_name_descr = self.parse_name_descr_ppod_withuniprot elif custom_name_parser is not None: assert False, "Unknown custom_name_parser: %s" % custom_name_parser else: self.parse_name_descr = self.parse_name_descr_basic self.source_ver_id = source_ver_id self.gb_tax_id = gb_tax_id self.fname = fname def verify(self): fd = open( self.fname, 'rU') name_dict = {} i = 0 for record in SeqIO.parse(fd, 'fasta'): (name, descr) = self.parse_name_descr( record) #print name, descr if name in name_dict: print "Repeated entry" print "Current", name, descr print "Prior", name_dict[name] else: name_dict[name] = (name, descr) self.compare_sequence( name, descr, record.seq.tostring()) #print "------------------" i += 1 print "Records:", i def fetch_seqs_with_string(self, seq_str_id): q = """SELECT seq_id FROM prot_seq JOIN prot_seq_version USING (seq_id) WHERE seq_str_id = %(seq_str_id)s AND source_ver_id = %(source_ver_id)s""" source_ver_id = self.source_ver_id return self.dbw.fetchcolumn(q, locals()) def compare_sequence( self, primary_acc, description, sequence): seq_length = len(sequence) seq_crc = crc64.CRC64digest(sequence) seq_mw = 0 # dummy molecular weight--a required field source_ver_id = self.source_ver_id q = """SELECT seq_id FROM prot_seq_version WHERE source_ver_id = %(source_ver_id)s AND primary_acc = %(primary_acc)s""" seq_id = self.dbw.fetchsingle(q, locals()) if seq_id is None: print "Missing seq_id for", primary_acc, description seq_str_id = self.lookup_prot_seq_str( sequence, seq_length, seq_crc, seq_mw, dont_insert=True) if seq_str_id is None: print "No seq_str_id for sequence" else: print "Found seq_str_id" , seq_str_id identical_seq_ids = self.fetch_seqs_with_string(seq_str_id) print "Corresponding seq_ids: ", identical_seq_ids return db_tax_id = self.fetch_gb_tax_id( seq_id) if db_tax_id != self.gb_tax_id: print "Incorrect tax_id for ", primary_acc, description, db_tax_id #self.compare_prot_seq( seq_id, primary_acc, description, # sequence, seq_length, seq_crc, seq_mw, # self.gb_tax_id) if __name__ == "__main__": source_ver_id = 36 #fname = os.path.expandvars("$HOME/tmp/fasta_gp/human.fasta") #gb_tax_id = 9606 #fname = os.path.expandvars("$HOME/tmp/fasta_gp/mouse.fasta") #gb_tax_id = 10090 fname = os.path.expandvars("$HOME/tmp/fasta_gp/sea_urchin.fasta") gb_tax_id = 7668 vf = verify_fasta( source_ver_id = source_ver_id, gb_tax_id = gb_tax_id, fname = fname, custom_name_parser = "ppod_species_uniprot" )