#!/usr/bin/env python

# Jacob Joseph
# 8 Apr 2011

# Verify Panther data insertion

import os
from insert_fasta import insert_fasta
from JJutil import crc64
from Bio import SeqIO

class verify_fasta( insert_fasta):

    def __init__(self,
                 source_ver_id=None, custom_name_parser=None,
                 gb_tax_id = None, fname = None):

        # initialize the database, but actually skip the insert_fasta
        # init since it also inserts
        super( insert_fasta, self).__init__()

        if custom_name_parser == 'ppod_species':
            self.parse_name_descr = self.parse_name_descr_ppod
        elif custom_name_parser == 'ppod_species_uniprot':
            self.parse_name_descr = self.parse_name_descr_ppod_withuniprot
        elif custom_name_parser is not None:
            assert False, "Unknown custom_name_parser: %s" % custom_name_parser
        else:
            self.parse_name_descr = self.parse_name_descr_basic


        self.source_ver_id = source_ver_id
        self.gb_tax_id = gb_tax_id
        self.fname = fname


    def verify(self):
        fd = open( self.fname, 'rU')

        
        name_dict = {}
        i = 0
        for record in SeqIO.parse(fd, 'fasta'):
            (name, descr) = self.parse_name_descr( record)
            #print name, descr

            if name in name_dict:
                print "Repeated entry"
                print "Current", name, descr
                print "Prior", name_dict[name]
            else:
                name_dict[name] = (name, descr)

            self.compare_sequence( name, descr, record.seq.tostring())

            #print "------------------"

            i += 1
        print "Records:", i

    def fetch_seqs_with_string(self, seq_str_id):

        q = """SELECT seq_id
        FROM prot_seq
        JOIN prot_seq_version USING (seq_id)
        WHERE seq_str_id = %(seq_str_id)s
        AND source_ver_id = %(source_ver_id)s"""

        source_ver_id = self.source_ver_id
        
        return self.dbw.fetchcolumn(q, locals())


    def compare_sequence( self, primary_acc, description, sequence):
        seq_length = len(sequence)
        seq_crc = crc64.CRC64digest(sequence)
        seq_mw = 0 # dummy molecular weight--a required field

        source_ver_id = self.source_ver_id
        
        q = """SELECT seq_id FROM prot_seq_version
        WHERE source_ver_id = %(source_ver_id)s
        AND primary_acc = %(primary_acc)s"""

        seq_id = self.dbw.fetchsingle(q, locals())

        if seq_id is None:
            print "Missing seq_id for", primary_acc, description

            seq_str_id = self.lookup_prot_seq_str( sequence, seq_length,
                                                   seq_crc, seq_mw,
                                                   dont_insert=True)
            if seq_str_id is None:
                print "No seq_str_id for sequence"
            else:
                print "Found seq_str_id" , seq_str_id
                identical_seq_ids = self.fetch_seqs_with_string(seq_str_id)
                print "Corresponding seq_ids: ", identical_seq_ids

            return

        db_tax_id = self.fetch_gb_tax_id( seq_id)
        if db_tax_id != self.gb_tax_id:
            print "Incorrect tax_id for ", primary_acc, description, db_tax_id

        

        #self.compare_prot_seq( seq_id, primary_acc, description,
        #                       sequence, seq_length, seq_crc, seq_mw,
        #                       self.gb_tax_id)

if __name__ == "__main__":

    source_ver_id = 36
    #fname = os.path.expandvars("$HOME/tmp/fasta_gp/human.fasta")
    #gb_tax_id = 9606
    #fname = os.path.expandvars("$HOME/tmp/fasta_gp/mouse.fasta")
    #gb_tax_id = 10090
    fname = os.path.expandvars("$HOME/tmp/fasta_gp/sea_urchin.fasta")
    gb_tax_id = 7668


    vf = verify_fasta( source_ver_id = source_ver_id,
                       gb_tax_id = gb_tax_id,
                       fname = fname,
                       custom_name_parser = "ppod_species_uniprot"
                       )


