#!/usr/bin/env python # Jacob Joseph # 29 Sept 2008 # Facilitate integration of arbitrary FASTA files to the DurandLab2 # database. import time, getopt, sys from Bio import SeqIO from insert_generic import insert_generic from JJutil import crc64 class insert_fasta(insert_generic): def __init__(self, source_name, source_version, fname, source_ver_id=None, custom_name_parser=None, gb_tax_id = None): insert_generic.__init__(self) self.source_name = source_name self.source_info[source_name] = { 'version': source_version, 'date': time.strftime('%Y-%m-%d %H:%M:%S') } # insert (or lookup) a prot seq source version if source_ver_id is not None: self.source_info[self.source_name]['source_ver_id']= source_ver_id else: self.lookup_prot_seq_source_ver( self.source_name, self.source_info[self.source_name]['version'], self.source_info[self.source_name]['date']) m = "*** Inserting sequences with source_ver_id='%d' ***" print m % self.source_info[self.source_name]['source_ver_id'] if custom_name_parser == 'ppod_species': self.parse_name_descr = self.parse_name_descr_ppod elif custom_name_parser == 'ppod_species_uniprot': self.parse_name_descr = self.parse_name_descr_ppod_withuniprot elif custom_name_parser is not None: assert False, "Unknown custom_name_parser: %s" % custom_name_parser else: self.parse_name_descr = self.parse_name_descr_basic self.parse_fasta( fname, gb_tax_id) def parse_name_descr_ppod(self, record): # e.g., >Arabidopsis thaliana|TAIR:gene:1005027766 l = record.description larr = l.split('|') name = larr[1] descr = larr[0] return (name, descr) def parse_name_descr_ppod_withuniprot(self, record): # e.g., # >AQUAE|ENTREZ:1193999|NCBI:NP_214325 # >BACTN|ENTREZ:1073534|NCBI:NP_809639 # >BRAJA|ENTREZ:1053640|NCBI:NP_768268 # >CAEBR|ENTREZ:5633788|NCBI:XP_001671128 # >CHICK|ENTREZ:422364|NCBI:XP_420335 # >ENTHI|ENTREZ:3404274|NCBI:XP_649976 # >LEIMA|ENTREZ:5648894|NCBI:XP_001687509 # >NEUCR|ENTREZ:3873340|NCBI:XP_957188 # >PLAYO|ENTREZ:3806726|NCBI:XP_729427 # >RAT|RGD:1311090|NCBI:XP_001057505 # >STRPU|ENTREZ:755434|NCBI:XP_001189843 # >TETTH|ENTREZ:4505939|NCBI:XP_001015290 # >ARATH|TAIR:locus:2078426|NCBI:NP_191191.2 # >BACSU|ENTREZ:939141|NCBI:NP_390014 # >DEIRA|ENTREZ:1798069|NCBI:NP_296367 # >GLOVI|ENTREZ:2600530|NCBI:NP_925075 # >LEPIN|ENTREZ:1152095|NCBI:NP_712933 # >METAC|ENTREZ:1473979|NCBI:NP_617010 # >MOUSE|MGI:MGI:3649014|NCBI:NP_001030076 # >ORYSJ|ENTREZ:3131451|NCBI:NP_039365 # >STRCO|ENTREZ:1099793|NCBI:NP_628523 # >SULSO|ENTREZ:1455383|NCBI:NP_341783 # >THEMA|ENTREZ:897476|NCBI:NP_228761 # >YEAST|SGD:S000028557|NCBI:NP_878108 # >GEOSL|ENTREZ:2808007|NCBI:YP_016331 # >DICDI|dictyBase:DDB_G0273975|NCBI:EAL70421.1 # >BOVIN|ENSEMBL:ENSBTAG00000036184|ENSEMBL:ENSBTAP00000047346 # >PANTR|ENSEMBL:ENSPTRG00000031326|ENSEMBL:ENSPTRP00000053827 # >CIOIN|ENSEMBL:ENSCING00000016250|ENSEMBL:ENSCINP00000028259 # >XENTR|ENSEMBL:ENSXETG00000027244|ENSEMBL:ENSXETP00000056950 # >FUGRU|ENSEMBL:ENSTRUG00000004123|ENSEMBL:ENSTRUP00000009764 # >CANFA|ENSEMBL:ENSCAFG00000009450|ENSEMBL:ENSCAFP00000013904 # >MACMU|ENSEMBL:ENSMMUG00000029023|ENSEMBL:ENSMMUP00000032206 # >ANOGA|ENSEMBL:AGAP000001|ENSEMBL:AGAP000001-PA # >MONDO|ENSEMBL:ENSMODG00000025437|ENSEMBL:ENSMODP00000008116 # >ORNAN|ENSEMBL:ENSOANG00000008004|ENSEMBL:ENSOANP00000012730 # >DANRE|ENSEMBL:ENSDARG00000041705|ENSEMBL:ENSDARP00000088909 # >ASHGO|ENTREZ:4619397|UniProtKB:Q75CP6 # >EMENI|ENTREZ:2876911|UniProtKB:P05147 # >CHLTA|ENTREZ:3687317|UniProtKB:Q3KM97 # >CHLAA|ENTREZ:5825221|UniProtKB:A9WKH6 # >ECOLI|ECOLI:EG10986-MONOMER|UniProtKB:P05100 # >DROME|FB:FBgn0010339|UniProtKB:P32234 # >CHLRE|ENTREZ:5728351|UniProtKB:P52908 # >HUMAN|ENSEMBL:ENSG00000166913|UniProtKB:P31946 # >SCHPO|GeneDB_Spombe:SPAC922.03|UniProtKB:Q9URX3 # >PSEA7|ENTREZ:5357561|UniProtKB:A6V1E9 # >CAEEL|WB:WBGene00003920|UniProtKB:P41932 l = record.description larr = l.split('|') if larr[2].startswith('UniProtKB:'): name = larr[2][10:] elif larr[1].startswith('MGI'): name = larr[1][4:] else: name = larr[1] descr = l return (name, descr) def parse_name_descr_basic(self, record): return (record.name, record.description) def parse_fasta(self, fname, gb_tax_id=None): source_ver_id = self.source_info[self.source_name]['source_ver_id'] fd = open( fname, 'rU') for record in SeqIO.parse( fd, 'fasta'): # Some fasta files come in nonstandard name formats. Parse # them with custom functions (name, descr) = self.parse_name_descr( record) #print name, descr self.insert_sequence( source_ver_id, name, descr, record.seq.tostring(), gb_tax_id) def compare_prot_seq( self, seq_id, primary_acc, description, sequence, seq_length, seq_crc, seq_mw, gb_tax_id): if not seq_id == self.compare_fa_descr( seq_id, description): m = "Sequence '%s' already exists with seq_id '%d'." % ( primary_acc, seq_id) m += "\nHowever, the FASTA description differs: '%s' != '%s'" % ( description, self.fetch_fa_descr( seq_id)) #assert False, m print "IGNORING ENTRY:", m return seq_id if not seq_id == self.compare_prot_seq_str( seq_id, sequence, seq_length, seq_crc, seq_mw): m = "Sequence '%s' already exists with seq_id '%d'." % ( primary_acc, seq_id) m += "\nHowever, the sequence tuple differs: '%s' != '%s'" % ( (sequence, seq_len, seq_crc, seq_mw), self.fetch_prot_seq_str( seq_id)) assert False, m # return None if not seq_id == self.compare_gb_tax_id( seq_id, gb_tax_id): m = "Sequence '%s' already exists with seq_id '%d'." % ( primary_acc, seq_id) m += "\nHowever, the taxonomy id differs: '%d' != '%d'" % ( self.fetch_tax_id(seq_id), gb_tax_id) assert False, m return seq_id def insert_sequence(self, source_ver_id, primary_acc, description, sequence, gb_tax_id): seq_length = len(sequence) seq_crc = crc64.CRC64digest(sequence) seq_mw = 0 # dummy molecular weight--a required field # insert only if this prot_seq_version entry does not already # exist. Allows one to resume runs in the event of database # issues, program crashes q = """SELECT seq_id FROM prot_seq_version WHERE source_ver_id = %(source_ver_id)s AND primary_acc = %(primary_acc)s""" seq_id = self.dbw.fetchsingle(q, locals()) if seq_id is not None and seq_id == self.compare_prot_seq( seq_id, primary_acc, description, sequence, seq_length, seq_crc, seq_mw, gb_tax_id): # already inserted pass else: # create a new seq_id. Dummy molecular weight seq_id = self.insert_prot_seq( sequence, seq_length, seq_crc, seq_mw) self.insert_fa_descr(seq_id, description) if gb_tax_id is not None: self.insert_gb_tax_id(seq_id, gb_tax_id) self.insert_prot_seq_version_simple( seq_id, source_ver_id, primary_acc) self.dbw.commit() return def insert_fa_descr(self, seq_id, description): i = """INSERT INTO fa_descr (seq_id, descr) VALUES (%(seq_id)s, %(description)s)""" self.dbw.execute(i, locals()) return def fetch_fa_descr(self, seq_id): q = """SELECT descr FROM fa_descr WHERE seq_id = %(seq_id)s""" fa_desc = self.dbw.fetchsingle(q, locals()) return fa_desc def compare_fa_descr(self, seq_id, description): fa_desc = self.fetch_fa_descr( seq_id) if fa_desc == description: return seq_id else: return None def insert_gb_tax_id(self, seq_id, tax_id): i = """INSERT INTO prot_seq_taxonomy (seq_id, tax_id) VALUES (%(seq_id)s, %(tax_id)s)""" self.dbw.execute(i, locals()) return def fetch_gb_tax_id(self, seq_id): q = """SELECT tax_id FROM prot_seq_taxonomy WHERE seq_id = %(seq_id)s""" return self.dbw.fetchsingle(q, locals()) def compare_gb_tax_id(self, seq_id, tax_id): gb_tax_id = self.fetch_gb_tax_id( seq_id) if gb_tax_id == tax_id: return seq_id else: return None def usage(): s = """DurandLab2 FASTA file inserter Usage: 'insert_fasta.py -f [options]' Options: -f, --fasta (Required) Path and filename of the FASTA file to insert. -i, --source_ver_id (Optional) -n, --source_name (Required if no '-i') -v, --source_version (Required if no '-i'] -g, --gb_tax_id (Optional) Label all sequences as having a specific taxonomy identifier. This is helpful when the input is split into separate files by species. -c, --custom_name_parser (Optional) Select a custom parser for the FASTA name and description fields. E.g., 'ppod_species' or 'ppod_species_uniprot'. If an existing source_ver_id is not specified, 'source_name' and 'source_version' are required. """ return s def main(): try: opts, args = getopt.getopt( sys.argv[1:], 'n:v:f:hi:c:g:', ['fasta=', 'source_ver_id=', 'source_name=', 'source_version=', 'help', 'custom_name_parser=', 'gb_tax_id=']) except getopt.GetoptError: print usage() raise help = False source_name = None source_version = None fasta_file = None source_ver_id = None custom_name_parser = None gb_tax_id = None for o, a in opts: if o in ('-h', '--help'): help = True if o in ('-n','--source_name'): source_name = a if o in ('-v','--source_version'): source_version = a if o in ('-f','--fasta'): fasta_file = a if o in ('-i','--source_ver_id'): source_ver_id = int(a) if o in ('-c','--custom_name_parser'): custom_name_parser = a if o in ('-g','--gb_tax_id'): gb_tax_id = int(a) if help or fasta_file is None: print usage() sys.exit() if source_ver_id is not None: pass elif (source_name is not None and source_version is not None): pass else: print "Error: Unable to execute with given parameters:" print "fasta_file:", fasta_file print "source_name:", source_name print "source_version:", source_version print "sourve_ver_id:", source_ver_id print "custom_name_parser:", custom_name_parser print "gb_tax_id:", gb_tax_id print usage() sys.exit() print "Inserting file:", fasta_file print "As source '%s', Version '%s'" % (source_name, source_version) in_fasta = insert_fasta( source_name, source_version, fasta_file, source_ver_id=source_ver_id, custom_name_parser = custom_name_parser, gb_tax_id = gb_tax_id) return in_fasta if __name__ == "__main__": in_fasta = main()