#!/usr/bin/env python from JJutil import mysqlutils import time def parse_hgnc_full(): """Parse fields in the full hgnc dump. Tab-delimited columns: hgnc_id, approved_symbol, approved_name, status, locus_type, previous_symbols, previous_names, aliases, chromosome, date_approved, date_modified, date_name_changed, accession_numbers, enzyme_id, entrez_gene_id, mgd_id, misc_ids, pubmed_ids, refseq_ids, gene_family_name, gdb_id_mapped, entrez_gene_id_mapped, omim_id_mapped, refseq_mapped, swissprot_id_mapped""" f = open(infile) for l in f: if l.startswith('HGNC ID'): continue # header entry = l.split('\t') if len(entry) != 25: print entry break # drop the trailing newline #if entry[-1] == "\n": entry = entry[:-1] # strip any whitespace for i in range(len(entry)): entry[i] = entry[i].strip() # set missing integer columns to NULL # hgnc_id, entrez_gene_id, entrez_gene_id_mapped, # omim_id_mapped for i in (0,14, 21, 22): if entry[i]=='': entry[i]='NULL' else: entry[i] = int(entry[i]) # format dates appropriately # fields 9, 10, 11 for i in range(9,12): if entry[i]=='': entry[i]='NULL' else: entry[i] = time.strftime( '%Y-%m-%d', time.strptime(entry[i], '%d/%m/%Y')) insert_hgnc_entry( entry) def insert_hgnc_entry( entry): q = """INSERT INTO %s VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')""" inputtup = (db_table,) + tuple(entry) inputtup = tuple(map(mysqlutils.quote, inputtup)) dbw.execute( q % inputtup) dbw.commit() return if __name__ == "__main__": dbhost = "goby.compbio.cs.cmu.edu" dbuser = "jmjoseph" dbpass = "pQ1rng8c" db = "DurandLab2" dbw = mysqlutils.dbwrap( dbhost, dbuser, dbpass, db) infile = "/net/goby/usr1/jmjoseph/tmp/hgnc_2007_05_21" db_table = "hgnc_20070521" parse_hgnc_full()