#!/usr/bin/env python

# Jacob Joseph
# 2008-01-23
# Configure a sequence set, and run blast upon it.

import getopt, sys
from DurandDB import blast_run, seq_set, blast_lock


def initialize_blast(sequence_set, query_seq_set = None):
    """Initialize a new blast run for a sequence set"""
    set_id = sequence_set.set_id

    if query_seq_set is None:
        query_set_id = set_id
        chunk_size = sequence_set.chunk_size
        max_chunk = sequence_set.max_chunk
        
    else:
        query_set_id = query_seq_set.set_id
        chunk_size = query_seq_set.chunk_size
        max_chunk = query_seq_set.max_chunk

    #max_chunk = 39
    
    # Blast parameters
    # Must be executable python code
    #param_str = "{'descriptions':0, 'alignments':num_sequences, 'expectation':num_sequences*10, 'search_length':num_residues**2, 'gap_open':11, 'gap_extend':1, 'nprocessors':2, 'oldengine':'F', 'composition_stats':2}"
    param_str = "{'descriptions':0, 'alignments':num_sequences, 'expectation':num_sequences*10, 'search_length':num_residues**2, 'gap_open':11, 'gap_extend':1, 'nprocessors':2, 'oldengine':'F'}"

    comment = 'blast-2.2.16'
    blastall_path = 'blast-2.2.16/bin/blastall'
    
    blastrun = blast_run.blast_run()

    br_id = blastrun.new_run(set_id, param_str, comment, blastall_path,
                             query_set_id = query_set_id)
    #br_id = 100
    blastlock = blast_lock.blast_lock( br_id = br_id, set_id = set_id,
                                       chunk_size = chunk_size,
                                       max_chunk = max_chunk,
                                       query_set_id = query_set_id)
    blastlock.init_chunks()

def custom_seqset():
    """Initialize a new sequence set for a blast run."""
    sequenceset = None
    
    #sequenceset = seq_set(  name='all_ygob_yeast',
    #			desc='4914, 4932, 4934, 5478, 28985, 33169, 340169; sp 50.9, tr 33.9',
    #			tax_ids=[4914,4932,4934,5478,28985,33169,340169],
    #			db_versions={'swissprot': '50.9', 'trembl': '33.9'}, set_id=77,
    #			chunk_size=1000)

    #sequenceset = seq_set(  name='s_cerevisiae',
    #			desc='4932; sp 50.9, tr 33.9',
    #			tax_ids=[4932],
    #			db_versions={'swissprot': '50.9', 'trembl': '33.9'},
    #			chunk_size=2000)

    #sequenceset = seq_set(  name='human',
    #			 desc='9606; swissprot 50.9, no fragments',
    #			 tax_ids=[9606],
    #			 db_versions={'swissprot': '50.9'},
    #			 nofragments=True,
    #			 chunk_size=2000)

    #sequenceset = seq_set(  name='d. melanogaster',
    #			desc='7227; swissprot 50.9, trembl 33.9, no fragments',
    #			tax_ids=[7227],
    #			db_versions={'swissprot': '50.9', 'trembl': '33.9'},
    #			nofragments=True,
    #			chunk_size=2000)

    #sequenceset = seq_set.seq_set(  name='mouse',
    #                               desc='10090; sp 50.9, no fragments',
    #                               tax_ids=[10090],
    #                               db_versions={'swissprot': '50.9'},
    #                               nofragments=True,
    #                               chunk_size=1000)

    #sequenceset = seq_set.seq_set(  name='s_cerevisiae',
    #                               desc='4932; sp 54.6, no fragments',
    #                               tax_ids=[4932],
    #                               db_versions={'swissprot': '54.6'},
    #                               nofragments=True,
    #                               chunk_size=1000)

    #sequenceset = seq_set.seq_set(  name='four_yeast',
    #                               desc='4932; sp 54.6, no fragments',
    #                               tax_ids=[4932],
    #                               db_versions={'swissprot': '54.6'},
    #                               nofragments=True,
    #                               chunk_size=1000)

    # sequnce set populated manually:
    # INSERT INTO prot_seq_set (name, description)
    # VALUES ('s_cerevisiae', 'sgd 2008-03-22 13:54:14 verified orf');
    # INSERT INTO prot_seq_set_member SELECT 89 as set_id, seq_id
    # FROM sgd_feature WHERE feature_type='ORF' and feature_qualifier='Verified'; 
    #sequenceset = seq_set.seq_set(  name='s_cerevisiae',
    #                               desc='sgd 2008-03-22 13:54:14 verified orf',
    #                               chunk_size=500, set_id=89)

    #sequenceset = seq_set.seq_set(  name='s_cerevisiae_bayanus',
    #                               desc='4931, 4931; sp 54.6, no fragments',
    #                               tax_ids=[4931, 4932],
    #                               nofragments=True,
    #                               db_versions={'swissprot': '54.6'},
    #                               chunk_size=1000)

    # YGOB 2
    # A gossypii: AAL186W - AGR407C
    # C glabrata: CAGL0A00143g - CAGL0M14113g
    # K lactis: KLLA0A00110g - KLLA0F28127g
    # K polysporus: Kpol_1000.1 - Kpol_1070.37
    # K waltii: Kwal_0.3 - Kwal_60.24937
    # S bayanus: Sbay_0.1 - Sbay_96.41
    # S castelli: Scas_3.1 - Scas_657.21d1
    # S cerevisiae: YAL068C - YPR204W
    # S kluyveri: Sklu_2423.11 - Sklu_2423.10

    #INSERT INTO prot_seq_set (name, description)
    #VALUES ('ygob_Scer', 'YGOB, version 2, S cervisiae'),
    #('ygob_pre_post', 'YGOB, version 2, S cervisiae, K lactis'),
    #('ygob_post_post', 'YGOB, version 2, S cervisiae, C glabrata'),
    #('ygob_four', 'YGOB, version 2, S cervisiae, C glabrata, A gossypii, K lactis'),
    #('ygob_nine', 'YGOB, version 2, All nine species: S cerevisiae, S bayanus, C glabrata, S castellii, K Polysporus, A gossypii, K waltii, K lactis, S kluyveri');

    #INSERT INTO prot_seq_set_member
    #SELECT 91 as set_id, seq_id
    #FROM prot_seq_version
    #WHERE source_ver_id=21
    #AND primary_acc like 'Y%';

    #INSERT INTO prot_seq_set_member
    #SELECT 92 as set_id, seq_id
    #FROM prot_seq_version
    #WHERE source_ver_id=21
    #AND (primary_acc like 'Y%'
    #OR primary_acc like 'KLLA%');

    #INSERT INTO prot_seq_set_member
    #SELECT 93 as set_id, seq_id
    #FROM prot_seq_version
    #WHERE source_ver_id=21
    #AND (primary_acc like 'Y%'
    #OR primary_acc like 'CAGL%');

    #INSERT INTO prot_seq_set_member
    #SELECT 94 as set_id, seq_id
    #FROM prot_seq_version
    #WHERE source_ver_id=21
    #AND (primary_acc like 'Y%'
    #OR primary_acc like 'CAGL%'
    #OR primary_acc like 'A%'
    #OR primary_acc like 'KLLA%');
    
    #INSERT INTO prot_seq_set_member
    #SELECT 95 as set_id, seq_id
    #FROM prot_seq_version
    #WHERE source_ver_id=21;

    # PPOD large cluster of 29792 sequences
    #INSERT INTO prot_seq_set (name, description)
    #VALUES ('ppod_29792', 'PPOD large cluster');

    #INSERT INTO prot_seq_set_member
    #SELECT 96 as set_id, seq_id
    #FROM prot_seq_version
    #WHERE source_ver_id=25;

    return sequenceset

def usage():
    print """If initializing a new blast run, be sure to perform this only once.
Please modify the source for custom_setset() in this file to
define a new set of sequences.  If continuing a previously defined
blast run, no options need be specified.  Only one br_id will be
processed before exiting.

General Options
--------------------------------------
     -h, --help
          Print this help message.

     -i, --initialize
          Initialize a new blast run.  If this option is specified,
          use the options from (only) one of the 'Initialize' blocks,
          below

     -R, --RESET
          Reset all incomplete chunks.  DANGEROUS! Be sure no nodes
          are currently processing.

     --single_chunk
          Process only a single chunk, then exit.  This is useful for
          cluster jobs

Initialize: Processing existing sequence sets
--------------------------------------------------------
     -s, --set_id <int>                  (required)
          Process an existing sequence set

     -q, --query_set_id <int>            (optional)
          If specified, queries are made only for sequences in this
          set, against those in set_id.  If unspecified, all sequences
          in set_id will be queried.  

     -c, --chunk_size <500>
          Number of queries to process per blast process.  This should
          be much smaller than the sequence set size to allow
          paralellization.  Defaults to 500.

Initialize: Standard form sequence sets (Uniprot only)
--------------------------------------------------------
     -n, --set_name <string>             (required)
          A (short) name for the created sequence set.

     -d, --set_descr <string>            (required)
          A (long) description of the created sequence set.

     -v, <e.g. 'swissprot::50.9'>        (required)
          Database name, and version, separated by two colons.
          Multiple -v options may be provided.

     -g, <int,int,...>                   (required)
          A list of one or more Genbank Taxonomic identifiers,
          separated by commas.

     -c, --chunk_size <500>
          See above.

     -e, --exclude_fragments
          Exclude sequences annotated as fragments.

Initialize: Custom Sequence sets
--------------------------------------------------------
     --custom_initialize
          Executes custom_seqset from this source file.  It must be
          edited first: Please modify the source for custom_setset()
          in this file to define a new set of sequences.
          
"""
    
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hiRs:q:c:n:d:v:g:e",
                                   ["help", "initialize", "RESET",
                                    "set_id=", "chunk_size=",
                                    "set_name=", "set_descr=",
                                    "custom_initialize",
                                    "exclude_fragments",
                                    "single_chunk"])
    except getopt.GetoptError:
        # print help information and exit:
        usage()
        sys.exit(2)
    help=False
    reset=False
    init=False
    single_chunk=False
    set_id = None
    query_set_id = None
    chunk_size = 500

    set_name = None
    set_descr = None
    set_dbs = {}
    set_gb_tax_ids = None
    exclude_frag = False

    custom_initialize = False
    for o, a in opts:
        if o in ("-h", "--help"):
            help=True
        if o in ("-i", "--initialize"):
            init=True
        if o in ("-R", "--RESET"):
            reset=True
        if o in ("--single_chunk",):
            single_chunk=True
        if o in ("-s", "--set_id"):
            set_id = int(a)
        if o in ("-q", "--query_set_id"):
            query_set_id = int(a)
        if o in ("-c", "--chunk_size"):
            chunk_size = int(a)
        if o in ("-n", "--set_name"):
            set_name = a
        if o in ("-d", "--set_descr"):
            set_descr = a
        if o in ("-v",):
            arr = a.split("::")
            assert len(arr)==2, "Unable to parse database: '%s'" % a
            set_dbs[ arr[0] ] = arr[1]
        if o in ("-g",):
            arr = a.split(',')
            set_gb_tax_ids = map(int, arr)
        if o in ("-e", "--exclude_fragments"):
            exclude_frag = True
        if o in ("--custom_initialize",):
            custom_initialize = True

    if help:
        usage()
        sys.exit()

    elif reset:
        blastlock = blast_lock.blast_lock()
        blastlock.reset_all()
        sys.exit()

    elif init:
        query_seq_set = None
        
        if custom_initialize:
            sequence_set = custom_seqset()
            
        elif set_id is not None:
            # only build the blast database if query set is specified
            sequence_set = seq_set.seq_set(chunk_size=chunk_size,
                                           set_id=set_id,
                                           write_chunks = (query_set_id is None))
            if query_set_id is not None:
                # build the query fasta files.  Don't build a blast database
                query_seq_set = seq_set.seq_set(chunk_size=chunk_size,
                                                set_id = query_set_id,
                                                write_blastdb=False)
            
        elif (not None in (set_name, set_descr, set_gb_tax_ids) and
              len(set_dbs) > 0):
            sequence_set = seq_set.seq_set(
                name = set_name, desc = set_descr,
                tax_ids = set_gb_tax_ids, db_version = set_dbs,
                chunk_size = chunk_size, nofragments = exclude_frag)
        else:
            print "Error: Unable to parse initialization arguments"
            usage()
            sys.exit()
            
        initialize_blast(sequence_set, query_seq_set = query_seq_set)

    else:
        pass

    # Begin chunk processing
    blastlock = blast_lock.blast_lock(debug=False)
    blastrun = blast_run.blast_run(debug=False)

    next_chunk = blastlock.next_chunk_i()
    br_id = blastlock.br_id
    #next_chunk = 29
    #br_id=100
    while next_chunk is not None:
        blastrun.continue_run( next_chunk, br_id=br_id)

        # this performs a commit
        blastlock.complete_chunk_i( next_chunk)

        #blastlock.dbw.commit()
        #break

        if single_chunk:
            break
    
        next_chunk = blastlock.next_chunk_i()


if __name__ == "__main__":
    # set to 1 to enable profiling
    profile = 0
    
    if profile:
        import profile
        s = "main()"
        profile.run(s)
    else:
        main()