#!/usr/bin/env python

# Jacob Joseph
# 20 October 2009

# A quick plot to compare different scoring runs by a
# scatterplot. Fetch one query at a time from each axis, and run
# though them to align targets.  Draw a scatterplot.  Keep track of
# those missing from either. Draw a 2 histograms of missing scores.

# Actually, instead of scatterplots, use heatmaps with high
# granularity to avoid the memory of very many points, and better
# visualize the count a particular point.

import matplotlib
from matplotlib import pyplot
from DurandDB import blastq
import numpy, cProfile, time
import scipy.weave as weave

class run:
    def __init__(self, stype, run_id, set_id_filter, symmetric, descr=None):
        self.stype = stype
        self.run_id = run_id
        self.nc_id = run_id if stype=='nc_score' else None
        self.br_id = run_id if stype in ('bit_score','e_value') else None
        self.set_id_filter = set_id_filter
        self.symmetric= symmetric
        self.descr = descr

    def __repr__(self):
        s = "%s_%s_%s_%s_%s" % (self.stype, self.nc_id, self.br_id, self.set_id_filter,
                                self.symmetric)
        return s
    
    def __str__(self):
        s = "%s,%s,%s,%s,%s\n%s" % (self.stype, self.nc_id, self.br_id, self.set_id_filter,
                                    self.symmetric,
                                    self.descr if self.descr is not None else "")
        return s
        
        
class scatter:
    
    def __init__(self, run_0, run_1, hres=None, vres=None):
        self.runs = [run_0, run_1]
        self.bq = blastq.blastq(cacheq=False)

        self.hres = 1000 if hres is None else hres
        self.vres = 1000 if vres is None else vres

        self.unique = None

        self.fetch_scatter_weave()

    def fetch_scatter(self, hres=1000, vres=1000):
        r0 = self.runs[0]
        r1 = self.runs[1]

        # fetch the sequence sets.  We'll then take the union,
        # and iterate through
        seq_set = set(self.bq.fetch_seq_set(br_id = r0.br_id,
                                            nc_id = r0.nc_id,
                                            set_id = r0.set_id_filter))
        seq_set.update(self.bq.fetch_seq_set(br_id = r1.br_id,
                                             nc_id = r1.nc_id,
                                             set_id = r1.set_id_filter))

        map_both = numpy.zeros([hres, vres], dtype=numpy.int32)
        map_0 = numpy.zeros(hres, dtype=numpy.int32)
        map_1 = numpy.zeros(hres, dtype=numpy.int32)

        # FIXME: transformation is specific to NC.
        for i, query_seq_id in enumerate(seq_set):
            print i, query_seq_id
            if i > 50: return (1,2,3)
            hitdict_0 = self.bq.fetch_hits_dictarray(
                stype=r0.stype, br_id = r0.br_id, nc_id = r0.nc_id,
                symmetric = r0.symmetric, set_id_filter = r0.set_id_filter,
                query_seq_id = query_seq_id)
            if query_seq_id in hitdict_0:
                h0, s0 = hitdict_0[query_seq_id]
            else:
                h0 = []
                s0 = None

            hitdict_1 = self.bq.fetch_hits_dictarray(
                stype=r1.stype, br_id = r1.br_id, nc_id = r1.nc_id,
                symmetric = r1.symmetric, set_id_filter = r1.set_id_filter,
                query_seq_id = query_seq_id)
            if query_seq_id in hitdict_1:
                h1, s1 = hitdict_1[query_seq_id]
            else:
                h1 = []
                s1 = None
            
            i0 = 0
            i1 = 0
            len0 = len(h0)
            len1 = len(h1)
            # Hits are in sorted order
            while i0 < len0 and i1 < len1:
                if h0[i0] < h1[i1]:
                    map_0[ int(s0[i0] * hres)] += 1
                    i0 += 1
                    continue
                elif h0[i0] > h1[i1]:
                    map_1[ int(s1[i1] * vres)] += 1
                    i1 += 1
                    continue

                #print h0[i0], h1[i1], s0[i0], s1[i1]
                #FIXME: not robust, specific to NC
                map_both[ int(s0[i0] * hres), int(s1[i1] * vres)] += 1
                i0 += 1
                i1 += 1

            #print hitdict_0
            #print hitdict_1
            #print query_seq_id
            #return

        return map_0, map_1, map_both

    def fetch_scatter_weave(self, hres=1000, vres=1000):
        r0 = self.runs[0]
        r1 = self.runs[1]

        # fetch the sequence sets.  We'll then take the union,
        # and iterate through
        seq_set = set(self.bq.fetch_seq_set(br_id = r0.br_id,
                                            nc_id = r0.nc_id,
                                            set_id = r0.set_id_filter))
        seq_set.update(self.bq.fetch_seq_set(br_id = r1.br_id,
                                             nc_id = r1.nc_id,
                                             set_id = r1.set_id_filter))

        map_both = numpy.zeros([hres, vres], dtype=numpy.int32)
        map_0 = numpy.zeros(hres, dtype=numpy.int32)
        map_1 = numpy.zeros(hres, dtype=numpy.int32)

        self.map_both = map_both
        self.map_0 = map_0
        self.map_1 = map_1

        # FIXME: transformation is specific to NC.
        for i, query_seq_id in enumerate(seq_set):
            #print i, query_seq_id

            hitdict_0 = self.bq.fetch_hits_dictarray(
                stype=r0.stype, br_id = r0.br_id, nc_id = r0.nc_id,
                symmetric = r0.symmetric, set_id_filter = r0.set_id_filter,
                query_seq_id = query_seq_id)
            if query_seq_id in hitdict_0:
                hits_0, scores_0 = hitdict_0[query_seq_id]
            else:
                hits_0 = numpy.array([], dtype=numpy.int32)
                scores_0 = numpy.array([], dtype=numpy.int32)

            hitdict_1 = self.bq.fetch_hits_dictarray(
                stype=r1.stype, br_id = r1.br_id, nc_id = r1.nc_id,
                symmetric = r1.symmetric, set_id_filter = r1.set_id_filter,
                query_seq_id = query_seq_id)
            if query_seq_id in hitdict_1:
                hits_1, scores_1 = hitdict_1[query_seq_id]
            else:
                hits_1 = numpy.array([], dtype=numpy.int32)
                scores_1 = numpy.array([], dtype=numpy.int32)

            code = """
            int ind_0 = 0, ind_1 = 0;

            while( (ind_0 < Nhits_0[0]) && (ind_1 < Nhits_1[0]))
            {
                int h0 = hits_0[ind_0];
                int h1 = hits_1[ind_1];

                if (h0 < h1) {
                  map_0[ (int)(scores_0[ind_0] * hres)] += 1;
                  ind_0++;
                  continue;
                }
                else if (h0 > h1) {
                  map_1[ (int)(scores_1[ind_1] * vres)] += 1;
                  ind_1++;
                  continue;
                }

                /* if (scores_0[ind_0] - scores_1[ind_1] < -0.1) {
                         printf("%d, %d, %f, %f, %f\\n", query_seq_id, h1,
                         scores_0[ind_0], scores_1[ind_1],
                         scores_0[ind_0] - scores_1[ind_1]);
                } */
                MAP_BOTH2( (int)(scores_0[ind_0] * hres),
                           (int)(scores_1[ind_1] * vres)) += 1;
                ind_0 += 1;
                ind_1 += 1;
            }

            return_val = 0;
            """
            
            ret = weave.inline( code, ['hits_0', 'scores_0',
                                       'hits_1', 'scores_1',
                                       'map_0', 'map_1', 'map_both',
                                       'vres', 'hres', 'query_seq_id'])
        return

    def plotmap(self, title=None, fprefix=None):
        
        map_both1 = self.map_both + 1
        # pyplot.pcolor(map_both1, cmap=pyplot.cm.hot,
        pyplot.imshow(map_both1, cmap=pyplot.cm.hot,
                      norm = matplotlib.colors.LogNorm(),
                      origin='lower')
        pyplot.plot([0,1000],[0,1000], 'b--')
        pyplot.axis('image')
        pyplot.xlabel( str(self.runs[1]))
        pyplot.ylabel( str(self.runs[0]))

        pyplot.subplots_adjust(bottom=0.1, left=0.12, top=0.95, right=0.92)

        pyplot.colorbar(fraction=0.05,pad=0.01)

        if title is not None:
            pyplot.suptitle(title)


        #l 0.12 b .1 r .9 t .9 w .2 h .2
        
        if fprefix is not None:
            fname = (time.strftime('%Y%m%d')
                     + '_'
                     + fprefix + repr(self.runs[0])
                     + '_vs_'
                     + repr(self.runs[1]))
            pyplot.savefig( 'figures/%s' % fname)
            pyplot.close()
        else:
            pyplot.show()
        return
        
if __name__ == "__main__":
    # (score_type, run_id, set_id_filter, symmetric)
    #s = scatter( run('nc_score', 746, 105, True),
    #              run('nc_score', 750, 105, True))

    #s = scatter( run('nc_score', 777, None, False),
    #             run('nc_score', 779, None, True))

    #r_777 = run('nc_score', 777, None, False, 'Not Symmetric, Composition=2, 12 species')
    #r_777hm = run('nc_score', 777, 110, False, 'Not Symmetric, Composition=2, families')
    #r_779 = run('nc_score', 779, None, True, 'Symmetric, Composition=2, 12 species')
    #r_779hm = run('nc_score', 779, 110, True, 'Symmetric, Composition=2, families')
    #r_780 = run('nc_score', 780, None, True, 'Not Symmetric, Composition=f, 12 species')
    #r_780hm = run('nc_score', 780, 110, True, 'Not Symmetric, Composition=f, families')
    #r_782hmblast = run('nc_score', 782, 109, 'Symmetric, Composition=f, NC from blast of human, mouse only')
    #r_789 = run('nc_score', 789, None, True, "48 Genomes, no log")
    r_808 = run('nc_score', 808, None, True, "48 Genomes. Correct calculation")

    r_822 = run('nc_score', 822, None, True, "48 Genomes. Using blast_hit_limit >=50")
    r_823 = run('nc_score', 823, None, True, "48 Genomes. Using blast_hit_limit >=100")
    r_824 = run('nc_score', 824, None, True, "48 Genomes. Using blast_hit_limit >=500")

    #cProfile.runctx('map_0, map_1, map_both = s.fetch_scatter_weave()', globals(), locals())
    #cProfile.runctx('map_0, map_1, map_both = s.fetch_scatter()', globals(), locals())
    #map_0, map_1, map_both = s.fetch_scatter_weave()

    pyplot.ioff()
    pyplot.rc('figure', figsize=(9,8))
    
    #s = scatter( r_777, r_779)
    #s = scatter( r_777, r_780)
    #s = scatter( r_779, r_780)
    #s = scatter( r_777hm, r_779hm)
    #s = scatter( r_777hm, r_780hm)
    #s = scatter( r_779hm, r_780hm)
    #s = scatter( r_780, r_782hmblast)
    #s = scatter( r_789, r_808)
    #s = scatter( r_816, r_808)
    #s = scatter( r_822, r_808)
    #s = scatter( r_823, r_808)
    s = scatter( r_824, r_808)

    s.plotmap( fprefix='ncncscatter')


def foo():
    logcnt = [ math.log10(c) for c in cnt]

    pyplot.scatter( bit, e, c=logcnt, marker='o')
    pyplot.axis('tight')
    pyplot.grid(color='gray', alpha=0.5)
    pyplot.colorbar()
    pyplot.xlabel('Bit-score')
    pyplot.ylabel('-log10(e_value/dbsize + 1E-200)')
    pyplot.subplots_adjust(left=0.1, bottom=0.05, right=0.98, top=0.95)
    pyplot.title("E-value vs Bit score (br_id: 91)")


    #pyplot.pcolor(map_both, cmap=pyplot.cm.hot)

    # color based upon the log of the number
    # Take a log, then normalize to a 0-1 range?
    #logmap_both = numpy.log10(numpy.log10(map_both))
    # remove -inf, replacing

    #logmap_both = 

    fd = open('map_out.pickle','w')
    cPickle.dump((map_0, map_1, map_both), fd)
    fd.close()

    fd = open('map_out.pickle','r')
    (map_0, map_1, map_both) = cPickle.load(fd)
    f.close()

    map_both1 = map_both + 1
    #pyplot.pcolor(map_both1, cmap=pyplot.cm.hot,
    pyplot.imshow(map_both1, cmap=pyplot.cm.hot,
                  norm = matplotlib.colors.LogNorm(),
                  origin='lower')
    #pyplot.xlabel("xlabel")