#!/usr/bin/env python

# Jacob Joseph
# 6 Oct 2011

# Consider a variety of schemes for differentiating and classifying
# clusters or domains based upon the mutual information of both

import os, cPickle
import numpy
import time
import matplotlib
from matplotlib import pyplot
from DurandDB import pfamq, familyq, blastq
from JJcluster import cluster_sql, describe
#from information import mutualinfo
from mi_plot import mi_plot

#class mi_factor(mutualinfo):
class mi_factor(mi_plot):

    # Per cluster:
    # -------------------
    # cluster size
    # cluster maximum
    # num distinct domains
    # min, mean, median, max domain MI
    # min, mean, median, max domain pSMI
    # max domain MI - pSMI of that domain
    # min, mean, median, max: domain MI - pSMI
    # cluster maximum - best MI
    # cluster maximum - best pSMI


    def __init__(self, clustering, annotation,
                 cluster_run_id = None, clustering_type=None):
        #mutualinfo.__init__(self, clustering, annotation)
        mi_plot.__init__(self, clustering, annotation)

        self.bq = blastq.blastq()
        
        self.factors = [
            ('cluster_size', self.clfn_cluster_size),    # cluster size
            #('cluster_max', self.clfn_max_cluster_mi),    # cluster maximum
            ('num_domains', self.clfn_num_domains),       # num distinct domains
            
            ('mean_num_domains', self.clfn_mean_num_domains),  # num distinct domains/sequence
            ('mean_seq_length', self.clfn_mean_sequence_length),

            # min, mean, median, max, and sum of domain MI
            #('min_domain_mi', self.clfn_min_domains_mi), 
            #('mean_domain_mi', self.clfn_mean_domains_mi),
            #('med_domain_mi', self.clfn_med_domains_mi),
            ('max_domain_mi', self.clfn_max_domains_mi),
            #('sum_domain_mi', self.clfn_sum_domains_mi),

            # min, mean, median, max, sum domain pSMI
            #('min_domain_psmi', self.clfn_min_domain_psmi), 
            #('mean_domain_psmi', self.clfn_mean_domain_psmi),
            ('med_domain_psmi', self.clfn_med_domain_psmi),
            #('max_domain_psmi', self.clfn_max_domain_psmi),
            #('sum_domain_psmi', self.clfn_sum_domain_psmi),

            # min, mean, median, max, sum:  domain MI - pSMI
            #('min_domain_mi_psmi', self.clfn_min_domain_mi_psmi), 
            #('mean_domain_mi_psmi', self.clfn_mean_domain_mi_psmi),
            ('med_domain_mi_psmi', self.clfn_med_domain_mi_psmi),
            #('max_domain_mi_psmi', self.clfn_max_domain_mi_psmi),
            #('sum_domain_mi_psmi', self.clfn_sum_domain_mi_psmi),

            # min, mean, median, max, sum: Cluster maximum - domain pSMI in that cluster
            #('min_clmax_less_psmi', self.clfn_min_clmax_less_psmi), 
            #('mean_clmax_less_psmi', self.clfn_mean_clmax_less_psmi),
            ('med_clmax_less_psmi', self.clfn_med_clmax_less_psmi),
            #('max_clmax_less_psmi', self.clfn_max_clmax_less_psmi),
            #('sum_clmax_less_psmi', self.clfn_sum_clmax_less_psmi),

            # min, mean, median, max, sum: Cluster maximum - domain MI
            #('min_clmax_less_mi', self.clfn_min_clmax_less_mi),
            #('mean_clmax_less_mi', self.clfn_mean_clmax_less_mi),
            ('med_clmax_less_mi', self.clfn_med_clmax_less_mi),
            #('max_clmax_less_mi', self.clfn_max_clmax_less_mi),
            #('sum_clmax_less_mi', self.clfn_sum_clmax_less_mi),
            
            # max domain MI - pSMI of that domain?
            ]               

        # Use additional statistics if a cluster run was provided
        if False and cluster_run_id is not None and clustering_type is not None:

            self.cluster_desc = describe.describe(cluster_run_id=cluster_run_id,
                                                  clustering_type=clustering_type,
                                                  cacheq=True)

            self.factors.extend([
                # Sequence similarity of the cluster
                ('min_bit_score', lambda cluster: self.clfn_cl_stat(cluster, statkey='min', stype='bit_score')),
                ('max_bit_score', lambda cluster: self.clfn_cl_stat(cluster, statkey='max', stype='bit_score')),
                ('mean_bit_score',lambda cluster: self.clfn_cl_stat(cluster, statkey='mean', stype='bit_score')),
                ('stdev_bit_score', lambda cluster: self.clfn_cl_stat(cluster, statkey='mean', stype='bit_score')),
                ('density_bit_score', lambda cluster: self.clfn_cl_stat(cluster, statkey='density', stype='bit_score')),
                ('frac_edges_bit_score', lambda cluster: self.clfn_cl_stat(cluster, statkey='frac_edges', stype='bit_score')),

                # Score of the clustering (usually NC, but could be anything)
                ('min_score', lambda cluster: self.clfn_cl_stat(cluster, statkey='min')),
                ('max_score', lambda cluster: self.clfn_cl_stat(cluster, statkey='max')),
                ('mean_score',lambda cluster: self.clfn_cl_stat(cluster, statkey='mean')),
                ('stdev_score', lambda cluster: self.clfn_cl_stat(cluster, statkey='mean')),
                ('density', lambda cluster: self.clfn_cl_stat(cluster, statkey='density')),
                ('frac_edges', lambda cluster: self.clfn_cl_stat(cluster, statkey='frac_edges')),
                ])


        self.fact_index = dict( (f[0], i) for i,f in enumerate(self.factors))

        #mi_pickle = "$HOME/tmp/pickles/pca_analysis_109.pickle"
        #mi_pickle = "$HOME/tmp/pickles/pca_analysis_492_109.pickle"
        #mi_pickle = "$HOME/tmp/pickles/pca_analysis_493_112.pickle"
        #mi_pickle = "$HOME/tmp/pickles/pca_analysis_494_115.pickle"
        #mi_pickle = "$HOME/tmp/pickles/pca_analysis_114.pickle"
        mi_pickle = "$HOME/tmp/pickles/pca_analysis_117.pickle"
        #mi_pickle = "$HOME/tmp/pickles/pca_analysis_111.pickle"
        #mi_pickle = "$HOME/tmp/pickles/pca_analysis_116.pickle"
        #self.fetch_mi_matricies()
        #self.fetch_mi_matricies_pickle(mi_pickle)
        self.fetch_mi_matricies_unpickle(mi_pickle)
        
        #factor_pickle = "$HOME/tmp/pickles/pca_calculate_factor_matrix_109.pickle"
        #factor_pickle = "$HOME/tmp/pickles/pca_calculate_factor_matrix_492_109.pickle"
        #factor_pickle = "$HOME/tmp/pickles/pca_calculate_factor_matrix_493_112.pickle"
        #factor_pickle = "$HOME/tmp/pickles/pca_calculate_factor_matrix_494_115.pickle"
        #factor_pickle = "$HOME/tmp/pickles/pca_calculate_factor_matrix_114.pickle"
        factor_pickle = "$HOME/tmp/pickles/pca_calculate_factor_matrix_117.pickle"
        #factor_pickle = "$HOME/tmp/pickles/pca_calculate_factor_matrix_111.pickle"
        #factor_pickle = "$HOME/tmp/pickles/pca_calculate_factor_matrix_116.pickle"
        #self.calculate_factor_matrix()
        #self.calculate_factor_matrix_pickle(factor_pickle)
        self.calculate_factor_matrix_unpickle(factor_pickle)
        
        #self.calculate_pca()
        
    def clfn_cluster_size(self, cluster):
        return len(self.cluster_to_seq[cluster])

    def clfn_num_domains(self, cluster):
        return len(self.annotations_in_cluster(cluster))

    def clfn_mean_num_domains(self, cluster):
        return ( float(len(self.annotations_in_cluster(cluster))) /
                 len(self.cluster_to_seq[cluster]))

    def clfn_mean_sequence_length(self, cluster):
        # Cache successive calls for the same cluster
        if ('_cache_mean_sequence_length' in self.__dict__
            and self._cache_mean_sequence_length[0] == cluster):
            return self._cache_mean_sequence_length[1]

        seqs = tuple(self.cluster_to_seq[cluster])

        q = """SELECT AVG(length)
        FROM prot_seq_str
        JOIN prot_seq USING (seq_str_id)
        WHERE seq_id in %(seqs)s"""

        avg_length = float(self.bq.dbw.fetchsingle(q, locals()))
        self._cache_mean_sequence_length = (cluster, avg_length)
        return avg_length


    def clfn_cl_stat(self, cluster, statkey, stype=None):
        # Cache successive calls for the same cluster
        if ('_cache_cl_stat' in self.__dict__
            and self._cache_cl_stat[0] == (cluster, stype)):
            cl_stat = self._cache_cl_stat[1]
        else:
            cl_stat = self.cluster_desc.cluster_stats( cluster, stype=stype)
            self._cache_cl_stat = ((cluster, stype), cl_stat)

        # This function returns negative values when missing.  Replace
        # these with numpy.nan.
        return cl_stat[statkey] if cl_stat[statkey] >= 0 else numpy.nan
        

    def cluster_domain_indicies(self, cluster):
        # Cache successive calls for the same cluster
        if ('_cache_cluster_domain_indicies' in self.__dict__
            and self._cache_cluster_domain_indicies[0] == cluster):
            return self._cache_cluster_domain_indicies[1]

        
        domains = self.annotations_in_cluster(cluster)
        domains_j = numpy.array(
            [self.ann_index[domain] for domain in domains],
            dtype = numpy.int)

        #print cluster, domains, domains_j

        self._cache_cluster_domain_indicies = (cluster, domains_j)
            
        return domains_j
        
    def clfn_min_domains_mi(self, cluster):
        a = self.domain_mi_array[ self.cluster_domain_indicies(cluster)]
        if len(a) == 0: return numpy.nan
        return numpy.min(a)

    def clfn_mean_domains_mi(self, cluster):
        a = self.domain_mi_array[ self.cluster_domain_indicies(cluster)]
        if len(a) == 0: return numpy.nan
        return numpy.mean(a)

    def clfn_med_domains_mi(self, cluster):
        a = self.domain_mi_array[ self.cluster_domain_indicies(cluster)]
        if len(a) == 0: return numpy.nan
        return numpy.median(a)

    def clfn_max_domains_mi(self, cluster):
        a = self.domain_mi_array[ self.cluster_domain_indicies(cluster)]
        if len(a) == 0: return numpy.nan
        return numpy.max(a)

    def clfn_sum_domains_mi(self, cluster):
        a = self.domain_mi_array[ self.cluster_domain_indicies(cluster)]
        if len(a) == 0: return numpy.nan
        return numpy.sum(a)

    def clfn_min_domain_psmi(self, cluster):
        i = self.cl_index[cluster]
        a = self.pos_mi_matrix[i][self.cluster_domain_indicies(cluster)]
        if len(a) == 0: return numpy.nan
        return numpy.min(a)

    def clfn_mean_domain_psmi(self, cluster):
        i = self.cl_index[cluster]
        a = self.pos_mi_matrix[i][self.cluster_domain_indicies(cluster)]
        if len(a) == 0: return numpy.nan
        return numpy.mean(a)

    def clfn_med_domain_psmi(self, cluster):
        i = self.cl_index[cluster]
        a = self.pos_mi_matrix[i][self.cluster_domain_indicies(cluster)]
        if len(a) == 0: return numpy.nan
        return numpy.median(a)

    def clfn_max_domain_psmi(self, cluster):
        i = self.cl_index[cluster]
        a = self.pos_mi_matrix[i][self.cluster_domain_indicies(cluster)]
        if len(a) == 0: return numpy.nan
        return numpy.max(a)

    def clfn_sum_domain_psmi(self, cluster):
        i = self.cl_index[cluster]
        a = self.pos_mi_matrix[i][self.cluster_domain_indicies(cluster)]
        if len(a) == 0: return numpy.nan
        return numpy.sum(a)

    def clfn_min_domain_mi_psmi(self, cluster):
        i = self.cl_index[cluster]
        domains_j = self.cluster_domain_indicies(cluster)
        if len(domains_j) == 0: return numpy.nan
        return numpy.min( self.domain_mi_array[domains_j] -
                          self.pos_mi_matrix[i][domains_j] )
                                               
    def clfn_mean_domain_mi_psmi(self, cluster):
        i = self.cl_index[cluster]
        domains_j = self.cluster_domain_indicies(cluster)
        if len(domains_j) == 0: return numpy.nan
        return numpy.mean( self.domain_mi_array[domains_j] -
                           self.pos_mi_matrix[i][domains_j] )

    def clfn_med_domain_mi_psmi(self, cluster):
        i = self.cl_index[cluster]
        domains_j = self.cluster_domain_indicies(cluster)
        if len(domains_j) == 0: return numpy.nan
        return numpy.median( self.domain_mi_array[domains_j] -
                             self.pos_mi_matrix[i][domains_j] )

    def clfn_max_domain_mi_psmi(self, cluster):
        i = self.cl_index[cluster]
        domains_j = self.cluster_domain_indicies(cluster)
        if len(domains_j) == 0: return numpy.nan
        return numpy.max( self.domain_mi_array[domains_j] -
                          self.pos_mi_matrix[i][domains_j] )

    def clfn_sum_domain_mi_psmi(self, cluster):
        i = self.cl_index[cluster]
        domains_j = self.cluster_domain_indicies(cluster)
        if len(domains_j) == 0: return numpy.nan
        return numpy.sum( self.domain_mi_array[domains_j] -
                          self.pos_mi_matrix[i][domains_j] )

    def clfn_max_cluster_mi(self, cluster):
        # Cache successive calls for the same cluster
        if ('_cache_max_cluster_mi' in self.__dict__
            and self._cache_max_cluster_mi[0] == cluster):
            return self._cache_max_cluster_mi[1]

        max_mi = self.max_cluster_mi( cluster)
        self._cache_max_cluster_mi = (cluster, max_mi)
        return max_mi

    def clfn_min_clmax_less_psmi(self, cluster):
        i = self.cl_index[cluster]
        domains_j = self.cluster_domain_indicies(cluster)
        if len(domains_j) == 0: return numpy.nan
        return numpy.min( self.clfn_max_cluster_mi(cluster) -
                          self.pos_mi_matrix[i][domains_j])

    def clfn_mean_clmax_less_psmi(self, cluster):
        i = self.cl_index[cluster]
        domains_j = self.cluster_domain_indicies(cluster)
        if len(domains_j) == 0: return numpy.nan
        return numpy.mean( self.clfn_max_cluster_mi(cluster) -
                          self.pos_mi_matrix[i][domains_j])

    def clfn_med_clmax_less_psmi(self, cluster):
        i = self.cl_index[cluster]
        domains_j = self.cluster_domain_indicies(cluster)
        if len(domains_j) == 0: return numpy.nan
        return numpy.median( self.clfn_max_cluster_mi(cluster) -
                             self.pos_mi_matrix[i][domains_j])

    def clfn_max_clmax_less_psmi(self, cluster):
        i = self.cl_index[cluster]
        domains_j = self.cluster_domain_indicies(cluster)
        if len(domains_j) == 0: return numpy.nan
        return numpy.max( self.clfn_max_cluster_mi(cluster) -
                          self.pos_mi_matrix[i][domains_j])

    def clfn_sum_clmax_less_psmi(self, cluster):
        i = self.cl_index[cluster]
        domains_j = self.cluster_domain_indicies(cluster)
        if len(domains_j) == 0: return numpy.nan
        return numpy.sum( self.clfn_max_cluster_mi(cluster) -
                          self.pos_mi_matrix[i][domains_j])

    def clfn_min_clmax_less_mi(self, cluster):
        domains_j = self.cluster_domain_indicies(cluster)
        if len(domains_j) == 0: return numpy.nan
        return numpy.min( self.clfn_max_cluster_mi(cluster) -
                          self.domain_mi_array[domains_j])

    def clfn_mean_clmax_less_mi(self, cluster):
        domains_j = self.cluster_domain_indicies(cluster)
        if len(domains_j) == 0: return numpy.nan
        return numpy.mean( self.clfn_max_cluster_mi(cluster) -
                          self.domain_mi_array[domains_j])

    def clfn_med_clmax_less_mi(self, cluster):
        domains_j = self.cluster_domain_indicies(cluster)
        if len(domains_j) == 0: return numpy.nan
        return numpy.median( self.clfn_max_cluster_mi(cluster) -
                             self.domain_mi_array[domains_j])

    def clfn_max_clmax_less_mi(self, cluster):
        domains_j = self.cluster_domain_indicies(cluster)
        if len(domains_j) == 0: return numpy.nan
        return numpy.max( self.clfn_max_cluster_mi(cluster) -
                          self.domain_mi_array[domains_j])

    def clfn_sum_clmax_less_mi(self, cluster):
        domains_j = self.cluster_domain_indicies(cluster)
        if len(domains_j) == 0: return numpy.nan
        return numpy.sum( self.clfn_max_cluster_mi(cluster) -
                          self.domain_mi_array[domains_j])


    def fetch_mi_matricies(self):
        (self.domain_mi_array,
         self.pos_mi_matrix,
         self.neg_mi_matrix,
         self.cl_index,
         self.ann_index ) = self.mutual_information_specific_array()

        self.cl_index_inv = dict( (v,k) for (k,v) in self.cl_index.items())

        return

    def fetch_mi_matricies_unpickle(self, fname='$HOME/tmp/pickles/pca_analysis.pickle'):

        (self.domain_mi_array,
         self.pos_mi_matrix,
         self.neg_mi_matrix,
         self.cl_index,
         self.ann_index) = cPickle.load( open( os.path.expandvars(fname)))

        self.cl_index_inv = dict( (v,k) for (k,v) in self.cl_index.items())

        return

    def fetch_mi_matricies_pickle(self, fname='$HOME/tmp/pickles/pca_analysis.pickle'):

        fd = open( os.path.expandvars(fname), 'wb')

        cPickle.dump( (self.domain_mi_array,
                       self.pos_mi_matrix,
                       self.neg_mi_matrix,
                       self.cl_index,
                       self.ann_index),
                      fd, protocol=2)
        fd.close()
        
        return


    def calculate_factor_matrix(self):
        self.cluster_fact = numpy.empty( (len(self.cl_index), len(self.fact_index)),
                                         dtype=numpy.float64)

        for cluster, i in self.cl_index.items():
            for fact, fact_fn in self.factors:
                j = self.fact_index[fact]

                self.cluster_fact[i][j] = fact_fn( cluster)
        return

    def write_factor_matrix_orange(self, filename):
        """Output the factor matrix such that it may be read with
        Orage.data.Table"""

        fd = open(filename, 'w')

        # header
        s_name = ""
        s_type = ""
        s_class = ""
        for factor, i in sorted( self.fact_index.items(), key=lambda a: a[1]):

            if i > 0:
                for s in (s_name, s_type, s_class):
                    s += "\t"

            s_name += "%s" % factor
            s_type += "c"   # continuous variable 'c', or discrete 'd'
            s_class += ""   # one will be labelled "class"

        s_name += "\tcluster_id\n"
        s_type += "\td\n"
        s_class += "\tclass\n"

        for s in (s_name, s_type, s_class):
            fd.write(s)

        assert False, "unimplemented"


    def calculate_factor_matrix_pickle(self, fname='$HOME/tmp/pickles/pca_calculate_factor_matrix.pickle'):
        fd = open(os.path.expandvars(fname), 'wb')

        cPickle.dump( self.cluster_fact, fd, protocol=2)

        fd.close()
        return

    def calculate_factor_matrix_unpickle(self, fname='$HOME/tmp/pickles/pca_calculate_factor_matrix.pickle'):
        self.cluster_fact = cPickle.load( open(os.path.expandvars(fname)))

        return

    def calculate_pca(self, sample_fraction=None):
        
        # Including NaNs in the input to the numpy PCA sometimes
        # doesn't converge.

        #print "cluster_fact length:", len(self.cluster_fact)

        factor_matrix = self.cluster_fact

        # HACK 1: Remove clusters which have no domains.
        nonzero_clusters = factor_matrix[:, self.fact_index['num_domains']].nonzero()
        factor_matrix = factor_matrix[nonzero_clusters]
        
        #print "factor_matrix length:", len(factor_matrix)

        # HACK: Remove clusters with nan cluster_statistics
        for fact in ('density', 'density_bit_score', 'frac_edges_bit_score'):
            if not fact in self.fact_index: continue
                
            arenan = numpy.isnan(factor_matrix[:, self.fact_index[fact]])
            nonzero_clusters = (~arenan).nonzero()[0]
            factor_matrix = factor_matrix[nonzero_clusters]

        #print "factor_matrix length:", len(factor_matrix)

        # HACK: Set Nan values to 0
        #factors = self.cluster_fact[:]  # copy
        #nanarr = numpy.isnan(factors)
        #factors[nanarr] = 0

        # Sample clusters; e.g., for robustness tests
        if sample_fraction is not None:
            assert 0 < sample_fraction <= 1, "sample fraction (%g) should be > 0, <= 1" % sample_fraction
            
            indicies = numpy.random.permutation( factor_matrix.shape[0])
            indicies = indicies[:int(len(indicies) * sample_fraction)]

            # reordering the matrix shouldn't change the PCA result,
            # but I imagine it's nicer to slice in contiguous order.
            indicies.sort()
            factor_matrix = factor_matrix[indicies]

        self.pca = matplotlib.mlab.PCA(factor_matrix)

        # Don't use self.pca.Y, as the PCA may have been done with
        # truncated data, as to avoid SVD non-convergence, or
        # reordered, above.  Re-project the original data.
        self.projection = self.pca.project(self.cluster_fact)


    def pca_component_stability(self, nsamples, sample_fraction, ncomponents=5):

        Wt_array = numpy.zeros( (nsamples, ncomponents, len(self.fact_index)))
        frac_array = numpy.zeros( (nsamples, len(self.fact_index)))

        for i in range(nsamples):
            self.calculate_pca( sample_fraction)

            Wt_array[i] = self.pca.Wt[:ncomponents, :]     # weight of factors in component
            frac_array[i] = self.pca.fracs    # fraction of variance explained by component

        # restore the class to the full PCA calculation
        self.calculate_pca()

        return (Wt_array, frac_array)


    def plot_pca_component_scree(self, ncomponents=None,
                                 title_prefix=None, filename=None,
                                 dpi=None, nsamples=None, sample_fraction=None,
                                 subplots_adjust=None,
                                 draw_title=True):
        """A 'scree' plot of the variances explained by each component"""

        pyplot.figure()

        if draw_title:
            pyplot.title("%sPCA Component 'scree' plot: %s" % (
                "" if title_prefix is None else title_prefix+"\n",
                "%d samples, %g proportion" % (nsamples, sample_fraction) if nsamples is not None else ""))

        if ncomponents is None:
            # show components with variance >= 0.01
            ncomponents = (self.pca.fracs < 0.01).nonzero()[0][0]

        # sample, keeping components that explained more than 0.01
        # variance in the full set
        if nsamples is not None and sample_fraction is not None:
            Wt_array, frac_array = self.pca_component_stability( nsamples, sample_fraction,
                                                                 ncomponents = ncomponents)
            fracs_mean = numpy.mean( frac_array, axis=0)
            fracs_std = numpy.std( frac_array, axis=0)
        else:
            fracs_mean = self.pca.fracs[:ncomponents + 1]
            fracs_std = None

        pyplot.errorbar( range(len(fracs_mean)), fracs_mean,
                         marker = '',
                         yerr = fracs_std,
                         ecolor='black',
                         label='Component',
                         zorder=12,
                         markeredgewidth=0)

        pyplot.plot(numpy.cumsum(fracs_mean),
                    label="Cumulative contribution",
                    zorder=11)

        pyplot.ylim(-0.01,1.01)

        pyplot.xlabel("Principal component")
        pyplot.ylabel("Fraction of variance")
        
        leg = pyplot.legend(fancybox=True)
        leg.get_frame().set_alpha(0.6)
        leg.get_frame().set_facecolor('lightgrey')
        leg.set_zorder(14)

        if subplots_adjust is not None:
            pyplot.subplots_adjust(**subplots_adjust)
            
        pyplot.grid(color='grey',zorder=10)

        if filename is not None:
            pyplot.savefig( 'figures/%s' % filename, dpi=dpi)
            pyplot.close()
        else:
            pyplot.show()
                     
        return

        
    def plot_pca_factor_stability(self, nsamples, sample_fraction,
                                  comp_x = 0, comp_y = 1,
                                  title_prefix = None,
                                  filename = None,
                                  dpi = None,
                                  xlim = None, ylim = None,
                                  subplots_adjust=None,
                                  draw_title=True):
        pyplot.figure()

        if draw_title:
            pyplot.title("%sPCA factor stability: %d samples, %g proportion" % (
                "" if title_prefix is None else title_prefix+"\n",
                nsamples, sample_fraction))
        
        Wt_array, frac_array = self.pca_component_stability( nsamples, sample_fraction,
                                                             ncomponents=max(comp_x,comp_y)+1)

        # avg position over all samples
        Wt_avg = numpy.mean( Wt_array, axis=0)
        Wt_std = numpy.std( Wt_array, axis=0)
        frac_avg = numpy.mean( frac_array, axis=0)
        frac_std = numpy.std( frac_array, axis=0)
        
        # plot average position and stdev error bars
        pyplot.scatter( Wt_avg[comp_x],
                        Wt_avg[comp_y],
                        marker = 'o',
                        zorder=11
                        )
        pyplot.errorbar( Wt_avg[comp_x], Wt_avg[comp_y],
                         xerr=Wt_std[comp_x], yerr=Wt_std[comp_y],
                         ecolor='grey',
                         fmt=None,
                         zorder=12
                         )
        
        #pyplot.errorbar( Wt_avg[comp_x], Wt_avg[comp_y], , ecolor='grey', lw=0)

        for factor, i in self.fact_index.items():
            pyplot.text( Wt_avg[comp_x][i], Wt_avg[comp_y][i],
                         #factor,
                         reduce(lambda a,b: a+" "+b, factor.split('_')),
                         horizontalalignment='left',verticalalignment='top',
                         zorder=13,
                         fontsize=8)
            
        #pyplot.xlabel("Component %d. mean proportion: %6.4f (stdev: %6.4f)"  % (
        #    comp_x, frac_avg[comp_x], frac_std[comp_x]))
        #pyplot.ylabel("Component %d. mean proportion: %6.4f (stdev: %6.4f)"  % (
        #    comp_y, frac_avg[comp_y], frac_std[comp_y]))
        pyplot.xlabel("Component %d" % comp_x)
        pyplot.ylabel("Component %d" % comp_y)

        pyplot.grid(color='grey', alpha=0.5, zorder=10)

        pyplot.axis('tight')

        if subplots_adjust is not None:
            pyplot.subplots_adjust(**subplots_adjust)

        if xlim is not None:
            pyplot.xlim( xlim)

        if ylim is not None:
            pyplot.ylim( ylim)

        if filename is not None:
            pyplot.savefig( 'figures/%s' % filename, dpi=dpi)
            pyplot.close()
        else:
            pyplot.show()

        return (Wt_array, frac_array)
        
                                  
    def print_pca(self):
        m = "PCA data\n-----------------------------------\n"

        m += "# clusters: %d\n" % len(self.cluster_to_seq)
        m += "# clusters used for PCA: %d\n" % self.pca.numrows
        m += "# factors: %d\n" % self.pca.numcols

        m += "\nPrinciple Component Factor Fractions\n"
        m += "Factor                  0       1       2       3       4       5\n"
        m += "----------------------------------------------------------------------\n"
        for factor, i in sorted(self.fact_index.items(), key=lambda a: a[1]):
            m += "%20s % 6.4f % 6.4f % 6.4f % 6.4f % 6.4f % 6.4f\n" % (
                factor,
                self.pca.Wt[0][i],
                self.pca.Wt[1][i],
                self.pca.Wt[2][i],
                self.pca.Wt[3][i],
                self.pca.Wt[4][i],
                self.pca.Wt[5][i])

        m += "\n"
        m += "%20s % 6.4f % 6.4f % 6.4f % 6.4f % 6.4f % 6.4f\n" % (
            "Variance:",
            self.pca.fracs[0],
            self.pca.fracs[1],
            self.pca.fracs[2],
            self.pca.fracs[3],
            self.pca.fracs[4],
            self.pca.fracs[5])

        m += "\n" 
        m += "# Principle components: %d\n" % len(self.pca.fracs)

        return m

    def plot_pca_factors(self, num=3):
        pyplot.figure()
        for comp in range(num):
            pyplot.plot(self.pca.Wt[comp],
                        label="%d (%6.4f)" % (comp, self.pca.fracs[comp]))
            
        xlabels = sorted(self.fact_index, key=lambda a: a[1])
        pyplot.xticks( range(len(xlabels)), xlabels, rotation=-90)
        
        pyplot.legend()
        pyplot.subplots_adjust(bottom=0.3)
        pyplot.grid()
        return

    def plot_pca_data_scatter(self, comp_x=0, comp_y=1,
                              title_prefix = None,
                              filename = None,
                              dpi = None,
                              draw_title=True,
                              subplots_adjust=None,
                              cluster_categories = None,
                              draw_legend = True,
                              xlim = None,
                              ylim = None,
                              legend_loc = None,
                              color_by_factor = None,
                              draw_colorbar = True):
        fig = pyplot.figure()

        if draw_title:
            pyplot.title("%sPCA component scatter" % (
                "" if title_prefix is None else title_prefix+"\n"))

        # Don't use self.pca.Y, as the PCA may have been done with
        # truncated data, as to avoid SVD non-convergence.
        #projection = self.pca.project(self.cluster_fact)

        X = self.projection[:,comp_x]
        Y = self.projection[:,comp_y]

        if color_by_factor is not None:
            values = self.cluster_fact[:, self.fact_index[ color_by_factor]]
            norm = matplotlib.colors.LogNorm()
            normvalues = norm( values)
            colors = matplotlib.cm.hot( normvalues)

            if color_by_factor == 'num_domains':
                vmin = 1
                vmax = 10 ** int(numpy.log10( numpy.max(values)) + 1)
                norm = matplotlib.colors.LogNorm(vmin=vmin, vmax=vmax)
                cb_format = "%d"
                cb_ticks = None

            elif color_by_factor in ('cluster_max'):
                vmin = 1
                vmax = 10 ** int(numpy.log10( numpy.max(values)) + 1)
                norm = matplotlib.colors.LogNorm(vmin=vmin, vmax=vmax)
                cb_format = "%3.1f"
                cb_ticks = None

            elif color_by_factor in ('cluster_size'):
                vmin = 1
                vmax = 10 ** int(numpy.log10( numpy.max(values)) + 1)
                norm = matplotlib.colors.LogNorm(vmin=vmin, vmax=vmax)
                cb_format = "%d"
                cb_ticks = None

            elif color_by_factor == 'mean_num_domains':
                vmin = 0.1
                vmax = 10 ** int(numpy.log10( numpy.max(values)) + 1)
                #print vmin, vmax, numpy.max(values)
                norm = matplotlib.colors.LogNorm(vmin=vmin, vmax=vmax)
                cb_format = "%3.1f"
                cb_ticks = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.5, 2, 3, 4, 5, vmax]

            elif color_by_factor in ('med_clmax_less_psmi', 'med_domain_psmi'):
                norm = matplotlib.colors.LogNorm()
                cb_format = None
                cb_ticks = None

            else:
                norm = matplotlib.colors.Normalize()
                cb_format = None
                cb_ticks = None

            pyplot.scatter(X, Y, zorder=11, 
                           c=values,
                           cmap=pyplot.cm.hot,
                           norm = norm,
                           alpha=0.7,
                           edgecolor='none',
                           #facecolor=
                           )
        else:
            pyplot.scatter(X, Y, zorder=11)

        # Overlay cluster annotations
        if cluster_categories is not None:
            for (category, color, marker, clusters) in cluster_categories:
                
                cluster_inds = [self.cl_index[cluster] for cluster in clusters]

                X = self.projection[ cluster_inds, comp_x ]
                Y = self.projection[ cluster_inds, comp_y ]
                
                #color = matplotlib.cm.jet( float(i) / len(cluster_categories))
                

                pyplot.scatter(X, Y, label = category, 
                               marker = marker,
                               # highlight the first cluster in a category with a larger box
                               #s = [160] + [80]*(len(clusters)-1),
                               s = 100,
                               alpha=0.8,
                               #linestyle = ['solid'] + ['dotted']*(len(clusters)-1),
                               #linewidth = [3] + [1]*(len(clusters)-1),
                               linewidth = 1,
                               #facecolor = 'none',
                               facecolor = [color] + ['none']*(len(clusters)-1),
                               edgecolor = color,
                               zorder = 12)

                if draw_legend:
                    leg = pyplot.legend(fancybox=True, loc=legend_loc, ncol=2) # upper left
                    leg.get_frame().set_alpha(0.6)
                    leg.get_frame().set_facecolor('lightgrey')
                    leg.set_zorder(13)

        #pyplot.xlabel("Component %d (%6.4f)" % (comp_x, self.pca.fracs[comp_x]))
        #pyplot.ylabel("Component %d (%6.4f)" % (comp_y, self.pca.fracs[comp_y]))
        pyplot.xlabel("Component %d" % (comp_x))
        pyplot.ylabel("Component %d" % (comp_y))

        pyplot.axis('tight')
        fig.get_axes()[0].set_axis_bgcolor('grey')
        pyplot.grid(color='white', zorder=10)

        if xlim is not None:
            pyplot.xlim( xlim)

        if ylim is not None:
            pyplot.ylim( ylim)

        if subplots_adjust is not None:
            pyplot.subplots_adjust(**subplots_adjust)

        if draw_colorbar and color_by_factor is not None:
           pyplot.colorbar(fraction=0.05, pad=0.01, format=cb_format, ticks=cb_ticks)

        if filename is not None:
            pyplot.savefig( 'figures/%s' % filename, dpi=dpi)
            pyplot.close()
        else:
            pyplot.show()

        return

    def plot_factor_scatter(self, fact_x=None, fact_y=None,
                            title_prefix = None,
                            filename = None,
                            dpi = None):
        fig = pyplot.figure()
        pyplot.title("%sFactor scatterplot" % (
            "" if title_prefix is None else title_prefix+"\n"))

        X = self.cluster_fact[:, self.fact_index[fact_x]]
        Y = self.cluster_fact[:, self.fact_index[fact_y]]

        pyplot.scatter(X, Y)

        pyplot.xlabel("Factor: %s" % fact_x)
        pyplot.ylabel("Factor: %s" % fact_y)
        
        pyplot.axis('tight')
        fig.get_axes()[0].set_axis_bgcolor('lightgrey')
        pyplot.grid(color='grey')

        pyplot.subplots_adjust(left=0.08, bottom=0.07, right=0.97, top=0.9)

        if filename is not None:
            pyplot.savefig( 'figures/%s' % filename, dpi=dpi)
            pyplot.close()
        else:
            pyplot.show()

        return

    def plot_pca_data_heatmap(self, comp_x=0, comp_y=1,
                              title_prefix = None,
                              filename=None,
                              dpi = None,
                              hres=1000, vres=1000,
                              draw_title=True,
                              subplots_adjust=None
                              ):

        fig = pyplot.figure()

        if draw_title:
            pyplot.title("%sPCA component heatmap" % (
                "" if title_prefix is None else title_prefix+"\n"))

        # find the range of values in each axis
        X = self.projection[:,comp_x]
        Y = self.projection[:,comp_y]

        x_min = numpy.nanmin(X)
        x_max = numpy.nanmax(X)
        y_min = numpy.nanmin(Y)
        y_max = numpy.nanmax(Y)

        x_scale = float(hres - 1) / (x_max - x_min)
        y_scale = float(vres - 1) / (y_max - y_min)

        xy = numpy.zeros([vres, hres], dtype=numpy.int32)

        #FIXME.  This ought to be able to be done far more efficiently

        # Also, it considers only clusters with a non-nan PCA coordinates
        
        skipped = 0
        for i,(x,y) in enumerate(self.projection[:,[comp_x,comp_y]]):

            if numpy.isnan(x) or numpy.isnan(y):
                skipped += 1
                continue

            #print i,x,y,x_min,x_max,y_min,y_max,x_scale,y_scale
            
            # this always rounds down
            x_trans = int(x_scale * (x - x_min))
            y_trans = int(y_scale * (y - y_min))

            #print x_trans, y_trans
            # matplotlib.imshow uses opposite indicing from what is natural here
            xy[y_trans, x_trans] += 1

            #if x < 5 and y < -40:
            #    print i, x, y
            #    print self.projection[i,comp_x], self.projection[i, comp_y]
            #    print x_trans, y_trans
            #    print x_scale, y_scale
            #    print x_min, y_min
            #    print xy[x_trans, y_trans]

        #xy[ int(x_scale * (6 - x_min)), int(y_scale * (-36 - y_min))] = 100
        #print int(x_scale * (6 - x_min)), int(y_scale * (-36 - y_min))
        #xy[0,0] = 100
        #xy[10,0] = 100
        #xy[20,20] = 100
        #xy[5.5,5.5] = 100
        pyplot.imshow(xy, cmap=pyplot.cm.hot,
                      origin='lower',
                      norm = matplotlib.colors.LogNorm(),
                      interpolation='nearest',
                      zorder=11
                      )
        
        # fix axis limits perfectly
        pyplot.axis('image')

        # 10 ticks
        x_ticks = numpy.int32(numpy.linspace( x_min, x_max, 10))
        y_ticks = numpy.int32(numpy.linspace( y_min, y_max, 10))
        
        #x_ticks = numpy.arange( x_min, x_max, (x_max - x_min) / 10, dtype=numpy.int32)
        #y_ticks = numpy.arange( y_min, y_max, (y_max - y_min) / 10, dtype=numpy.int32)

        #print "x_min =", x_min
        #print "x_max =", x_max
        #print "x_scale =", x_scale
        #print "skipped:", skipped
        #print "x_ticks =", x_ticks

        # this is probably slighlty inaccruate
        pyplot.xticks( numpy.int32(x_scale * (x_ticks - x_min)), x_ticks)
        pyplot.yticks( numpy.int32(y_scale * (y_ticks - y_min)), y_ticks)

        fig.get_axes()[0].set_axis_bgcolor('lightgrey')
        pyplot.grid(color='grey', zorder=10)

        #pyplot.xlabel("Component %d (%6.4f)" % (comp_x, self.pca.fracs[comp_x]))
        #pyplot.ylabel("Component %d (%6.4f)" % (comp_y, self.pca.fracs[comp_y]))
        pyplot.xlabel("Component %d" % comp_x)
        pyplot.ylabel("Component %d" % comp_y)


        #pyplot.xlim(100,250)
        #pyplot.ylim(0,150)

        if subplots_adjust is not None:
            pyplot.subplots_adjust(**subplots_adjust)

        pyplot.colorbar(fraction=0.05, pad=0.01)

        if filename is not None:
            pyplot.savefig( 'figures/%s' % filename, dpi=dpi)
            pyplot.close()
        else:
            pyplot.show()

        return xy

    def plot_factor_heatmap(self, fact_x=None, fact_y=None,
                              title_prefix = None,
                              filename=None,
                              dpi = None,
                              hres=1000, vres=1000):

        fig = pyplot.figure()
        pyplot.title("%sFactor heatmap" % (
            "" if title_prefix is None else title_prefix+"\n"))

        facti_x = self.fact_index[fact_x]
        facti_y = self.fact_index[fact_y]

        # find the range of values in each axis
        X = self.cluster_fact[:, facti_x]
        Y = self.cluster_fact[:, facti_y]

        x_min = numpy.nanmin(X)
        x_max = numpy.nanmax(X)
        y_min = numpy.nanmin(Y)
        y_max = numpy.nanmax(Y)

        x_scale = float(hres - 1) / (x_max - x_min)
        y_scale = float(vres - 1) / (y_max - y_min)

        xy = numpy.zeros([vres, hres], dtype=numpy.int32)

        #FIXME.  This ought to be able to be done far more
        #efficiently.  Further, much can probably be handled with
        #imshow's extents parameter

        # Also, it considers only clusters with a non-nan PCA coordinates
        # print some information on the plot about how many were skipped

        skipped = 0
        for i,(x,y) in enumerate(self.cluster_fact[:,[facti_x, facti_y]]):

            # <0 for missing values of scores
            if numpy.isnan(x) or numpy.isnan(y) or x<0 or y<0:
                skipped += 1
                continue

            #print i,x,y,x_min,x_max,y_min,y_max,x_scale,y_scale
            
            # this always rounds down
            x_trans = int(x_scale * (x - x_min))
            y_trans = int(y_scale * (y - y_min))

            #print x, y, x_trans, y_trans

            #xy[x_trans, y_trans] += 1
            xy[y_trans, x_trans] += 1

        
        pyplot.imshow(xy, cmap=pyplot.cm.hot,
                      origin='lower',
                      norm = matplotlib.colors.LogNorm(),
                      interpolation='nearest'
                      )


        # fit axis limits 
        pyplot.axis('image')

        # 10 ticks
        x_ticks = numpy.linspace( x_min, x_max, 10)
        y_ticks = numpy.linspace( y_min, y_max, 10)
        
        #x_ticks = numpy.arange( x_min, x_max, (x_max - x_min) / 10, dtype=numpy.int32)
        #y_ticks = numpy.arange( y_min, y_max, (y_max - y_min) / 10, dtype=numpy.int32)

        #print "x_min =", x_min
        #print "x_max =", x_max
        #print "x_scale =", x_scale
        #print "skipped:", skipped
        #print "x_ticks =", x_ticks

        # this is probably slighlty inaccruate
        pyplot.xticks( numpy.int32(x_scale * (x_ticks - x_min)),
                       ["%4.3g" % f for f in x_ticks])
        pyplot.yticks( numpy.int32(y_scale * (y_ticks - y_min)),
                       ["%4.3g" % f for f in y_ticks])

        fig.get_axes()[0].set_axis_bgcolor('grey')
        pyplot.grid(color='white')

        pyplot.xlabel("Factor: %s" % fact_x)
        pyplot.ylabel("Factor: %s" % fact_y)

        #pyplot.xlim(100,250)
        #pyplot.ylim(0,150)

        pyplot.subplots_adjust( bottom=0.05, left=0.01, top=0.90, right=0.95)
        pyplot.colorbar(fraction=0.05, pad=0.01)

        if filename is not None:
            pyplot.savefig( 'figures/%s' % filename, dpi=dpi, transparent=False)
            pyplot.close()
        else:
            pyplot.show()

        return X, Y, xy


    def print_cluster_data(self, cluster_id):

        m = "Cluster: %s\n" % cluster_id

        m += "PCA Factors: %s\n" % self.projection[self.cl_index[cluster_id]]

        m += "# sequences: %d\n" % len( self.cluster_to_seq[cluster_id])

        annotations = self.annotations_in_cluster(cluster_id)
        m += "Domains: (%d total)\n" % len(annotations)
        
        m += "Domain    #seqs #seqsall #clust TotSize    Entropy         MI Name\n"
        m += "------------------------------------------------------------------\n"

        for domain in sorted(annotations):
            joint_cnt = self.joint_cluster_ann_count(domain)
            total_cluster_cnt = sum((len(self.cluster_to_seq[cluster])
                                     for cluster in joint_cnt))
            
            m += "%-10s %4d%9d%7d%8d %0.4e %0.4e %s\n" % (
                domain,
                joint_cnt[cluster_id],
                len(self.annotation_to_seq[domain]),
                len(joint_cnt),
                total_cluster_cnt,
                self.annotation_entropy(domain),
                self.mutual_information(domain),
                pq.lookup_domain_name(domain))
            
        return m
        

    def constrain_clusters( self, constraints):
        """Input.  A list of (component #, range_min, range_max)
        constraints.  Either of the ranges can be None."""

        bitmap = numpy.ones(len(self.cluster_to_seq), dtype=numpy.bool)

        for (component, range_min, range_max) in constraints:
            if range_min is not None:
                bitmap &= self.projection[:, component] > range_min
            if range_max is not None:
                bitmap &= self.projection[:, component] < range_max

        clusters = bitmap.nonzero()[0]
        clusters = [self.cl_index_inv[c] for c in clusters]
        
        return clusters


    def prune_factors(self, factors):
        """Prune all but the specified factors from factor_index and
        cluster_fact.  This is useful when re-using a pickled matrix
        of all.  Otherwise, don't spend the time calculating them in
        the first place."""

        # be careful with indexing, in case factors is unordered
        new_fact_index = dict( (k,i) for i,k in enumerate(factors))
        #print factors
        #print enumerate(factors)
        #print new_fact_index

        fact_i = [ self.fact_index[fact] for fact, newi in 
                   sorted(new_fact_index.items(), key=lambda a: a[1])]

        self.cluster_fact = self.cluster_fact[:, fact_i]
        self.fact_index = new_fact_index
        
        return

if __name__ == "__main__":

    pyplot.ioff() # turn off figure updates
    pyplot.rc('font', size=10, family = 'serif',
              serif = ['Computer Modern Roman']
              )

    # makes underscores a pain
    pyplot.rc('text', usetex=True)
    pyplot.rc('legend', fontsize = 10)
    
    pq = pfamq.pfamq()

    date = time.strftime('%Y%m%d')

    cr_id = None
    clustering_type = 'hierarchical'

    if False:  # Just the human and mouse families
        set_id = 109 # ppod human, mouse
        family_set_name = 'ppod_20_cleanqfo2'
        #family_set_name = 'hgnc_apr10_selected'

        title_prefix= "family_set_name: %s, set_id: %d" % (family_set_name,set_id)
        fname_prefix= date+"family_set_name_%s_set_id_%d" % (family_set_name,set_id)
        print title_prefix
        
        fq = familyq.familyq(family_set_name=family_set_name)
        clustering = fq.fetch_families()[0]
        
        annotation, seq_map = pq.fetch_domain_map(
            set_id = set_id,
            seq_ids = tuple(reduce(
            lambda a,b:a+b, [list(clustering[fam]) for fam in clustering])))

    # Full mouse and human set
    if False:
        set_id = 109   # ppod human, mouse
        cr_id = 475    # clustering using the full 600k set.  Correct log calculation
        nc_score = 0.425 # 0.2 is near the optimal on the family set, by F

        title_prefix = "set_id: %(set_id)d, cr_id: %(cr_id)d, nc_score: %(nc_score)g" % locals()
        fname_prefix = date+"_set_id_%(set_id)d_cr_id_%(cr_id)d_nc_score_%(nc_score)g" % locals()
        print title_prefix
        
        clusclass = cluster_sql.hcluster( cluster_run_id = cr_id,
                                          cacheq=True)
        clustering = clusclass.cut_tree( distance = 1-nc_score,
                                         set_id_filter = set_id)
        annotation, seq_map = pq.fetch_domain_map(set_id = set_id)

        # NC >= 0.425 human and mouse.  Cluster run 475
        family_categories = [
            ('ACSL (1)', 'red', 'o',
                     (22313659,
                      )),
            ('FGF (1)', 'red', 'v', (
                    22296838,
                    )),
            ('FOX (1)', 'red', '*', (
                    22288496,
                    )),
            ('Tbox (1)', 'red', 's', (
                    22305088,
                    )),
            ('TNF (11)', 'red', 'D', (
                    22314154, 22315129, 22270027, 22287177, 22261463, 22298474,
                    22170530, 22172871, 22244730, 22215648, 22291349,
                    )),
            ('WNT (1)', 'red', 'h', 
                    (22286892,
                     )),

            ('DVL (1)', 'cyan', 'o', (
                    22287331,
                    )),
            ('GATA (1)', 'cyan', 'v', (
                22311997,
                )),
            ('KIR (1)', 'cyan', '*', (
                22305526,
                )),
            ('Notch (1)', 'cyan', 's', (
                22314034,
                )),
            ('TRAF (1)', 'cyan', 'D', (
                22312291,
                )),
            ('USP (4)', 'cyan', 'h', (
                22313794, 22267491, 22301080, 22311108,
                )),
            
            ('ADAM (2)', 'orange', 'o', (
                22299624, 22305655,
                )),
            ('Kinase (31)', 'orange', 'v', (
                22315320, 22315736, 22303229, 22300635, 22303550, 22300112,
                22313490, 22302611, 22310422, 22305757, 22277080, 22260493,
                22299807, 22309362, 22307724, 22315149, 22292735, 22293439,
                22299098, 22301173, 22270377, 22280490, 22305008, 22306424,
                22308631, 22311141, 22309205, 22282324, 22300028, 22304974,
                22310621,
                )),
            ('Kinesin (4)', 'orange', '*', (
                22315618, 22315593, 21281259, 22311367,
                )),
            ('Myosin (1)', 'orange', 's', (
                22309053,
                )),
            ('Laminin (1)', 'orange', 'D', (
                22298677,
                )),
            ('PDE (4)', 'orange', 'h', (
                22310871, 22305574, 22062107, 22182893,
                )),
            ('SEMA (1)', 'orange', '^', (
                22313103,
                )),
            ('TNFR (10)', 'orange', 'p', (
                22315702, 22296245, 22279088, 22187182, 22185181, 22258821,
                22301119, 22233521, 22304998, 22309449,
                )),
            ]

    if False:
        set_id = 109   # ppod human, mouse
        cr_id = 492    # clustering using only human, mouse
        nc_score = 0.425 # 0.2 is near the optimal on the family set, by F

        title_prefix = "set_id: %(set_id)d, cr_id: %(cr_id)d, nc_score: %(nc_score)g" % locals()
        fname_prefix = date+"_set_id_%(set_id)d_cr_id_%(cr_id)d_nc_score_%(nc_score)g" % locals()
        print title_prefix
        
        clusclass = cluster_sql.hcluster( cluster_run_id = cr_id,
                                          cacheq=True)
        clustering = clusclass.cut_tree( distance = 1-nc_score,
                                         set_id_filter = set_id)
        annotation, seq_map = pq.fetch_domain_map(set_id = set_id)

        # NC >= 0.425 human and mouse clustering.  Cluster run 492
        family_categories = [
            ('ACSL (1)', 'red', 'o', (
                    21313189, 
                      )),
            ('FGF (1)', 'red', 'v', (
                    21315146,
                    )),
            ('FOX (1)', 'red', '*', (
                    21313208,
                    )),
            ('Tbox (1)', 'red', 's', (
                    21311153,
                    )),
            ('TNF (10)', 'red', 'D', (
                    21316487, 21272651, 21316821, 21299390, 21300913, 21310500, 
                    21307641, 21308486, 21315211, 21314609,
                    )),
            ('WNT (1)', 'red', 'h', (
                    21303249,
                     )),

            ('DVL (1)', 'cyan', 'o', (
                    21304630,
                    )),
            ('GATA (1)', 'cyan', 'v', (
                    21316577,
                )),
            ('KIR (1)', 'cyan', '*', (
                    21316514,
                )),
            ('Notch (1)', 'cyan', 's', (
                    21316692,
                )),
            ('TRAF (1)', 'cyan', 'D', (
                     21314576,
                )),
            ('USP (2)', 'cyan', 'h', (
                    21316737, 21315992, 21304961,  21311664,
                )),

            ('ADAM (2)', 'orange', 'o', (
                    21313069, 21236866,
                )),
            ('Kinase (26)', 'orange', 'v', (
                    21316419, 21316376, 21315534, 21277258, 21278407, 21314469, 
                    21316842, 21314831, 21316383, 21299608, 21301681, 21305535, 
                    21307676, 21308342, 21311039, 21284844, 21296022, 21306467, 
                    21316541, 21311060, 21316201, 21316768, 21297654, 21312761, 
                    21315620, 21314960,
                )),
            ('Kinesin (3)', 'orange', '*', (
                    21314100, 21276203, 21281259,
                )),
            ('Myosin (1)', 'orange', 's', (
                    21314369,
                )),
            ('Laminin (1)', 'orange', 'D', (
                    21316181,
                )),
            ('PDE (4)', 'orange', 'h', (
                    21313177, 21314552,  21285611,  21297469,
                )),
            ('SEMA (1)', 'orange', '^', (
                    21313870,
                )),
            ('TNFR (8)', 'orange', 'p', (
                    21316655, 21315830, 21305405, 21307847, 21308513, 21311412, 
                    21296209, 21316381,
                )),
            ]


    if False:
        set_id = 112   # ppod human
        cr_id = 493    # clustering using only human, mouse
        nc_score = 0.425 # 0.2 is near the optimal on the family set, by F

        title_prefix = "set_id: %(set_id)d, cr_id: %(cr_id)d, nc_score: %(nc_score)g" % locals()
        fname_prefix = date+"_set_id_%(set_id)d_cr_id_%(cr_id)d_nc_score_%(nc_score)g" % locals()
        print title_prefix
        
        clusclass = cluster_sql.hcluster( cluster_run_id = cr_id,
                                          cacheq=True)
        clustering = clusclass.cut_tree( distance = 1-nc_score,
                                         set_id_filter = set_id)
        annotation, seq_map = pq.fetch_domain_map(set_id = set_id)

        # NC >= 0.425 human only clustering.  Cluster run 493
        family_categories = [
            ('ACSL (1)', 'red', 'o', (
                     21242653,
                      )),
            ('FGF (1)', 'red', 'v', (
                    21244298,
                    )),
            ('FOX (1)', 'red', '*', (
                    21243581,
                    )),
            ('Tbox (1)', 'red', 's', (
                    21242169,
                    )),
            ('TNF (9)', 'red', 'D', (
                    21245688, 21245126, 21243962, 21215287, 21215304,
                    21229672, 21229676, 21229678, 21229679
                    )),
            ('WNT (1)', 'red', 'h', (
                    21237720,
                     )),

            ('DVL (1)', 'cyan', 'o', (
                    21238897,
                    )),
            ('GATA (1)', 'cyan', 'v', (
                    21245679,
                )),
            ('KIR (1)', 'cyan', '*', (
                    21243239,
                )),
            ('Notch (1)', 'cyan', 's', (
                    21245934,
                )),
            ('TRAF (1)', 'cyan', 'D', (
                    21244073,
                )),
            ('USP (4)', 'cyan', 'h', (
                 21245878, 21245218,  21244316,  21241838
                )),
            
            ('ADAM (1)', 'orange', 'o', (
                    21243312,             
                )),
            ('Kinase (23)', 'orange', 'v', (
                    21245461, 21245787, 21245933, 21244171, 21243163, 21244358, 
                    21244837, 21245056, 21245914, 21236259, 21241815, 21245147, 
                    21245524, 21245639, 21213153, 21220069, 21225284, 21225852, 
                    21226687, 21227798, 21228303, 21228595, 21229987
                )),
            ('Kinesin (1)', 'orange', '*', (
                    21244058,
                )),
            ('Myosin (1)', 'orange', 's', (
                    21243970,
                )),
            ('Laminin (1)', 'orange', 'D', (
                    21245934,
                )),
            ('PDE (4)', 'orange', 'h', (
                    21242946,  21244041,  21236130,  21225050
                )),
            ('SEMA (1)', 'orange', '^', (
                    21243430,
                )),
            ('TNFR (9)', 'orange', 'p', (
                    21245504, 21241020, 21245222, 21245428, 21229701, 21229704,
                    21229705, 21229822, 21229823
                )),
            ]

    if False:
        set_id = 115   # ppod mouse
        cr_id = 494    # clustering using only human, mouse
        nc_score = 0.425 # 0.2 is near the optimal on the family set, by F

        title_prefix = "set_id: %(set_id)d, cr_id: %(cr_id)d, nc_score: %(nc_score)g" % locals()
        fname_prefix = date+"_set_id_%(set_id)d_cr_id_%(cr_id)d_nc_score_%(nc_score)g" % locals()
        print title_prefix
        
        clusclass = cluster_sql.hcluster( cluster_run_id = cr_id,
                                          cacheq=True)
        clustering = clusclass.cut_tree( distance = 1-nc_score,
                                         set_id_filter = set_id)
        annotation, seq_map = pq.fetch_domain_map(set_id = set_id)

        # NC >= 0.425 mouse only clustering.  Cluster run 494
        family_categories = [
            ('ACSL (1)', 'red', 'o', (
                    21295164,
                      )),
            ('FGF (1)', 'red', 'v', (
                    21296685,
                    )),
            ('FOX (1)', 'red', '*', (
                    21294736,
                    )),
            ('Tbox (1)', 'red', 's', (
                    21293390,
                    )),
            ('TNF (10)', 'red', 'D', (
                    21297858, 21237649, 21246494, 21246528, 21272651, 21272667,
                    21272672, 21272673, 21296374, 21297944
                    )),
            ('WNT (1)', 'red', 'h', (
                    21288949,
                     )),

            ('DVL (1)', 'cyan', 'o', (
                    21290200,
                    )),
            ('GATA (1)', 'cyan', 'v', (
                    21298036,
                )),
            ('KIR (1)', 'cyan', '*', (
                    21297865,
                )),
            ('Notch (1)', 'cyan', 's', (
                    21298136,
                )),
            ('TRAF (1)', 'cyan', 'D', (
                    21296351,
                )),
            ('USP (2)', 'cyan', 'h', (
                    21298077,  21295032,
                )),

            ('ADAM (2)', 'orange', 'o', (
                    21294409, 21236866,
                )),
            ('Kinase (25)', 'orange', 'v', (
                    21297511, 21242865, 21245103, 21254660, 21263932, 21266608, 
                    21268818, 21269909, 21270469, 21273295, 21274691, 21277258, 
                    21278407, 21284351, 21287176, 21293652, 21294100, 21295548, 
                    21295621, 21296492, 21296731, 21296991, 21297697, 21297851, 
                    21297997
                )),
            ('Kinesin (4)', 'orange', '*', (
                    21295574, 21276203, 21281259, 21298163
                )),
            ('Myosin (1)', 'orange', 's', (
                    21295971,
                )),
            ('Laminin (1)', 'orange', 'D', (
                    21298136,
                )),
            ('PDE (4)', 'orange', 'h', (
                    21294829, 21263461, 21287346, 21295936
                )),
            ('SEMA (1)', 'orange', '^', (
                    21295544,
                )),
            ('TNFR (9)', 'orange', 'p', (
                    21298085, 21272710, 21272715, 21272718, 21272745, 21272977, 
                    21272978, 21297198, 21297720
                )),
            ]


    # S cerevisiae
    if False:
        set_id = 111
        cr_id = 475      # clustering using the full 600k set.  Correct log calculation
        nc_score = 0.425 # 0.2 is near the optimal on the family set, by F

        title_prefix = "set_id: %(set_id)d, cr_id: %(cr_id)d, nc_score: %(nc_score)g" % locals()
        fname_prefix = date+"_set_id_%(set_id)d_cr_id_%(cr_id)d_nc_score_%(nc_score)g" % locals()
        print title_prefix
        
        clusclass = cluster_sql.hcluster( cluster_run_id = cr_id,
                                          cacheq=True)
        clustering = clusclass.cut_tree( distance = 1-nc_score,
                                         set_id_filter = set_id)
        annotation, seq_map = pq.fetch_domain_map(set_id = set_id)

    # S purpuratus
    if False:
        set_id = 114
        cr_id = 475      # clustering using the full 600k set.  Correct log calculation
        nc_score = 0.425 # 0.2 is near the optimal on the family set, by F

        title_prefix = "set_id: %(set_id)d, cr_id: %(cr_id)d, nc_score: %(nc_score)g" % locals()
        fname_prefix = date+"_set_id_%(set_id)d_cr_id_%(cr_id)d_nc_score_%(nc_score)g" % locals()
        print title_prefix
        
        clusclass = cluster_sql.hcluster( cluster_run_id = cr_id,
                                          cacheq=True)
        clustering = clusclass.cut_tree( distance = 1-nc_score,
                                         set_id_filter = set_id)
        annotation, seq_map = pq.fetch_domain_map(set_id = set_id)

    # C elegans
    if False:
        set_id = 116
        cr_id = 475      # clustering using the full 600k set.  Correct log calculation
        nc_score = 0.425 # 0.2 is near the optimal on the family set, by F

        title_prefix = "set_id: %(set_id)d, cr_id: %(cr_id)d, nc_score: %(nc_score)g" % locals()
        fname_prefix = date+"_set_id_%(set_id)d_cr_id_%(cr_id)d_nc_score_%(nc_score)g" % locals()
        print title_prefix
        
        clusclass = cluster_sql.hcluster( cluster_run_id = cr_id,
                                          cacheq=True)
        clustering = clusclass.cut_tree( distance = 1-nc_score,
                                         set_id_filter = set_id)
        annotation, seq_map = pq.fetch_domain_map(set_id = set_id)


    # Drosophila
    if True:
        set_id = 117
        cr_id = 475      # clustering using the full 600k set.  Correct log calculation
        nc_score = 0.425 # 0.2 is near the optimal on the family set, by F

        title_prefix = "set_id: %(set_id)d, cr_id: %(cr_id)d, nc_score: %(nc_score)g" % locals()
        fname_prefix = date+"_set_id_%(set_id)d_cr_id_%(cr_id)d_nc_score_%(nc_score)g" % locals()
        print title_prefix
        
        clusclass = cluster_sql.hcluster( cluster_run_id = cr_id,
                                          cacheq=True)
        clustering = clusclass.cut_tree( distance = 1-nc_score,
                                         set_id_filter = set_id)
        annotation, seq_map = pq.fetch_domain_map(set_id = set_id)


    ########
    fact_inst = mi_factor( clustering, annotation,
                           cluster_run_id = cr_id,
                           clustering_type = clustering_type)
 
    if False:
        fact_inst.prune_factors(['med_clmax_less_mi', # -0.28, 0.1
                                 'mean_seq_length',   # 0.05, 0
                                 #'frac_edges_bit_score',
                                 'cluster_max',       # 0.05, 0.22
                                 'med_clmax_less_psmi', # 0.05, 0.22
                                 'num_domains',       # 0.05, 0.16 Must include for pca hack
                                 'mean_num_domains',  # 0.1, -0.02

                                 'med_domain_mi_psmi',# 0.28, -0.08
                                 #'med_domain_mi',     # 0.28, 0
                                 'max_domain_mi',     # 0.28, 0
                                 
                                 # relevant only to an actual clustering
                                 #'density_bit_score', # 0.05, -0.12                             
                                 #'density',           # -0.07, -0.25  #Must include for pca hack
                                 #'mean_score',
                                 ])
                             
    fact_inst.calculate_pca()

    #################################
    # Plotting
    #################################
    if False:
        comp_x = 0
        comp_y = 1
        lfname_prefix = fname_prefix + "_pca_data_scatter_families_%d_%d" % (comp_x, comp_y)

        pyplot.rc('figure', figsize=(6,5))
        fact_inst.plot_pca_data_scatter( 
            comp_x = comp_x,
            comp_y = comp_y,
            cluster_categories = family_categories,
            subplots_adjust = {'bottom': 0.08, 'top': 0.99, 'right': 0.99, 'left':0.09},
            draw_title = False,
            dpi = 600,
            filename=lfname_prefix + ".png",
            legend_loc = 'lower right'
            )

        pyplot.rc('figure', figsize=(3.2,3.2))
        fact_inst.plot_pca_data_scatter( 
            comp_x = comp_x,
            comp_y = comp_y,
            cluster_categories = family_categories,
            subplots_adjust = {'bottom': 0.11, 'top': 0.99, 'right': 0.98, 'left':0.17},
            draw_title = False,
            dpi = 600,
            xlim = (-1.1, 5.5),
            ylim = (-5.5, 6.5),
            filename=lfname_prefix + "_zoom0.png",
            draw_legend=False
            )

        fact_inst.plot_pca_data_scatter( 
            comp_x = comp_x,
            comp_y = comp_y,
            cluster_categories = family_categories,
            subplots_adjust = {'bottom': 0.11, 'top': 0.99, 'right': 0.98, 'left':0.17},
            draw_title = False,
            dpi = 600,
            xlim = (-1.1, 1.75),
            ylim = (-2.6, 3.25),
            filename=lfname_prefix + "_zoom1.png",
            draw_legend=False
            )

        fact_inst.plot_pca_data_scatter( 
            comp_x = comp_x,
            comp_y = comp_y,
            cluster_categories = family_categories,
            subplots_adjust = {'bottom': 0.11, 'top': 0.99, 'right': 0.98, 'left':0.17},
            draw_title = False,
            dpi = 600,
            xlim = (-0.86, 0),
            ylim = (-0.5, 0.65),
            filename=lfname_prefix + "_zoom2.png",
            draw_legend=False
            )
    
    if False:
        comp_x = 0
        comp_y = 1

        for fact in ('cluster_size',
                     'num_domains', 
                     'mean_num_domains',
                     'med_clmax_less_mi', 
                     'med_clmax_less_psmi', 
                     'med_domain_psmi'
                     ):
            lfname_prefix = fname_prefix + "_pca_data_scatter_%d_%d_%s" % (comp_x, comp_y, fact)

            pyplot.rc('figure', figsize=(6,5))
            fact_inst.plot_pca_data_scatter( 
                comp_x = comp_x,
                comp_y = comp_y,
                color_by_factor = fact,
                subplots_adjust = {'bottom': 0.08, 'top': 0.98, 'right': 0.93, 'left':0.09},
                draw_title = False,
                dpi = 600,
                filename=lfname_prefix + ".png",
                draw_legend = False
                )

            pyplot.rc('figure', figsize=(3.2,3.2))
            fact_inst.plot_pca_data_scatter( 
                comp_x = comp_x,
                comp_y = comp_y,
                color_by_factor = fact,
                subplots_adjust = {'bottom': 0.11, 'top': 0.99, 'right': 0.95, 'left':0.17},
                draw_title = False,
                dpi = 600,
                xlim = (-1.1, 5.5),
                ylim = (-5.5, 6.5),
                filename=lfname_prefix + "_zoom0.png",
                draw_colorbar=False
                )
            
            fact_inst.plot_pca_data_scatter( 
                comp_x = comp_x,
                comp_y = comp_y,
                color_by_factor = fact,
                subplots_adjust = {'bottom': 0.11, 'top': 0.99, 'right': 0.95, 'left':0.17},
                draw_title = False,
                dpi = 600,
                xlim = (-1.1, 1.75),
                ylim = (-2.6, 3.25),
                filename=lfname_prefix + "_zoom1.png",
                draw_colorbar=False
                )

            fact_inst.plot_pca_data_scatter( 
                comp_x = comp_x,
                comp_y = comp_y,
                color_by_factor = fact,
                subplots_adjust = {'bottom': 0.11, 'top': 0.99, 'right': 0.95, 'left':0.17},
                draw_title = False,
                dpi = 600,
                xlim = (-0.86, 0),
                ylim = (-0.5, 0.65),
                filename=lfname_prefix + "_zoom2.png",
                draw_colorbar=False
                )


    if False:
        nsamples = 1000
        fraction = 0.75

        pyplot.rc('figure', figsize=(6,3))
        
        fact_inst.plot_pca_component_scree(
            title_prefix=title_prefix,
            dpi=150,
            filename=fname_prefix+"_pca_component_scree.pdf",
            nsamples=nsamples, sample_fraction=fraction,
            draw_title=False,
            subplots_adjust={'bottom': 0.12, 'top': 0.99, 'right': 0.99, 'left':0.07})

    if True:
        nsamples = 1000
        fraction = 0.75

        pyplot.rc('figure', figsize=(6, 5))
        Wt_array, frac_array = fact_inst.plot_pca_factor_stability(
            nsamples, fraction,
            comp_x = 0,
            comp_y = 1,
            title_prefix = title_prefix, dpi=100,
            filename=fname_prefix+"_pca_factor_stability_%d_%f_%d_%d.pdf" % (
                nsamples, fraction, 0, 1),
            xlim = (-0.3, 0.5) if cr_id==475 and set_id==109 else (-0.1,0.6) if cr_id==475 and set_id==117 else None,
            ylim = (-0.25, 0.3) if cr_id==475 and set_id==109 else (-0.5,0.2) if cr_id==475 and set_id==117 else None,
            subplots_adjust={'bottom': 0.09, 'top': 0.985, 'right': 0.98, 'left':0.1},
            draw_title=False
            )

        pyplot.rc('figure', figsize=(3.2,3.2))
        for comp_x, comp_y in ((0,2), (1,2)):
            Wt_array, frac_array = fact_inst.plot_pca_factor_stability(
                nsamples, fraction,
                comp_x = comp_x,
                comp_y = comp_y,
                title_prefix = title_prefix, dpi=100,
                filename=fname_prefix+"_pca_factor_stability_%d_%f_%d_%d.pdf" % (
                nsamples, fraction, comp_x, comp_y),
                draw_title=False,
                subplots_adjust={'bottom': 0.11, 'top': 0.98, 'right': 0.91, 'left':0.19},
                xlim = (-0.3, 0.65) if cr_id==475 and set_id==109 else (-0.1,0.6) if cr_id==475 and set_id==117 and comp_x==0 else None,
                ylim = (-0.15, 0.2) if cr_id==475 and set_id==109 else None if cr_id==475 and set_id==117 else None,
                )

    if False:

        for comp_x, comp_y, figsize in ((0,1, (5.75,5)), 
                                        (0,2, (3.2,2.6)), 
                                        (1,2, (3.2,2.6))
                               ):
            if figsize[0] == 5.75:
                subplots_adjust={'bottom': 0.08, 'top': 0.99, 'right': 0.95, 'left':0.05}
            else:
                subplots_adjust={'bottom': 0.15, 'top': 0.99, 'right': 0.95, 'left':0.05}
            
            pyplot.rc('figure', figsize=figsize)
            xy = fact_inst.plot_pca_data_heatmap(
                comp_x = comp_x,
                comp_y = comp_y,
                title_prefix = title_prefix,
                dpi = 150,
                filename = fname_prefix + "_pca_heatmap_%d_%d.pdf" % (comp_x, comp_y),
                hres=100, vres=100,
                draw_title=False,
                subplots_adjust=subplots_adjust
                )

            #fact_inst.plot_pca_data_scatter( 
            #    comp_x = comp_x,
            #    comp_y = comp_y,
            #    title_prefix = title_prefix,
            #    dpi = 150,
            #    filename = fname_prefix + "_pca_data_scatter_%d_%d.pdf" % (comp_x, comp_y),
            #    draw_title=False,
            #    subplots_adjust={'bottom': 0.11, 'top': 0.99, 'right': 0.91, 'left':0.17},
            #    )
                
    if False:
        pyplot.rc('figure', figsize=(9,8))
        #for fact_x, fact_y in (('cluster_max','num_domains'),
        #                       ('cluster_max','mean_num_domains'),
        #                       ('cluster_max', 'max_domain_mi'),
        #                       ('max_domain_mi', 'med_clmax_less_mi'),
        #                       ('max_domain_mi', 'density'),
        #                       ('density', 'density_bit_score'),
        #                       ('density', 'cluster_max'),
        #                       ('density', 'mean_seq_length'),
        #                       ('density', 'mean_num_domains'),
        #                       ('density_bit_score', 'mean_seq_length'),
        #                       ):
        factors = fact_inst.fact_index.keys()
        for i, fact_x in enumerate(factors):
            for fact_y in factors[i+1:]:
                X, Y, xy = fact_inst.plot_factor_heatmap(
                    fact_x = fact_x,
                    fact_y = fact_y,
                    title_prefix = title_prefix,
                    dpi = 100,
                    filename = fname_prefix + "_factor_heatmap_%s_%s.png" % (fact_x, fact_y),
                    hres=100, vres=100)

                fact_inst.plot_factor_scatter(
                    fact_x = fact_x,
                    fact_y = fact_y,
                    title_prefix = title_prefix,
                    dpi = 100,
                    filename = fname_prefix + "_factor_scatter_%s_%s.png" % (fact_x, fact_y))


    if False:
        pyplot.rc('figure', figsize=(6.5,4))
        #pyplot.rc('text', usetex=False)  # breaks for some reason
        
        for comp in (1,2):
            order = fact_inst.projection[:,comp].argsort()
            clusters = [fact_inst.cl_index_inv[i] for i in order]
            f = fact_inst.scatter_cluster_mi_specific(
                cluster_list=clusters,
                dpi=300, # png
                grid=False,
                xticks=False,
                pointsize=0.5,
                xlimit=(-100, 10100), # fit the points well inside the axis frame
                filename = fname_prefix + "_scatter_cluster_mi_specific_pca%d.png" % comp
                )

        if False:
            pyplot.rc('figure', figsize=(6.5,4))
            #pyplot.rc('text', usetex=False)  # breaks for some reason

            fact_inst.scatter_cluster_mi_specific(
            cluster_list=clusters[:5300],
            dpi=300, # png
            grid=True,
            xticks=False,
            pointsize=0.5,
            xlimit=(-20, 5320), # fit the points well inside the axis frame
            filename = fname_prefix + "_scatter_cluster_mi_specific_pca0_0_5300.png"
            )

        fact_inst.scatter_cluster_mi_specific(
            cluster_list=clusters[:1000],
            dpi=300, # png
            grid=False,
            xticks=True,
            pointsize=0.5,
            xlimit=(-10, 1000), # fit the points well inside the axis frame
            filename = fname_prefix + "_scatter_cluster_mi_specific_pca0_0_1000.png")

        fact_inst.scatter_cluster_mi_specific(
            cluster_list=clusters[1000:2000],
            dpi=300, # png
            grid=True,
            xticks=False,
            pointsize=0.5,
            xlimit=(-10, 1000), # fit the points well inside the axis frame
            filename = fname_prefix + "_scatter_cluster_mi_specific_pca0_1000_2000.png")

        fact_inst.scatter_cluster_mi_specific(
            cluster_list=clusters[1500:1600],
            dpi=150, 
            grid=True,
            xticks=False,
            pointsize=5,
            xlimit=(-1, 101), # fit the points well inside the axis frame
            filename = fname_prefix + "_scatter_cluster_mi_specific_pca0_1500_1600.pdf")

        fact_inst.scatter_cluster_mi_specific(
            cluster_list=clusters[1600:1700],
            dpi=150, 
            grid=True,
            xticks=False,
            pointsize=10,
            xlimit=(-1, 101), # fit the points well inside the axis frame
            filename = fname_prefix + "_scatter_cluster_mi_specific_pca0_1600_1700.pdf")

        fact_inst.scatter_cluster_mi_specific(
            cluster_list=clusters[5000:5300],
            dpi=300, # png
            grid=True,
            xticks=False,
            pointsize=10,
            xlimit=(-5, 305), # fit the points well inside the axis frame
            filename = fname_prefix + "_scatter_cluster_mi_specific_pca0_5000_5300.png")

        fact_inst.scatter_cluster_mi_specific(
            cluster_list=clusters[5225:5300],
            dpi=300, # png
            grid=True,
            xticks=False,
            pointsize=10,
            xlimit=(-1, 76), # fit the points well inside the axis frame
            filename = fname_prefix + "_scatter_cluster_mi_specific_pca0_5225_5300.png")

        fact_inst.scatter_cluster_mi_specific(
            cluster_list=clusters[:45],
            dpi=150, # pdf
            grid=True,
            pointsize=10,
            xlimit=(-0.5, 44.5), # fit the points well inside the axis frame
            filename = fname_prefix + "_scatter_cluster_mi_specific_pca0_0_44.pdf"
            )

        fact_inst.scatter_cluster_mi_specific(
            cluster_list=clusters[4900:4950],
            dpi=150, # pdf
            grid=True,
            pointsize=10,
            xlimit=(-0.5, 44.5), # fit the points well inside the axis frame
            filename = fname_prefix + "_scatter_cluster_mi_specific_pca0_4900_4944.pdf"
            )

        fact_inst.scatter_cluster_mi_specific(
            cluster_list=clusters[5200:5250],
            dpi=150, # pdf
            grid=True,
            pointsize=10,
            xlimit=(-0.5, 44.5), # fit the points well inside the axis frame
            filename = fname_prefix + "_scatter_cluster_mi_specific_pca0_5200_5244.pdf"
            )

# projection = fact_inst.pca.project(fact_inst.cluster_fact)
# projection[:,0] > 60).nonzero()[0][0]
# fact_inst.cl_index_inv[6826]

# for cluster in fact_inst.constrain_clusters( [(0,None,0),(1,2,None)]):
#     print fact_inst.print_cluster_data(cluster)

# order = fact_inst.projection[:,0].argsort()
# clusters = [fact_inst.cl_index_inv[i] for i in order]
# fact_inst.scatter_cluster_mi_specific(cluster_list=clusters)