#!/usr/bin/env python

# Jacob Joseph
# 11 June 2008

from DurandDB import pfamq
import cluster_sql
import numpy
from cluster_stat_brohee import brohee_stat

#import cPickle
#from IPython.Shell import IPShellEmbed

class stat(brohee_stat):
    """Extended network clustering statistics.  While often similar to
    Brohee's, these account for limited annotation of the set of
    proteins."""

    def __init__(self, **args):
        brohee_stat.__init__(self, **args)

    def overall_recall(self, family_set=None):
        """Weighted (by family size and cluster representation)
        average of recall"""

        T = self.cont
        row_sums = numpy.sum(T, axis=1)

        if family_set is None:
            family_set = self.family_ord.keys()

        recsum = 0.0
        norm_sum = 0
        for family_abbrev in family_set:
            i = self.family_ord[family_abbrev]
            recsum += row_sums[i] * self.family_wise_recall(
                family_abbrev, weighted_avg=True)
            norm_sum += self.N[i]
            
        # normalize by the size of all families
        rec_wt = recsum / norm_sum
        return rec_wt

    def family_wise_precision(self, family_abbrev):
        """Precision of a family weighted over all clusters"""

        T = self.cont
        i = self.family_ord[family_abbrev]

        nonzero_clusters = numpy.nonzero( T[i,:])[0]
        col_sums = self.M

        prec = numpy.sum( numpy.array( T[i, nonzero_clusters]**2, dtype=float)
                          / col_sums[nonzero_clusters]) / self.N[i]
        return prec

    def overall_precision(self, family_set=None):
        """Weighted (by family size and cluster representation)
        average of precision"""

        T = self.cont
        row_sums = numpy.sum(T, axis=1)

        if family_set is None:
            family_set = self.family_ord.keys()
        
        precsum = 0.0
        norm_sum = 0
        for family_abbrev in family_set:
            i = self.family_ord[family_abbrev]
            precsum += row_sums[i] * self.family_wise_precision( family_abbrev)
            norm_sum += self.N[i]

        # normalize by the size of all families
        prec_wt = precsum / norm_sum
        return prec_wt

    ######################################
    ## F - statistic
    ######################################
    def F(self, family_abbrev, cluster_id):
        """F-statistic: harmonic mean of precision (ppv) and recall (sn)"""

        rec = self.sensitivity( family_abbrev, cluster_id)
        prec = self.precision( family_abbrev, cluster_id, use_true_csize=True)
        F = 2 * prec * rec / (prec + rec)
        return F

    def family_wise_F( self, family_abbrev):
        """Weighted (by cluster representation) average of F statistic over all
        clusters that represent a family"""

        i = self.family_ord[ family_abbrev]
        T = self.cont

        row = T[i,:]
        nonzero_clusters = numpy.nonzero( row)[0]

        F_sum = 0.0
        for j in nonzero_clusters:
            cluster_id = self.cluster_ord_rev[j]
            F_sum += row[j] * self.F(family_abbrev, cluster_id)

        #print family_abbrev, F_sum, numpy.sum(row)

        F_wt = F_sum / numpy.sum(row)
        return F_wt
   
    def overall_F(self, family_set=None):
        """Weighted (by family size and cluster representation)
        average of F statistic"""

        T = self.cont
        row_sums = numpy.sum(T, axis=1)

        if family_set is None:
            family_set = self.family_ord.keys()

        F_sum = 0.0
        norm_sum = 0
        for family_abbrev in family_set:
            i = self.family_ord[family_abbrev]
            F_sum += row_sums[i] * self.family_wise_F( family_abbrev)
            norm_sum += self.N[i]

        # normalize by the size of all families
        F_wt = F_sum / norm_sum
        return F_wt


    ######################################
    ## Accuracy
    ######################################    
    def accuracy( self, family_abbrev, cluster_id):
        """Geometric mean of precision and recall using weighted
        Precision and Recall"""

        rec = self.sensitivity( family_abbrev, cluster_id)
        prec = self.precision( family_abbrev, cluster_id, use_true_csize=True)
        acc = numpy.sqrt( prec * rec)
        return acc

    def family_wise_accuracy( self, family_abbrev):
        """Average of accuracy weighted by family size"""
        
        i = self.family_ord[ family_abbrev]
        T = self.cont

        row = T[i,:]
        nonzero_clusters = numpy.nonzero( row)[0]

        acc_sum = 0.0
        for j in nonzero_clusters:
            cluster_id = self.cluster_ord_rev[j]
            acc_sum += row[j] * self.accuracy(family_abbrev, cluster_id)

        acc_wt = acc_sum / numpy.sum(row)
        return acc_wt

    def overall_accuracy(self, family_set=None):
        """Weighted average of accuracy"""

        T = self.cont
        row_sums = numpy.sum(T, axis=1)

        if family_set is None:
            family_set = self.family_ord.keys()

        acc_sum = 0.0
        norm_sum = 0
        for family_abbrev in family_set:
            i = self.family_ord[family_abbrev]
            acc_sum += row_sums[i] * self.family_wise_accuracy( family_abbrev)
            norm_sum += self.N[i]

        # normalize by the size of all families
        acc_wt = acc_sum / norm_sum
        return acc_wt
    

    ######################################
    ## "Unity"
    ######################################
    def family_unity(self, family_abbrev):
        """Family unity: weighted average of fraction across
        the clusters that represent a family"""

        i = self.family_ord[ family_abbrev]
        T = self.cont
        row = numpy.array(T[i,:], dtype=float)
        row_fraction = row / numpy.sum( row)

        frag_wt = numpy.sum( row_fraction**2)
        return frag_wt


    def overall_unity(self, family_set=None):
        """Average of family unity weighted by family_size"""
        T = self.cont

        if family_set is None:
            family_set = self.family_ord.keys()

        # row fractions
        #row_sums_t = numpy.array( [numpy.sum(T, axis=1)]).transpose()
        #row_fractions = numpy.array(T, dtype=float) / row_sums_t

        #row_wt = numpy.sum(numpy.array(T, dtype=float)**2 / row_sums_t
        #                   ) / numpy.sum(row_sums_t)
        
        frag_sum = 0.0
        norm_sum = 0
        for family_abbrev in family_set:
            i = self.family_ord[ family_abbrev]
            frag_sum += self.N[i] * self.family_unity(family_abbrev)
            norm_sum += self.N[i]
            
        frac = frag_sum / norm_sum

        #if frac != row_wt:
        #    print "inequal row_wt and frac:", row_wt, frac
        return frac
    

    def family_fraction(self, family_abbrev):
        """Family size / size of union of containing clusters"""

        T = self.cont
        i = self.family_ord[ family_abbrev]
        
        nonzero_clusters = numpy.nonzero( T[i,:])[0]
        col_sums = self.M

        frac = float(numpy.sum( T[i,:])) / numpy.sum(col_sums[ nonzero_clusters])
        return frac


    def overall_fraction(self, family_set=None):
        """Weighted (by family size and cluster representation)
        average of family fraction"""

        T = self.cont
        row_sums = numpy.sum(T, axis=1)

        if family_set is None:
            family_set = self.family_ord.keys()

        precsum = 0.0
        norm_sum = 0
        for family_abbrev in family_set:
            i = self.family_ord[ family_abbrev]
            precsum += row_sums[i] * self.family_fraction( family_abbrev)
            norm_sum += row_sums[i]

        # normalize by the size of all families
        prec_wt = precsum / norm_sum
        return prec_wt

    ####################################
    # PFAM statistics
    ####################################
    def build_pfam_maps(self, set_id=79):
        """Return clust_to_pfam, pfam_to_clust, ident_map, promisc_map dictionaries"""
        pfq = pfamq.pfamq( cacheq=True)

        clust_to_pfam = {}
        pfam_to_clust = {}

        for (cluster_id, seq_id_list) in self.cluster_sets.iteritems():
            domains = pfq.get_domains( seq_id_list)
            clust_to_pfam[ cluster_id] = set(domains)
            
            for domain in domains:
                if not domain in pfam_to_clust:
                    pfam_to_clust[domain] = set()
                pfam_to_clust[domain].add(cluster_id)

        ident_map = pfq.get_identity_map()
        promis_map = pfq.get_promiscuity_map(set_id=set_id)
        
        return (clust_to_pfam, pfam_to_clust, ident_map, promis_map)
                
    def pfam_clust_cnt(self):
        """Return an ordered list of domains, and a 3 column numpy array:
        (# clusters containing domain, domain identity, domain promiscuity)"""

        (clust_to_pfam, pfam_to_clust,
         ident_map, promis_map ) = self.build_pfam_maps()

        # We can only consider domains that are in both Uniprot and Pfam22
        pfam22_set = set(ident_map.keys())
        uniprot_set = set(pfam_to_clust.keys())
        domain_list = sorted( pfam22_set.intersection( uniprot_set))

        #print "Ignoring PFAM domains:", pfam22_set.symmetric_difference( uniprot_set)

        cnt_array = numpy.zeros( (len(domain_list), 3), dtype=float)
        for (i,domain) in enumerate(domain_list):
            cnt_array[i] = ( len(pfam_to_clust[domain]),
                             ident_map[domain],
                             promis_map[domain])
        return (domain_list, cnt_array)

                    
    def get_overall_stats(self, family_set=None):
        if not self.cont_init:
            self.build_contingency()
        
        sdict = {}

        if family_set is None:
            pass
        elif isinstance(family_set, (str,)):
            family_set = (family_set,)
        elif isinstance(family_set, (set, tuple, list)):
            pass
        else:
            assert False, "Unrecognized family_set input: %s" % family_set

        if family_set==('ALL',):
            family_set = self.family_ord.keys()
        elif family_set==('ALL-kin',):
            family_set = set( self.family_ord.keys()) - set(('Kinase',))


        # Brohee class doesn't accept family_set parameter
        # All 'overall_*' functions do.
        #if family_set is None:
            # sdict['Clustering SN'] = self.clustering_wise_sn()
            # sdict['Clustering PPV'] = self.clustering_wise_ppv()
            #sdict['Clustering PPVt'] = self.clustering_wise_ppv(
            #    use_true_csize=True)
            # sdict['Accuracy'] = self.brohee_accuracy()
            #sdict['Accuracyt'] = self.brohee_accuracy(use_true_csize=True)
            # sdict['Cluster SEP'] = self.avg_cluster_wise_sep()
            # sdict['Cluster SEPt'] = self.avg_cluster_wise_sep(
            #    use_true_csize=True)
            # sdict['Family SEP'] = self.avg_complex_wise_sep()
            # sdict['Family SEPt'] = self.avg_complex_wise_sep(
            #    use_true_csize=True)
            # sdict['Family SEPst'] = self.avg_complex_wise_sep(
            #    use_true_csize=True, max_single_sep=True)
            # sdict['Clustering SEP'] = self.clustering_wise_sep()
            #sdict['Clustering SEPt'] = self.clustering_wise_sep(
            #    use_true_csize=True)
            # sdict['Clustering SEPst'] = self.clustering_wise_sep(
            #    use_true_csize=True, max_single_sep=True)

        #sdict['Unity'] = self.overall_unity(family_set=family_set)
        sdict['F'] = self.overall_F(family_set=family_set)
        sdict['Precision'] = self.overall_precision(family_set=family_set)
        sdict['Recall'] = self.overall_recall(family_set=family_set)
        #sdict['Fraction'] = self.overall_fraction(family_set=family_set)
        #sdict['Accuracy'] = self.overall_accuracy(family_set=family_set)
        return sdict

    def __str__( self):
        s = ""
        s += "======================================================\n"
        s += "Comment: " + str(self.csql.cluster_comment) + "\n"
        s += "Parameters: " + str(self.csql.cluster_params) + "\n"
        s += "======================================================\n"
        #s += "    Clustering-wise sensitivity: %0.4f\n" % self.clustering_wise_sn()
        #s += "            Clustering-wise PPV: %0.4f\n" % self.clustering_wise_ppv()
        #s += "                       Accuracy: %0.4f\n" % self.brohee_accuracy()
        #s += "Average cluster-wise separation: %0.4f\n" % self.avg_cluster_wise_sep()
        #s += "Average complex-wise separation: %0.4f\n" % self.avg_complex_wise_sep()
        #s += "     Clustering-wise separation: %0.4f\n" % self.clustering_wise_sep()
        #s += "                       Overall F: %0.4f\n" % self.overall_F()
        #s += "-----------------------------------------------------\n"
        s += "Using true cluster size (not column sum):\n"
        s += "            Clustering-wise PPV: %0.4f\n" % self.clustering_wise_ppv(use_true_csize=True)
        s += "                Brohee Accuracy: %0.4f\n" % self.brohee_accuracy(use_true_csize=True)
        s += "              Weighted Accuracy: %0.4f\n" % self.overall_accuracy()
        #s += "Average cluster-wise separation: %0.4f\n" % self.avg_cluster_wise_sep(use_true_csize=True)
        #s += "Average complex-wise separation: %0.4f\n" % self.avg_complex_wise_sep(use_true_csize=True)
        #s += "     Clustering-wise separation: %0.4f\n" % self.clustering_wise_sep(use_true_csize=True)
        s += "             Weighted Precision: %0.4f\n" % self.overall_precision()
        s += "                Weighted Recall: %0.4f\n" % self.overall_recall()
        s += "                     Weighted F: %0.4f\n" % self.overall_F()
        s += "                  Overall Unity: %0.4f\n" % self.overall_unity()
        s += "        Overall Family Fraction: %0.4f\n" % self.overall_fraction()
        #s += "-----------------------------------------------------\n"
        #s += "Using true cluster size and max single cluster separation:\n"
        #s += "Average complex-wise separation: %0.4f\n" % self.avg_complex_wise_sep(use_true_csize=True,
        #                                                                            max_single_sep=True)
        #s += "     Clustering-wise separation: %0.4f\n" % self.clustering_wise_sep(use_true_csize=True)
        s += "-----------------------------------------------------\n"
        s += "                 Total Clusters: %d\n" % len(self.cluster_ord)
        s += " Annotated (non-empty) clusters: %d\n" % self.num_annotated_clusters()
        s += "\n"
        s += "-----------------------------------------------------\n"
        s += "Family-specific statistics:\n"
        s += "  Family  Size  Row Sum  Clusters  Clusters Size     Frac   Unity  PRECwt   RECwt  RECmax       F\n"
        template = "%8s  %4d     %4d      %4d          %5d   %0.4f  %0.4f  %0.4f  %0.4f  %0.4f  %0.4f\n"
        for (family_abbrev,i) in sorted(self.family_ord.items(), key=lambda a:a[0]):
            # sum of lengths of all clusters which represent this family
            clusters_size = 0
            for j in numpy.nonzero( self.cont[i,:])[0]:
                clusters_size += len( self.cluster_sets[ self.cluster_ord_rev[j] ])
            clusters_size2 = numpy.sum( self.M[ numpy.nonzero(self.cont[i,:]) ])
            if clusters_size != clusters_size2:
                print "clusters size incorrect ", family_abbrev, clusters_size, clusters_size2
            
            s += template % (family_abbrev,
                             self.N[i],
                             numpy.sum(self.cont[i,:]),
                             numpy.size(numpy.nonzero(self.cont[i,:])),
                             clusters_size,
                             self.family_fraction( family_abbrev),
                             self.family_unity( family_abbrev),
                             self.family_wise_precision( family_abbrev),
                             self.family_wise_recall( family_abbrev, weighted_avg=True),
                             self.family_wise_recall( family_abbrev),
                             self.family_wise_F( family_abbrev))

            #s += str(numpy.nonzero( self.cont[i,:])) + "\n"
            
        s += "\n"
        s += "-----------------------------------------------------\n"
        s += "Cluster-specific statistics (non-empty clusters only):\n"
        s += "  Cluster   Size  Col Sum    Families    PPVmax\n"
        template = "%9s  %5d    %5d        %4d    %0.4f %s\n"

        # list of (cluster_id, j, c_size) to sort by size
        clusters = [ (cluster_id, j, self.M[j]
                      ) for (cluster_id, j) in self.cluster_ord.items() ]
        clusters.sort(key=lambda a: a[2], reverse=True)

        family_ord_inv = dict( (v,k) for (k,v) in self.family_ord.items())

        row_cnt = 0
        for (cluster_id, j, c_size) in clusters:            
            # only display clusters with at least one annotated member
            if numpy.sum(self.cont[:,j]) < 1:
                continue

            # abbreviate the output
            #row_cnt += 1
            #if row_cnt > 30:
            #    s+= "... (pruned at 30 clusters)\n"
            #    break
            
            s += template % (cluster_id,
                             self.M[j],
                             numpy.sum(self.cont[:,j]),
                             len(numpy.nonzero(self.cont[:,j])[0]),
                             self.cluster_wise_ppv( cluster_id, use_true_csize=True),
                             [family_ord_inv[i] for i in self.cont[:,j].nonzero()[0]]
                             )
        s += "======================================================\n"
                
        return s

        
class hclust_stat(stat):
    """Clustering statistics for hierarchical clustering"""

    def __init__(self, cr_id, **args):
        self.cr_id = cr_id
        self.csql = cluster_sql.hcluster( cluster_run_id = self.cr_id, cacheq=True)
        stat.__init__(self, **args)

    def fetch_cut(self, distance, set_id_filter=None):
        assert set_id_filter is None, "set_id_filter no longer used.  The set_id in cluster_stat_brohee will always be used."

        self.cluster_sets = self.csql.cut_tree( distance,
                                                set_id_filter = self.set_id)
        self.cont_init = False
        # self.build_contingency()

    def fetch_cut_components(self, distance, br_id, nc_id, stype):
        self.cluster_sets = self.csql.cut_tree_components( distance, br_id, nc_id, stype)
        self.cont_init = False
        #self.build_contingency()


class flatclust_stat( stat):
    """Clustering statistics for flat, non-hierarhical clustering"""

    def __init__(self, cr_id, **args):
        self.cr_id = cr_id
        self.csql = cluster_sql.flatcluster( cluster_run_id = self.cr_id, cacheq=True)
        stat.__init__(self, **args)

        self.cluster_sets = self.csql.fetch_clusters()
        self.cont_init = False
        #self.build_contingency()
        

class brohee_test(stat):
    """A test of the statistics code, using data from Brohee, Table 4"""

    def __init__(self):
        self.cont = numpy.array( [[7, 0, 0, 0, 0],
                                  [0, 6, 8, 0, 0],
                                  [0, 0, 0, 14, 3],
                                  [0, 0, 0, 4, 5]], dtype=int)

        self.cont_init = True
        self.N = numpy.array([7, 14, 20, 8], dtype=int)
        self.M = numpy.array([7, 6, 8, 18, 8], dtype=int)

        self.family_ord = {'complex_1': 0, 'complex_2': 1,
                           'complex_3': 2, 'complex_4': 3 }
        self.cluster_ord = {'cluster_1': 0, 'cluster_2': 1,
                            'cluster_3': 2, 'cluster_4': 3,
                            'cluster_5': 4}


def dbg_sets(family_abbrev, init=False, cr_id=25, nc_dist=0.9):
    if init:
        h = hclust_stat( cr_id)
        h.fetch_cut( nc_dist)
        h.build_contingency()
        
    i = h.family_ord[family_abbrev]
    
    for (cluster_id, j) in h.cluster_ord.items():
        (l, seq_ids) = h.shared_count(family_abbrev, cluster_id)
        if len(seq_ids) > 0:
            print "cluster_id: %d. Length %d" % (cluster_id, l)
            print seq_ids
            
    return h