#!/usr/bin/env python

# Jacob Joseph
# 2 December 2010

# Tools to consider the correspondence of domains and identified
# clusters:
#
# * Mutual information of a clustering with each domain

import numpy
from math import log

#from DurandDB import familyq, pfamq
#from JJcluster import cluster_sql


class mutualinfo(object):
    """clustering: dictionary of clusters mapping to a list/set of
    seq_ids.  Each seq_id must occur in exactly one cluster.

    annotation: dictionary of annotations (e.g., domains) mapping to a
    list/set of seq_ids.  Here, a seq_id may be found in several
    annotation keys.  Mutual information will be calculated between
    the clusters and each annotation key.
    """

    def __init__(self, clustering, annotation):
        self.cluster_to_seq = clustering
        self.annotation_to_seq = annotation

        # invert the clustering: each sequence belongs to exactly one cluster
        self.seq_to_cluster = {}
        for cluster_id, seq_ids in self.cluster_to_seq.iteritems():
            for seq_id in seq_ids:
                self.seq_to_cluster[seq_id] = cluster_id

        # invert the annotation: a sequence may carry several annotations
        self.seq_to_annotation = {}
        for ann, seq_ids in self.annotation_to_seq.iteritems():
            for seq_id in seq_ids:
                try:
                    self.seq_to_annotation[seq_id].add(ann)
                except KeyError:
                    self.seq_to_annotation[seq_id] = set((ann,))

    def annotations_in_cluster(self, cluster):
        """Return the set of annotations carried by any sequence in a cluster."""
        anns = set()
        for seq_id in self.cluster_to_seq[cluster]:
            anns.update(self.seq_to_annotation.get(seq_id, []))
        return anns

    def clustering_entropy_old(self):
        """Entropy of the clustering, computed with a Python loop."""
        N = len(self.seq_to_cluster)
        #print "N", N
        H = 0
        for cluster in self.cluster_to_seq:
            ck = float(len(self.cluster_to_seq[cluster]))
            #print "ck", cluster, ck, ck / N * log( ck / N, 2)
            H += ck / N * log(ck / N, 2)
        return -H

    def clustering_entropy(self):
        """Entropy of the clustering, computed with numpy."""
        sizes = [len(seq_ids) for seq_ids in self.cluster_to_seq.values()]
        A = numpy.array(sizes, dtype=numpy.double)
        A = A / numpy.sum(A)
        A = A * numpy.log2(A)
        return -numpy.sum(A)  # entropy

    def annotation_entropy(self, ann):
        """Entropy of the binary partition induced by one annotation:
        sequences with the annotation versus all others."""
        N = len(self.seq_to_cluster)
        dl = float(len(self.annotation_to_seq[ann]))
        H = 0
        for cnt in (dl, N - dl):
            #print "cnt", cnt, cnt / N * log( cnt / N, 2)
            H += cnt / N * log(cnt / N, 2)
        return -H

    def joint_cluster_ann_count(self, ann):
        """Return a dictionary by cluster of how many sequences have the
        given annotation.  Only non-zero counts are included."""
        joint_cnt = {}
        for seq_id in self.annotation_to_seq[ann]:
            cluster = self.seq_to_cluster[seq_id]
            try:
                joint_cnt[cluster] += 1
            except KeyError:
                joint_cnt[cluster] = 1
        return joint_cnt
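
    # Editorial sketch, not part of the original module: a doctest-style
    # illustration of joint_cluster_ann_count() using the toy data from the
    # __main__ block at the bottom of this file.  Annotation 'e' covers
    # seq_ids (6, 7, 12, 13), two in each cluster, so:
    #
    #   >>> mi = mutualinfo({0: (0, 1, 2, 3, 4, 5, 6, 7),
    #   ...                  1: (8, 9, 10, 11, 12, 13)},
    #   ...                 {'e': (6, 7, 12, 13)})
    #   >>> sorted(mi.joint_cluster_ann_count('e').items())
    #   [(0, 2), (1, 2)]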

    def joint_ann_count(self, annotations, prepend_clustering=False):
        """Return a contingency table of sequences that have these domains.
        Dimensions are in order of the list of annotations."""
        cont = numpy.zeros([2] * len(annotations), dtype=numpy.int)

        # currently works for 2 dimensions only
        assert len(annotations) == 2, "!= 2 dimensions unimplemented"

        # populate the contingency table; index 1 means the annotation is
        # present, index 0 means it is absent
        ann0, ann1 = annotations
        cont[1,1] = len(self.annotation_to_seq[ann0].intersection(
            self.annotation_to_seq[ann1]))
        cont[1,0] = len(self.annotation_to_seq[ann0]) - cont[1,1]
        cont[0,1] = len(self.annotation_to_seq[ann1]) - cont[1,1]
        cont[0,0] = len(self.seq_to_cluster) - numpy.sum(cont)
        return cont

    def mutual_information_ann(self, annotations, prepend_clustering=False):
        """Mutual information between two annotations (not the clustering)."""
        cont = self.joint_ann_count(annotations)
        N = len(self.seq_to_cluster)
        assert N == numpy.sum(cont), "N != sum(cont)"

        norm_cont = numpy.array(cont, dtype=numpy.float) / N

        # marginals over each axis; works only for 2 dimensions
        marginal = [numpy.sum(norm_cont, axis=i)
                    for i in range(len(annotations))]

        # divide each joint probability by the product of the marginals
        #tmp = norm_cont / marginal[0]
        #tmp = (tmp.transpose() / marginal[1]).transpose()
        tmp = norm_cont / numpy.outer(marginal[0], marginal[1]).transpose()
        tmp = numpy.log2(tmp)
        mi = numpy.nansum(tmp * norm_cont)
        return mi

    def max_cluster_mi_single(self, cluster):
        """Return the maximum possible MI with a cluster.  This is the MI of
        the partitioning of the cluster against a single set of all
        else."""
        assert False, "Use max_cluster_mi instead"
        # FIXME: check that this is actually correct
        N = float(len(self.seq_to_cluster))
        cnt = float(len(self.cluster_to_seq[cluster]))
        mi = cnt / N * log(cnt / N / (cnt / N) / (cnt / N), 2)
        mi += (N-cnt) / N * log((N-cnt) / N / ((N-cnt) / N) / ((N-cnt) / N), 2)
        return mi

    def max_cluster_mi(self, cluster):
        """Return the maximum possible MI with a cluster.  This considers
        the distribution of sizes of all other clusters, which is
        necessary."""
        N = float(len(self.seq_to_cluster))
        cnt = float(len(self.cluster_to_seq[cluster]))

        # only this cluster contains the domain
        mi = cnt / N * log(cnt / N / (cnt / N) / (cnt / N), 2)

        for cluster1 in self.cluster_to_seq:
            if cluster == cluster1:
                continue
            ck = float(len(self.cluster_to_seq[cluster1]))
            mi += ck / N * log(ck / N
                               / ((N-cnt) / N)  # does not contain the domain
                               / (ck / N),      # cluster marginal
                               2)
        return mi

    def mutual_information(self, ann):
        """Mutual information between the clustering and one annotation."""
        N = len(self.seq_to_cluster)
        ann_cnt = float(len(self.annotation_to_seq[ann]))
        joint_cnt = self.joint_cluster_ann_count(ann)

        # sum over all clusters, and over presence/absence of the annotation
        mi = 0
        for cluster in self.cluster_to_seq:
            ck = float(len(self.cluster_to_seq[cluster]))
            jointk = float(joint_cnt.get(cluster, 0))
            for (cnt, ann_marginal) in ((ck - jointk, N - ann_cnt),  # without
                                        (jointk, ann_cnt)):          # with
                if cnt == 0:
                    continue
                #if jointk > 0:
                #    print "MI", cluster, ck, jointk, cnt, ann_marginal, \
                #        cnt / N * log( cnt / N / (ck / N) / (ann_marginal / N), 2)
                mi += cnt / N * log(cnt / N
                                    / (ck / N)
                                    / (ann_marginal / N), 2)
        return mi
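
    # Editorial sketch, not part of the original module: mutual_information()
    # above implements
    #
    #   I(C; A) = sum over clusters c_k and over presence/absence x of
    #             p(c_k, x) * log2( p(c_k, x) / (p(c_k) * p(x)) )
    #
    # Worked by hand on the toy data from the __main__ block below (the
    # numbers come only from that example): domain 'a' covers 4 of 14
    # sequences, all of them in cluster 0 (8 sequences), so
    #
    #   I = 4/14*log2(14/8) + 4/14*log2(0.7) + 6/14*log2(1.4) ~= 0.292 bits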

    def mutual_information_specific_array(self):
        """Return a tuple of:
        (1) a length |annotations| array of the mutual information of
            each domain,
        (2) a |clusters| x |domains| matrix of the domain-containing
            contribution to SMI for each cluster and domain,
        (3) a |clusters| x |domains| matrix of the domain-absence
            contribution to SMI for each cluster and domain,
        (4) a dictionary mapping clusters to indices in these arrays,
        (5) a dictionary mapping domains to indices."""
        # FIXME: This uses quite a lot of memory...
        N = len(self.seq_to_cluster)
        JOINT = numpy.zeros((len(self.cluster_to_seq),
                             len(self.annotation_to_seq)),
                            dtype=numpy.int)
        CK = numpy.zeros((len(self.cluster_to_seq), 1), dtype=numpy.int)

        # array indices
        ann_index = dict(zip(sorted(self.annotation_to_seq.keys()),
                             range(len(self.annotation_to_seq))))
        cl_index = dict(zip(sorted(self.cluster_to_seq.keys()),
                            range(len(self.cluster_to_seq))))

        # cluster sums
        for cluster in self.cluster_to_seq:
            CK[cl_index[cluster]] = len(self.cluster_to_seq[cluster])

        # joint counts
        for ann in self.annotation_to_seq:
            joint_cnt = self.joint_cluster_ann_count(ann)
            ann_i = ann_index[ann]
            for (cluster, cnt) in sorted(joint_cnt.items()):
                cluster_i = cl_index[cluster]
                JOINT[cluster_i, ann_i] = cnt

        # count of sequences without the domain in each cluster.
        # Probably not needed, but convenient for thinking about.
        nJOINT = CK - JOINT

        # domain counts
        DK = numpy.sum(JOINT, axis=0)
        nDK = numpy.sum(nJOINT, axis=0)

        # domain-containing specific MI
        posMI = numpy.array(JOINT, dtype=numpy.double)
        posMI = posMI / N * numpy.log2(posMI / CK / DK * N)
        posMI = numpy.nan_to_num(posMI)

        # non-domain-containing specific MI
        negMI = numpy.array(nJOINT, dtype=numpy.double)
        negMI = negMI / N * numpy.log2(negMI / CK / nDK * N)
        negMI = numpy.nan_to_num(negMI)

        # mutual information of each domain: sum the SMI over all clusters
        MI = numpy.sum(posMI + negMI, axis=0)

        return MI, posMI, negMI, cl_index, ann_index
        #return MI, cl_index, ann_index

    def mutual_information_allann(self):
        """Mutual information of the clustering with every annotation."""
        mi_dict = {}
        for ann in self.annotation_to_seq:
            mi_dict[ann] = self.mutual_information(ann)
        return mi_dict

    def format_table(self, annotation_label=None):
        """Return a text table of entropy and mutual information for every
        annotation, sorted by decreasing mutual information."""
        all_entropy = {}
        all_mi = {}
        for domain in self.annotation_to_seq:
            all_entropy[domain] = self.annotation_entropy(domain)
            all_mi[domain] = self.mutual_information(domain)

        s = "Sequences: %d, Clusters: %d, Domains: %d\n" % (
            len(self.seq_to_cluster),
            len(self.cluster_to_seq),
            len(self.annotation_to_seq))
        s += "Clustering entropy: %g\n" % self.clustering_entropy()
        s += "      log(#seqs,2): %g\n" % log(len(self.seq_to_cluster), 2)
        s += "\n"
        s += "Domain      Label            #dom #clust TotSize  Entropy     Mutual Inf.  MI/H\n"
        s += "----------------------------------------------------------------------\n"
        for (domain, minf) in sorted(all_mi.items(), key=lambda a: a[1],
                                     reverse=True):
            joint_cnt = self.joint_cluster_ann_count(domain)
            total_cluster_cnt = sum((len(self.cluster_to_seq[cluster])
                                     for cluster in joint_cnt))
            s += "%-11s %-16s%4d%7d%8d  %0.4e  %0.4e %0.3g\n" % (
                domain,
                annotation_label[domain] if annotation_label is not None else "",
                len(self.annotation_to_seq[domain]),
                len(joint_cnt),
                total_cluster_cnt,
                all_entropy[domain],
                minf,
                minf / all_entropy[domain])
        return s
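
# Editorial sketch, not part of the original module: a minimal, self-contained
# usage example of the class above.  The helper name and the toy dictionaries
# are hypothetical (they mirror the test data in the __main__ block below).
def _demo_mutualinfo():
    clustering = {0: (0, 1, 2, 3, 4, 5, 6, 7),
                  1: (8, 9, 10, 11, 12, 13)}
    annotation = {'a': (0, 1, 2, 3),
                  'e': (6, 7, 12, 13)}
    mi = mutualinfo(clustering, annotation)
    for ann in sorted(annotation):
        print("%s  H=%.4f  MI=%.4f" % (ann,
                                       mi.annotation_entropy(ann),
                                       mi.mutual_information(ann)))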

if __name__ == "__main__":

    #pq = pfamq.pfamq()
    #annotation, seq_map = pq.fetch_domain_map(set_id = set_id)

    # Full Mouse and human set
    # Just the human and mouse families
    if False:
        set_id = 109  # ppod human, mouse
        family_set_name = 'ppod_20_clean'
        #family_set_name = 'hgnc_apr10_selected'
        print "family_set_name: %s, set_id: %d\n" % (family_set_name, set_id)

        fq = familyq.familyq(family_set_name=family_set_name)
        clustering = fq.fetch_families()[0]
        annotation, seq_map = pq.fetch_domain_map(
            set_id = set_id,
            seq_ids = tuple(reduce(lambda a, b: a + b,
                                   [list(clustering[fam]) for fam in clustering])))

    # Full mouse and human set
    if False:
        set_id = 109    # ppod human, mouse
        cr_id = 472     # clustering using the full 600k set
        nc_score = 0.2  # near the optimal
        print "set_id: %(set_id)d, cr_id: %(cr_id)d, nc_score: %(nc_score)g" % locals()

        clusclass = cluster_sql.hcluster( cluster_run_id = cr_id, cacheq=True)
        clustering = clusclass.cut_tree( distance = 1 - nc_score,
                                         set_id_filter = set_id)
        annotation, seq_map = pq.fetch_domain_map(set_id = set_id)

    if False:
        set_id = 109  # ppod human, mouse
        #cr_id = 381  # jaccard clustering, all
        cr_id = 383   # jaccard clustering, human, mouse
        print "jaccard clustering. set_id: %(set_id)d, cr_id: %(cr_id)d" % locals()

        clusclass = cluster_sql.flatcluster( cluster_run_id = cr_id)
        clustering = clusclass.fetch_clusters()
        annotation, seq_map = pq.fetch_domain_map(set_id = set_id)

    # test
    clustering = {0: (0, 1, 2, 3, 4, 5, 6, 7),
                  1: (8, 9, 10, 11, 12, 13)}
    annotation = {'a': (0, 1, 2, 3),
                  'b': (4, 5),
                  'c': (8, 9, 10, 11),
                  'd': (8, 9, 10, 11),
                  'e': (6, 7, 12, 13)}

    mi = mutualinfo( clustering, annotation)

    #domain_entropy = {}
    #domain_mi = {}
    #for domain in annotation:
    #    domain_entropy[domain] = mi.annotation_entropy(domain)
    #    domain_mi[domain] = mi.mutual_information(domain)

    #annotation_label = {}
    #for domain in annotation:
    #    annotation_label[domain] = pq.lookup_domain_name(domain)

    #print mi.format_table( annotation_label)
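
    # Editorial sketch, not part of the original module: exercise the toy
    # object built above.  Only the in-memory test dictionaries are used; the
    # database-backed branches earlier in this block are disabled (if False).
    print("Clustering entropy: %.4f" % mi.clustering_entropy())
    for domain in sorted(annotation):
        h = mi.annotation_entropy(domain)
        m = mi.mutual_information(domain)
        print("%-2s  H=%.4f  MI=%.4f  MI/H=%.3f" % (domain, h, m, m / h))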