#!/usr/bin/env python

# Jacob Joseph
# 19 June 2010

# Basis for exploring a clustering in varying ways, particularly as
# HTML.  For example, for hierarchical clustering, facilitate
# generation of a "cluster browser", annotated by network statistics
# and family annotations.

from JJcluster.cluster_sql import flatcluster, hcluster
from DurandDB import familyq, blastq
from JJnetstat.stathelper import nxstat
from JJutil import pickler, rate

import logging, os, numpy
import networkx as NX

class describe(object):

    def __init__(self, cluster_run_id, clustering_type,
                 cacheq = False,
                 pickledir = None,
                 family_set_name = None,
                 debug_level = None,  # default is warning and above,
                                      # or whatever the root logger
                                      # was configured to have
                                      # elsewhere
                 ):
        self.cacheq = cacheq

        if pickledir is not None:
            self.pickledir = pickledir
        else:
            self.pickledir = os.path.expandvars('$HOME/tmp/pickles')

        self.log = logging.getLogger('JJcluster.describe')
        if debug_level is not None:
            logging.getLogger('').setLevel(debug_level)

        if clustering_type == 'hierarchical':
            self.CR = hcluster(cluster_run_id = cluster_run_id,
                               cacheq = self.cacheq)
        else:
            self.CR = flatcluster(cluster_run_id = cluster_run_id,
                                  cacheq = self.cacheq)

        # reading queries from the disk is often slower than the database
        if family_set_name is not None:
            self.fq = familyq.familyq(family_set_name = family_set_name,
                                      cacheq = False)
        else:
            # FIXME: this will break the HTML building, which depends
            # on family functions
            self.fq = blastq.blastq(cacheq = False)
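    # A minimal construction sketch (hypothetical ids; the actual
    # cluster_run_id and family_set_name depend on the local database):
    #
    #   d = describe(cluster_run_id = 42, clustering_type = 'hierarchical',
    #                cacheq = True, family_set_name = 'my_families',
    #                debug_level = logging.DEBUG)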
    def cluster_conductance(self, cluster_id):
        """Calculate the conductance of a cluster: the total weight of
        edges crossing the cluster boundary, divided by the smaller of
        the total edge weights incident to the cluster and to its
        complement."""

        cluster_seq_set = self.CR.fetch_cluster(cluster_id)
        full_seq_set = self.fq.fetch_seq_set(br_id=self.CR.br_id,
                                             nc_id=self.CR.nc_id,
                                             set_id=self.CR.set_id_filter)

        self.log.debug("Calculating conductance: %d (size: %d)",
                       cluster_id, len(cluster_seq_set))

        boundary_sum = 0.0
        a_s = 0.0
        a_not_s = 0.0

        # Sum the edge weights incident to the cluster, accumulating
        # the weight of the edge boundary as we go.
        for query_seq in cluster_seq_set:
            d = self.fq.fetch_hits_dictarray(
                stype=self.CR.stype,
                br_id=self.CR.br_id,
                nc_id=self.CR.nc_id,
                symmetric=self.CR.symmetric,
                query_seq_id = query_seq,
                seq_id_0_set_id = self.CR.set_id_filter,  # potentially faster if specified
                set_id_filter = self.CR.set_id_filter,
                self_hits=False)
            (hit_list, score_list) = d[query_seq]
            a_s += numpy.sum(score_list)
            for i, seq_id_1 in enumerate(hit_list):
                if seq_id_1 not in cluster_seq_set:
                    boundary_sum += score_list[i]

        # FIXME: what do we do about having a symmetric matrix?

        # Sum the edge weights incident to the complement.  Stop early
        # once this exceeds a_s, since only the minimum of the two
        # sums matters below.
        for query_seq in (full_seq_set - cluster_seq_set):
            d = self.fq.fetch_hits_dictarray(
                stype=self.CR.stype,
                br_id=self.CR.br_id,
                nc_id=self.CR.nc_id,
                symmetric=self.CR.symmetric,
                query_seq_id = query_seq,
                seq_id_0_set_id = self.CR.set_id_filter,  # potentially faster if specified
                set_id_filter = self.CR.set_id_filter,
                self_hits=False)

            # a sequence may have no hits
            if not query_seq in d:
                continue
            (hit_list, score_list) = d[query_seq]
            a_not_s += numpy.sum(score_list)
            if a_not_s > a_s:
                break

        # Divide by the smaller incident-weight sum.  (The early break
        # above leaves a_not_s an underestimate only when it already
        # exceeds a_s, so min() remains correct.)
        denom = min(a_s, a_not_s)
        conductance = (boundary_sum / denom) if denom > 0 else -1

        self.log.debug("Conductance: %g, boundary: %g, a_s: %g, minimum a_not_s: %g",
                       conductance, boundary_sum, a_s, a_not_s)
        return conductance
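    # A worked toy example of the computation above (unit edge
    # weights, traced by hand; not from the original source): take
    # sequences {a, b, c, d} with hits a-b, a-c, c-d, and cluster
    # S = {a, b}.  Summing each member's hit scores gives
    # a_s = 2 (for a) + 1 (for b) = 3, with boundary_sum = 1 (the a-c
    # hit); the complement gives a_not_s = 2 (for c) + 1 (for d) = 3.
    # Conductance is then 1 / min(3, 3) = 1/3.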
    def fetch_cluster_hits(self, cluster_id, stype=None):
        """Fetch the sequence (sub)network for all nodes in cluster_id."""

        seq_set = self.CR.fetch_cluster(cluster_id)

        # Default to the stype of the clustering, but it may be useful
        # to be able to extract bit-scores after clustering with NC
        # scores
        if stype is None:
            stype = self.CR.stype

        self.log.debug("Fetching cluster: %d (stype: %s)", cluster_id, stype)

        hit_dict = {}
        qrate = rate.rate(totcount=len(seq_set))
        for query_seq in seq_set:
            qrate.increment()
            if qrate.count % 1000 == 0:
                self.log.info("%d %s", cluster_id, qrate)

            d = self.fq.fetch_hits_dictarray(
                stype=stype,
                br_id=self.CR.br_id,
                nc_id=self.CR.nc_id,
                symmetric=self.CR.symmetric,
                query_seq_id = query_seq,
                seq_id_0_set_id = self.CR.set_id_filter,  # potentially faster if specified
                seq_set = seq_set,
                self_hits=False)
            assert len(d) <= 1, "d has more than one key for query_seq %s" % query_seq
            if query_seq in d:
                hit_dict[query_seq] = d[query_seq]

        return (hit_dict, seq_set)

    def network_stats(self, cluster_id, hit_dict=None):
        if self.cacheq:
            retval = pickler.cachefn(pickledir = self.pickledir,
                                     args = "%s_%s" % (self.CR.cr_id, cluster_id))
            if retval is not None:
                return retval

        if hit_dict is None:
            # fetch_cluster_hits() returns (hit_dict, seq_set)
            (hit_dict, _seq_set) = self.fetch_cluster_hits( cluster_id)

        self.log.debug("Calculating network stats: %d (size %d)",
                       cluster_id, len(hit_dict))

        # build a networkx graph of all hits in the cluster
        G = NX.Graph()

        # add all edges
        G.add_edges_from( ( (node, nbr)
                            for (node, (hit_list, score_list)) in hit_dict.iteritems()
                            for nbr in hit_list ))

        net = nxstat(G=G, cacheq=True,
                     unique_parameters="%s_%s" % (self.CR.cr_id, cluster_id))
        net_stats = net.calc_statistics(omit_spl=True)

        if self.cacheq:
            retval = pickler.cachefn(pickledir = self.pickledir,
                                     args = "%s_%s" % (self.CR.cr_id, cluster_id),
                                     retval = net_stats)
        return net_stats
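    # Shape of the hit_dict consumed above (illustrative values only):
    # each key is a query sequence id, mapped to parallel arrays of
    # neighbor ids and scores, e.g.
    #
    #   hit_dict = {1: ([2, 3], [50.0, 12.5]),
    #               2: ([1],    [50.0])}
    #
    # so the edge generator yields (1, 2), (1, 3), and (2, 1), and the
    # undirected networkx graph collapses the duplicates.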
    def cluster_stats(self, cluster_id, hit_dict=None, seq_set=None, stype=None):
        if self.cacheq:
            retval = pickler.cachefn(pickledir = self.pickledir,
                                     args = "%s_%s_%s_%s" % (self.CR.cr_id,
                                                             self.CR.set_id_filter,
                                                             cluster_id, stype))
            if retval is not None:
                return retval

        if hit_dict is None:
            hit_dict, seq_set = self.fetch_cluster_hits( cluster_id, stype=stype)

        # FIXME: Not currently symmetric.  Repetitive, and likely
        # quite slow for very many clusters.  Some values could be
        # calculated by traversing the tree.

        num_scores = 0
        all_scores = numpy.empty([10], dtype=numpy.float64)

        # build an array of all scores within this set, growing the
        # array geometrically as needed
        for (hit_list, score_list) in hit_dict.values():
            upper_ind = num_scores + len(score_list)
            if len(all_scores) < upper_ind:
                all_scores.resize( [int(upper_ind * 1.5)])
            all_scores[num_scores:upper_ind] = score_list
            num_scores = upper_ind
        all_scores.resize([num_scores])

        #self.log.debug("cluster_stats: %d %s (len %d)" % (cluster_id, all_scores, num_scores))

        stats = {}
        stats['num_nodes'] = len(seq_set)
        stats['num_edges'] = len(all_scores)
        stats['frac_edges'] = float(stats['num_edges']) / (
            stats['num_nodes'] * (stats['num_nodes'] - 1)) if num_scores > 0 else -1
        stats['min'] = numpy.min( all_scores) if num_scores > 0 else -1
        stats['max'] = numpy.max( all_scores) if num_scores > 0 else -1
        stats['mean'] = numpy.mean( all_scores) if num_scores > 0 else -1
        stats['stdev'] = numpy.std( all_scores) if num_scores > 0 else -1
        stats['density'] = stats['mean'] * len(all_scores) / (
            len(seq_set) * (len(seq_set) - 1)) if num_scores > 0 else -1

        if self.cacheq:
            # use the same key fields as the cache lookup above,
            # including stype
            retval = pickler.cachefn(pickledir = self.pickledir,
                                     args = "%s_%s_%s_%s" % (self.CR.cr_id,
                                                             self.CR.set_id_filter,
                                                             cluster_id, stype),
                                     retval = stats)
        return stats

    def html_run_header(self, descr="", orgarg=""):
        # Minimal HTML table markup; the placeholder fields follow the
        # original template.
        s_table = """<table border="1">
<tr><td>Cluster Run: %(cr_id)d</td></tr>
<tr><td>%(cr_comment)s</td></tr>
<tr><td>%(cr_params)s</td></tr>
<tr><td>Score Type: %(stype)s</td></tr>
<tr><td>Database Set: %(set_id)d (%(num_seqs)d sequences)</td></tr>
<tr><td>%(set_name)s</td></tr>
<tr><td>%(set_descr)s</td></tr>
<tr><td>Clustering Query Set: %(q_set_id)d (%(q_num_seqs)d sequences)</td></tr>
<tr><td>%(q_set_name)s</td></tr>
<tr><td>%(q_set_descr)s</td></tr>
<tr><td>Blast Run: %(br_id)d</td></tr>
<tr><td>Query set %(blast_query_set)d</td></tr>
<tr><td>NC Run: %(nc_id)s</td></tr>
<tr><td>Background argument: %(orgarg)s</td></tr>
<tr><td>%(descr)s</td></tr>
</table>
"""
        blast_info = self.fq.fetch_blast_info_d( self.CR.br_id)
        (set_name, set_descr) = self.fq.fetch_set_info( blast_info['set_id'])

        if self.CR.set_id_filter is not None:
            clustering_set_id = self.CR.set_id_filter
        else:
            clustering_set_id = blast_info['query_set_id']
        (q_set_name, q_set_descr) = self.fq.fetch_set_info( clustering_set_id)

        s = s_table % {'cr_id': self.CR.cr_id,
                       'stype': self.CR.stype,
                       'cr_comment': self.CR.cluster_comment,
                       'cr_params': self.CR.cluster_params,
                       'set_id': blast_info['set_id'],
                       'num_seqs': blast_info['num_sequences'],
                       'set_name': set_name,
                       'set_descr': set_descr,
                       'q_set_id': clustering_set_id,
                       'q_num_seqs': self.fq.fetch_set_size( clustering_set_id),
                       'q_set_name': q_set_name,
                       'q_set_descr': q_set_descr,
                       'br_id': self.CR.br_id,
                       'blast_query_set': blast_info['query_set_id'],
                       'nc_id': self.CR.nc_id,
                       'orgarg': orgarg,
                       'descr': descr}
        return s

    def cluster_info(self, cluster_id):
        """Return a dictionary of basic cluster information.  For
        speed, include cluster_id, size, and any additional keys
        relevant to the specific clustering method used (e.g.,
        cluster similarity for hierarchical)."""

        info_d = {'cluster_id': cluster_id}
        return info_d

    def cluster_info_extended(self, cluster_id):
        """Include all from cluster_info, but also calculate cluster
        statistics.  Slower, and best called only for smaller
        clusters."""

        info_d = self.cluster_info(cluster_id)
        edge_stats = self.cluster_stats(cluster_id)
        info_d.update(edge_stats)
        return info_d
\n""" s_cluster_large = """ Cluster %(cluster_id)d: Cluster Similarity: %(clustsim)0.4f, Size: %(num_nodes)d, J: %(J)0.4f
\n""" # work around bad emacs highlighting " if do_netstats: s = s_cluster_large % {'cluster_id': cluster_id, 'clustsim': 1-clust.distance(), 'num_nodes': cluster_size, 'J': float(parent_size - cluster_size) / cluster_size, 'cr_id': self.CR.cr_id, 'query_set_id': self.CR.set_id_filter, 'orgarg': orgarg } else: edge_stats = self.cluster_stats(cluster_id) s += s_cluster % {'level': level, 'space': level*4, 'cluster_id': cluster_id, 'clustsim': 1-clust.distance(), 'num_nodes': cluster_size, 'num_edges': edge_stats['num_edges'] / 2, 'frac_edges': float(edge_stats['num_edges']) / (cluster_size * (cluster_size -1)), 'density': edge_stats['density'], 'mean': edge_stats['mean'], 'stdev': edge_stats['stdev'], 'J': float(parent_size - cluster_size) / cluster_size, 'left_id': clust.items()[0].cluster_id(), 'right_id': clust.items()[1].cluster_id(), 'parent_id': parent_id, 'cr_id': self.cr_id, 'query_set_id': self.query_set_id, 'orgarg': orgarg } return s def html_sequence(self, seq_id): s_seq = """Seq: %(seq_id)d: %(name)s %(species)s %(family)s %(urls_str)s %(synonyms)s %(descr)s
\n""" s_url = """%s""" urls = [ s_url % (url,db) for (db,url) in self.fq.build_urls(seq_id).items()] if len(urls) > 0: urls_str = reduce(lambda a,b:a+', '+b, urls) else: urls_str = "" (synonyms, descr) = self.fq.fetch_fa_annotate( seq_id) family = self.fq.fetch_seq_family( seq_id) species = self.fq.fetch_taxonomy(seq_id) s = s_seq % {'seq_id': seq_id, 'name': self.fq.fetch_seq_acc( seq_id), 'urls_str': urls_str, 'species': species[1] if species is not None else '', 'family': family if family is not None else '', 'synonyms': synonyms, 'descr': descr} return s