#!/usr/bin/env python

# Jacob Joseph
# 11 Dec 2009

# Build an HTML page to navigate hierarchical clusters

import sys
import cProfile
import time

from JJcluster.cluster_obj import cobj
from JJcluster.describe import describe


class browser(describe):
    """Render a hierarchical clustering run as a single HTML string.

    Database access (self.CR, self.fq) and the HTML fragments for the
    run header and for individual sequences (html_run_header,
    html_sequence) are not defined in this class; they are expected to
    come from the describe base class.
    """

    def __init__(self, cluster_run_id, cacheq=False, family_set_name=None):
        """Initialise the describe base for a hierarchical clustering run.

        All three arguments are forwarded unchanged to describe.__init__,
        with clustering_type fixed to 'hierarchical'.
        """
        describe.__init__(self,
                          cluster_run_id=cluster_run_id,
                          clustering_type='hierarchical',
                          cacheq=cacheq,
                          family_set_name=family_set_name)

    def html_hierarchy(self, orgarg="", family_abbrev=None):
        """Return the page text for the cluster tree, one row per node.

        orgarg        -- passed to html_run_header() and included in the
                         field dict of every cluster row.
        family_abbrev -- when not None, only the subtree below the common
                         parent of that family's sequences is rendered;
                         otherwise the whole structure is fetched.

        The tree is walked depth-first with an explicit stack.  Leaf
        nodes (right - left == 1) are rendered with html_sequence();
        internal nodes get a statistics row.  Edge statistics are only
        computed for clusters of at most 1000 members.
        """
        # constrain left and right to the extents of the family in the
        # tree.
        if family_abbrev is not None:
            family_members = self.fq.fetch_family_seqs(family_abbrev)
            parent_id = self.CR.get_common_parent(seq_ids=family_members)
            parent_row = self.CR.get_cluster_row(parent_id)
            # To show more context, walk further up the tree by repeating:
            #   parent_row = self.CR.get_cluster_row( parent_row['parent_id'])

            # Single-argument print(...) yields the same text under the
            # Python 2 print statement and the Python 3 print function.
            print("Common parent: %s %s" % (parent_id, parent_row))
            print("Nodes: %d" % (parent_row['rgt'] - parent_row['lft'],))

            root = self.CR.fetch_structure(left_lim=parent_row['lft'],
                                           right_lim=parent_row['rgt'])
        else:
            root = self.CR.fetch_structure()

        # Fragments are collected and joined once at the end; repeated
        # string += over a large tree can be quadratic.
        parts = ["""\n"""]
        parts.append(self.html_run_header(orgarg=orgarg))

        s_prefix_sequence = """%(level)0.3d P """

        s_cluster = """%(level)0.3d P L R Cluster %(cluster_id)d: Cluster Similarity: %(clustsim)0.4f, Size: %(num_nodes)d, Density: %(density)0.4f, J: %(J)0.4f, Edges: %(num_edges)d, Frac. Edges: %(frac_edges)0.4f, Mean: %(mean)0.4f(%(stdev)0.4f)
\n"""

        s_cluster_large = """%(level)0.3d P L R Cluster %(cluster_id)d: Cluster Similarity: %(clustsim)0.4f, Size: %(num_nodes)d, J: %(J)0.4f
\n"""
        # work around emacs lingering highlighting of quote in triple quotes "

        # stack of (level, cluster, parent_size, parent_id) tuples, where
        # level is the amount of indentation needed
        stack = [(0, root, None, None)]

        while stack:
            (level, clust, parent_size, parent_id) = stack.pop()

            # We're at a leaf cluster
            if 1 == clust.right() - clust.left():
                seq_id = clust.items()[0]
                parts.append(s_prefix_sequence % {'level': level,
                                                  'space': level*4 + 35,
                                                  'parent_id': parent_id})
                parts.append(self.html_sequence(seq_id))
                continue

            cluster_id = clust.cluster_id()
            cluster_size = len(self.CR.fetch_cluster(cluster_id))

            # Add children to the stack.  Order so that smaller clusters
            # (esp singletons) come first (i.e. last on the stack).
            # NOTE(review): the sort is deliberately kept in place; if
            # items() returns the node's own list, the left_id/right_id
            # lookups below see the sorted order -- confirm before
            # switching to sorted().
            children = clust.items()
            children.sort(key=lambda a: a.right() - a.left(), reverse=True)
            for child in children:
                if isinstance(child, cobj):
                    stack.append((level + 1, child, cluster_size, cluster_id))

            # The first node popped has no parent; using its own size
            # makes J evaluate to 0 for it.
            if parent_size is None:
                parent_size = cluster_size

            # edge statistics take a while to calculate, so
            # calculate them only for smaller clusters
            if cluster_size > 1000:
                template = s_cluster_large
                fields = {}
            else:
                edge_stats = self.cluster_stats(cluster_id)
                template = s_cluster
                fields = {
                    # presumably halved because each edge is counted in
                    # both directions -- TODO confirm in cluster_stats
                    'num_edges': edge_stats['num_edges'] / 2,
                    'frac_edges': edge_stats['frac_edges'],
                    'density': edge_stats['density'],
                    'mean': edge_stats['mean'],
                    'stdev': edge_stats['stdev'],
                }

            # Fields shared by both row templates.  The templates as they
            # appear in this file reference only some of these keys; the
            # rest are still supplied, as before.
            fields.update({
                'level': level,
                'space': level*4,
                'cluster_id': cluster_id,
                'clustsim': 1 - clust.distance(),
                'num_nodes': cluster_size,
                'J': float(parent_size - cluster_size) / cluster_size,
                'left_id': clust.items()[0].cluster_id(),
                'right_id': clust.items()[1].cluster_id(),
                'parent_id': parent_id,
                'cr_id': self.CR.cr_id,
                'orgarg': orgarg,
            })
            parts.append(template % fields)

        parts.append("\n")
        return "".join(parts)


class runparam:
    """Plain record holding br_id, nc_id, stype and set_id.

    Only referenced from the commented-out cr_id_map table in the
    script section below.
    """

    def __init__(self, br_id=None, nc_id=None, stype=None, set_id=None):
        self.br_id = br_id
        self.nc_id = nc_id
        self.stype = stype
        self.set_id = set_id


if __name__ == "__main__":
    # usage: <script> cr_id set_id family_set_name [family_abbrev]
    cr_id = int(sys.argv[1])
    set_id = int(sys.argv[2])
    family_set_name = sys.argv[3]
    # used to select a subtree
    family_abbrev = sys.argv[4] if len(sys.argv) == 5 else None

    date = time.strftime('%Y%m%d')

    # FIXME: workaround for not storing br_id, nc_id, set_id in a
    # queryable field
    # cr_id_map = {
    #     70: runparam(100, 746, 'nc_score', 105),  # full set
    #     71: runparam(97, 750, 'nc_score', 105),   # cluster
    #     72: runparam(97, 746, 'nc_score', 107),   # full set blast, new jan10 set
    #     73: runparam(104, 777, 'nc_score', None), # 12 species, not symmetric
    #     74: runparam(104, 777, 'nc_score', 109),  # human and mouse only, not symmetric
    #     75: runparam(104, 779, 'nc_score', ),     # 12 species, symmetric
    #     76: runparam(105, 780, 'nc_score', ),     # 48 species, not symmetric
    #     77: runparam(104, 779, 'nc_score', 109),  # Human and mouse only. Symmetric
    #     78: runparam(104, 779, 'nc_score', 109),  # Human and mouse only. Symmetric. Single linkage
    #     79: runparam(104, 779, 'nc_score', 111),  # Yeast only. Symmetric
    #     80: runparam(104, 779, 'nc_score', 111),  # Yeast only. Symmetric. Single linkage
    #     81: runparam(104, 779, 'nc_score', 112),  # Human only. Symmetric.
    #     82: runparam(105, 781, 'nc_score', 112),  # Human only. Symmetric. Not compositional
    # }

    b = browser(cluster_run_id=cr_id,
                family_set_name=family_set_name,
                cacheq=True)

    s = b.html_hierarchy(orgarg="&o=h", family_abbrev=family_abbrev)

    out_path = "figures/%s_browser_cr_id_%d_set_id_%d_%s_%s.html" % (
        date, cr_id, set_id, family_set_name, family_abbrev)
    # The with-block closes the file even if the write fails.
    with open(out_path, 'w') as fd:
        fd.write(s)