# clustering.py
import sklearn.cluster as sc
import pandas as pd
import numpy as np
import sys

clustering_algorithm = None
clustering_algorithm = sc.KMeans(10)  # placeholder default; choose an algorithm that suits your data
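# Any scikit-learn style clusterer that exposes fit_predict can be dropped in here.
# A sketch of one alternative (an example, not the author's recommendation):
# clustering_algorithm = sc.AgglomerativeClustering( n_clusters=10 )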

class Cluster(object):
    def __init__( self, nbins=50, nclusters=-1 , use_ranks = False ) :
        from sklearn.cluster import KMeans
        from sklearn.decomposition import PCA
        from numpy import histogram2d
        from scipy.stats import rankdata
        self.use_ranks = use_ranks
        self.nclusters = nclusters
        self.nbins = nbins
        self.histogram2d = histogram2d
        self.KMeans = KMeans
        self.rankdata = rankdata
        self.pca_f = PCA(2)
        self.centroids_ = None
        self.labels_ = None
        self.df_ = None
        self.num_index_ = None
        self.components_ = None

    def approximate_density_clustering( self, df, nbins=None ) :
        #
        # GENES APPROX 20K OK SO APPROX 50 BINS
        # ANALYTES ON ROWS, SAMPLE POINTS ON COLUMNS
        if nbins is None :
            nbins = self.nbins
        self.df_= df
        frac_df = df
        if self.use_ranks :
            # rank-transform each sample column into fractional ranks; the result must be reassigned
            frac_df = df.apply( lambda x : self.rankdata( x , method='average' )/float(len(x)) )
        self.pca_f.fit(frac_df.T.values)
        self.components_ = self.pca_f.components_
        vals,xe,ye = self.histogram2d(self.pca_f.components_[0],self.pca_f.components_[1],bins=nbins)
        mvs, svsx, svsy = np.mean(vals),np.std(vals,0),np.std(vals,1)
        svs = np.sqrt(svsx**2+svsy**2)
        #
        # IS THERE A DENSITY PEAK SEPARABLE FROM THE MEAN
        # SHOULD DO GRADIENT REJECTION BASED ON TTEST PVALUES
        hits = vals>mvs+0.5*svs
        #
        # bin centres of the histogram edges
        xe_ , ye_ = 0.5*( xe[:-1]+xe[1:] ) , 0.5*( ye[:-1]+ye[1:] )
        idx = np.where(hits) ; xi , yj = idx[0] , idx[1]
        centroids = [ (xe_[ri],ye_[rj]) for (ri,rj) in zip(xi,yj) ]
        if self.nclusters == -1 :
            self.nclusters = len ( centroids )
        if self.nclusters < len ( centroids ) :
            # keep the nclusters candidate centroids that are most spread out,
            # ranked by their summed distance to all other candidates
            import heapq
            from scipy.spatial import distance as distance_
            a = distance_.cdist ( centroids , centroids , 'euclidean' )
            cent_idx = heapq.nlargest ( self.nclusters , range(len(a)) , a.sum(axis=1).__getitem__ )
            centroids = [ centroids[ idx ] for idx in cent_idx ]

        kmeans = self.KMeans( len(centroids) , init=np.array(centroids) , n_init=1 )
        kmeans.fit(self.pca_f.components_.T)
        centers = np.array(kmeans.cluster_centers_).T
        self.labels_ = kmeans.labels_
        self.centroids_ = centers
        self.analyte_dict_ = { c:[] for c in self.labels_ }
        for i in range(len(self.labels_)) :
            self.analyte_dict_[ self.labels_[i] ].append( df.index[i] )
        return ( self.analyte_dict_ )

    def write_gmt(self, filename = './cluster_file.gmt' ) :
        with open(filename,'w') as of :
            for k,v in self.analyte_dict_.items() :
                print ( 'CLU-'+str(k)+'\tDESCRIPTION\t'+'\t'.join(v) , file=of )
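
    # Minimal usage sketch (assumes an analyte-by-sample DataFrame `adf`; the names
    # here are hypothetical and not part of the original module):
    #
    #   clu = Cluster( nbins=50 , nclusters=10 , use_ranks=True )
    #   analyte_groups = clu.approximate_density_clustering( adf )
    #   clu.write_gmt( './my_clusters.gmt' )
    #
    # Each GMT row written above has the layout NAME<TAB>DESCRIPTION<TAB>member1<TAB>member2 ...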


class ManifoldClustering ( Cluster ) :
    def __init__( self , nbins=50 ) :
        from sklearn.cluster import KMeans
        from sklearn.manifold import MDS, TSNE
        from numpy import histogram2d
        from scipy.stats import rankdata
        self.nbins = nbins
        self.histogram2d = histogram2d
        self.KMeans = KMeans
        self.rankdata = rankdata
        self.mds  = MDS ( n_components=2 )
        self.tsne = TSNE ( n_components=2 )
        self.man = None
        self.centroids_ = None
        self.labels_ = None
        self.df_ = None
        self.num_index_ = None
        self.components_ = None

    def approximate_embedding( self, df, nbins=None , use_tsne=True ) :
        self.man = self.tsne
        if not use_tsne :
            self.man = self.mds
            print ( 'WARNING::SLOW AND WASTEFUL' )
        if nbins is None :
            nbins = self.nbins
        self.df_= df
        frac_df = df.apply( lambda x:self.rankdata( x , method='average' )/float(len(x)) )
        self.components_ = np.array(self.man.fit_transform(frac_df.values)).T
        vals,xe,ye = self.histogram2d(self.components_[0],self.components_[1],bins=nbins)
        mvs, svsx, svsy = np.mean(vals),np.std(vals,0),np.std(vals,1)
        svs = np.sqrt( svsx**2 + svsy**2 )
        #
        # IS THERE A DENSITY PEAK SEPARABLE FROM THE MEAN
        # SHOULD DO GRADIENT REJECTION BASED ON TTEST PVALUES
        hits = vals>mvs+0.5*svs
        #print(hits,vals)
        # bin centres of the histogram edges
        xe_ , ye_ = 0.5*( xe[:-1]+xe[1:] ) , 0.5*( ye[:-1]+ye[1:] )
        idx = np.where(hits) ; xi , yj = idx[0] , idx[1]
        centroids = [ (xe_[ri],ye_[rj]) for (ri,rj) in zip(xi,yj) ]
        #
        kmeans = self.KMeans( len(centroids) , init=np.array(centroids) , n_init=1 )
        kmeans.fit(self.components_.T)
        centers = np.array(kmeans.cluster_centers_).T
        self.labels_ = kmeans.labels_
        self.centroids_ = centers
        self.analyte_dict_ = { c:[] for c in self.labels_ }
        for i in range(len(self.labels_)) :
            self.analyte_dict_[ self.labels_[i] ].append( df.index[i] )
        return ( self.analyte_dict_ )
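
    # Usage sketch (same hypothetical `adf` DataFrame as above; t-SNE is used by default,
    # pass use_tsne=False to fall back to the slower MDS embedding):
    #
    #   mclu = ManifoldClustering( nbins=50 )
    #   analyte_groups = mclu.approximate_embedding( adf , use_tsne=True )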


def run_clustering_and_write_gmt( df , ca , filename = './approx_cluster_file.gmt' ) :
    labels = ca.fit_predict(df.values)
    llabs = [ l for l in labels ]; ulabs=set(llabs)
    with open(filename,'w') as of :
        for ulab in ulabs :
            # boolean mask over rows; labels from fit_predict is a numpy array
            analytes = df.iloc[ labels==ulab ].index.values
            print ( 'CLU-'+str(ulab)+'\tDESCRIPTION\t'+'\t'.join(analytes) , file=of )
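
# Usage sketch (hypothetical analyte-by-sample DataFrame `adf`; any estimator with
# fit_predict works as the `ca` argument):
#
#   run_clustering_and_write_gmt( adf , sc.KMeans(10) , './kmeans_clusters.gmt' )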


def make_clustering_visualisation_df ( CLUSTER , df=None , add_synonyms = False ,
                                    output_name = 'feature_clusters_output.csv' 
                                  ) :
    x_pc1 = CLUSTER.components_[0]
    y_pc2 = CLUSTER.components_[1]
    L_C   = len(CLUSTER.centroids_[0])
    #
    # MAKE CLUSTER COLORS
    make_hex_colors = lambda c : '#%02x%02x%02x' % (c[0]%256,c[1]%256,c[2]%256)
    C0 = [255,255,255] ; cluster_colors = []
    #
    for i in CLUSTER.labels_ :
        # copy the base colour before modifying it so earlier labels do not bleed into later ones
        C0_ = list( C0 ) ; C0_[i%3] = int(np.floor(C0[i%3]-(i/float(L_C))*255))
        cluster_colors.append(make_hex_colors(C0_))

    if df is not None :
        if add_synonyms :
            # ens2sym and ens2sym_2 are assumed to be identifier-to-symbol lookup dicts
            # supplied by the surrounding scope
            synonyms = [ ens2sym[df.index.values[i]][0] if df.index.values[i] in ens2sym \
                else ens2sym_2[df.index.values[i]] if df.index.values[i] in ens2sym_2 \
                else df.index.values[i] for i in range(len(df.index.values)) ]
        else :
            synonyms = df.index.values
    else :
        synonyms = [ str(i) for i in range(len(x_pc1)) ]
    #
    data = []
    # cluster_colors is aligned with CLUSTER.labels_ , one entry per analyte
    for (x,y,t,cl,co) in zip( x_pc1 , y_pc2 , synonyms , CLUSTER.labels_ , cluster_colors ) :
        data.append([x,y,t,cl,co])
    clustering_df = pd.DataFrame( data , columns = ['X','Y','Type','Cluster','Color'] )
    if df is not None :
        clustering_df.index = df.index.values
    clustering_df.to_csv( output_name , sep='\t' )
    return ( clustering_df )
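
# Usage sketch (assumes `clu` is a fitted Cluster/ManifoldClustering instance and `adf`
# is the DataFrame it was fitted on; hypothetical names, not part of the original module):
#
#   vis_df = make_clustering_visualisation_df( clu , df=adf , output_name='clusters_vis.tsv' )
#   # vis_df then holds one row per analyte: X , Y , Type , Cluster , Color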

if __name__ == '__main__' :
    #
    # TEST DEPENDS ON THE DIABETES DATA FROM BROAD INSTITUTE
    filename = './Diabetes_collapsed_symbols.gct'
    df_ = pd.read_csv( filename , sep='\t' , index_col=0 , header=2 )
    ddf = df_.loc[:,[ col for col in df_.columns if '_' in col ]] 
    ddf.index = [ idx.split('/')[0] for idx in ddf.index ]
    run_clustering_and_write_gmt( ddf , clustering_algorithm )
    #
    CLU = Cluster()
    CLU.approximate_density_clustering(ddf)
    CLU.write_gmt()