diff --git a/README.md b/README.md index e51cb84b8459ee5050338f724042200db2fa59f3..9976a9410ae1c86b8f2f08151e58e528b1e0edc0 100755 --- a/README.md +++ b/README.md @@ -79,5 +79,6 @@ print ( res_dfs ) # Manually updated code backups for this library : GitLab: https://gitlab.com/richardtjornhammar/impetuous + CSDN: https://codechina.csdn.net/m0_52121311/impetuous diff --git a/build/lib/impetuous/__init__.py b/build/lib/impetuous/__init__.py deleted file mode 100644 index a601aa027e850273956c673004bceaaf2bfb2fbf..0000000000000000000000000000000000000000 --- a/build/lib/impetuous/__init__.py +++ /dev/null @@ -1 +0,0 @@ -name = "impetuous-gfa" diff --git a/build/lib/impetuous/clustering.py b/build/lib/impetuous/clustering.py deleted file mode 100644 index cf1949e86ca26843e5c91da21e2c05d1ad5cb698..0000000000000000000000000000000000000000 --- a/build/lib/impetuous/clustering.py +++ /dev/null @@ -1,451 +0,0 @@ -""" -Copyright 2020 RICHARD TJÖRNHAMMAR - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import pandas as pd -import numpy as np -import sys -import sklearn.cluster as sc - -try : - from numba import jit - bUseNumba = True -except ImportError : - bUseNumba = False - -# THE FOLLOWING KMEANS ALGORITHM IS THE AUTHOR OWN LOCAL VERSION -if bUseNumba : - @jit(nopython=True) - def seeded_kmeans( dat, cent ): - # - # PYTHON ADAPTATION OF MY C++ CODE THAT CAN BE FOUND IN - # https://github.com/richardtjornhammar/RichTools/blob/master/src/cluster.cc - # AROUND LINE 2345 - # AGAIN CONSIDER USING THE C++ VERSION SINCE IT IS ALOT FASTER - # HERE WE SPEED IT UP USING NUMBA IF THE USER HAS IT INSTALLED AS A MODULE - # - NN , MM = np.shape ( dat ) - KK , LL = np.shape ( cent ) - if not LL == MM : - print ( 'WARNING DATA FORMAT ERROR. NON COALESCING COORDINATE AXIS' ) - - labels = [ int(z) for z in np.zeros(NN) ] - w = labels - counts = np.zeros(KK) - tmp_ce = np.zeros(KK*MM).reshape(KK,MM) - old_error , error , TOL = 0. , 1. , 1.0E-10 - while abs ( error - old_error ) > TOL : - old_error = error - error = 0. - counts = counts * 0. - tmp_ce = tmp_ce * 0. - # START BC - for h in range ( NN ) : - min_distance = 1.0E30 - for i in range ( KK ) : - distance = np.sum( ( dat[h]-cent[i] )**2 ) - if distance < min_distance : - labels[h] = i - min_distance = distance - tmp_ce[labels[h]] += dat[ h ] - counts[labels[h]] += 1.0 - error += min_distance - # END BC - for i in range ( KK ) : - if counts[i]>0: - cent[i] = tmp_ce[i]/counts[i] - centroids = cent - return ( labels, centroids ) -else : - def seeded_kmeans( dat, cent ): - # - # SLOW SLUGGISH KMEANS WITH A DUBBLE FOR LOOP - # IN PYTHON! WOW! SUCH SPEED! - # - NN , MM = np.shape ( dat ) - KK , LL = np.shape ( cent ) - if not LL == MM : - print ( 'WARNING DATA FORMAT ERROR. NON COALESCING COORDINATE AXIS' ) - labels = [ int(z) for z in np.zeros(NN) ] - w = labels - counts = np.zeros(KK) - tmp_ce = np.zeros(KK*MM).reshape(KK,MM) - old_error , error , TOL = 0. , 1. , 1.0E-10 - while abs ( error - old_error ) > TOL : - old_error = error - error = 0. - counts = counts * 0. 
- tmp_ce = tmp_ce * 0. - # START BC - for h in range ( NN ) : - min_distance = 1.0E30 - for i in range ( KK ) : - distance = np.sum( ( dat[h]-cent[i] )**2 ) - if distance < min_distance : - labels[h] = i - min_distance = distance - tmp_ce[labels[h]] += dat[ h ] - counts[labels[h]] += 1.0 - error += min_distance - # END BC - for i in range ( KK ) : - if counts[i]>0: - cent[i] = tmp_ce[i]/counts[i] - centroids = cent - return ( labels, centroids ) - -from scipy.spatial.distance import squareform , pdist -absolute_coordinates_to_distance_matrix = lambda Q:squareform(pdist(Q)) - -distance_matrix_to_geometry_conversion_notes = """ -*) TAKE NOTE THAT THE OLD ALGORITHM CALLED DISTANCE GEOMETRY EXISTS. IT CAN BE EMPLOYED TO ANY DIMENSIONAL DATA. HERE YOU FIND A SVD BASED ANALOG OF THAT OLD METHOD. - -*) PDIST REALLY LIKES TO COMPUTE SQUARE ROOT OF THINGS SO WE SQUARE THE RESULT IF IT IS NOT SQUARED. - -*) IN SHORT THE DISTANCE MATRIX IN THE CONVERSION ROUTINE BACK TO ABSOLUTE COORDINATES USES R2 DISTANCES. -""" - -if bUseNumba : - @jit(nopython=True) - def distance_matrix_to_absolute_coordinates ( D , bSquared = False, n_dimensions=2 ): - if not bSquared : - D = D**2. - DIM = n_dimensions - DIJ = D*0. - M = len(D) - for i in range(M) : - for j in range(M) : - DIJ[i,j] = 0.5* (D[i,-1]+D[j,-1]-D[i,j]) - D = DIJ - U,S,Vt = np.linalg.svd ( D , full_matrices = True ) - S[DIM:] *= 0. - Z = np.diag(S**0.5)[:,:DIM] - xr = np.dot( Z.T,Vt ) - return ( xr ) -else : - def distance_matrix_to_absolute_coordinates ( D , bSquared = False, n_dimensions=2 ): - if not bSquared : - D = D**2. - DIM = n_dimensions - DIJ = D*0. - M = len(D) - for i in range(M) : - for j in range(M) : - DIJ[i,j] = 0.5* (D[i,-1]+D[j,-1]-D[i,j]) - D = DIJ - U,S,Vt = np.linalg.svd ( D , full_matrices = True ) - S[DIM:] *= 0. - Z = np.diag(S**0.5)[:,:DIM] - xr = np.dot( Z.T,Vt ) - return ( xr ) - - -def connectivity ( B , val, bVerbose=True ) : - description=""" -This is a cutoff based clustering algorithm. The intended use is to supply a distance matrix and a cutoff value (then becomes symmetric positive definite). For a small distance cutoff, you should see all the parts of the system and for a large distance cutoff, you should see the entire system. It has been employed for statistical analysis work as well as the original application where it was employed to segment molecular systems. 
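A minimal usage sketch of the SVD based distance geometry conversion above: synthetic points are turned into a distance matrix and then recovered, and the recovered coordinates should reproduce the original pairwise distances even though orientation and origin may differ. It assumes the impetuous package is installed so the routines are importable from impetuous.clustering.

```python
import numpy as np
from scipy.spatial.distance import pdist, squareform
from impetuous.clustering import ( absolute_coordinates_to_distance_matrix ,
                                   distance_matrix_to_absolute_coordinates )

np.random.seed(0)
P  = np.random.randn(10, 3)                               # ten synthetic points in 3D
D  = absolute_coordinates_to_distance_matrix(P)           # squareform(pdist(P))
Xr = distance_matrix_to_absolute_coordinates(D, n_dimensions=3)

# Xr has shape (n_dimensions, n_points); the pairwise distances are preserved
# up to numerical noise, although the frame of reference differs from P.
print(np.allclose(squareform(pdist(Xr.T)), D, atol=1e-6))
```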
- """ - # PYTHON ADAPTATION OF MY C++ CODE THAT CAN BE FOUND IN - # https://github.com/richardtjornhammar/RichTools/blob/master/src/cluster.cc - # AROUND LINE 2277 - # CONSIDER COMPILING AND USING THAT AS A MODULE INSTEAD OF THIS SINCE IT IS - # A LOT FASTER - # FOR A DESCRIPTION READ PAGE 30 (16 INTERNAL NUMBERING) of: - # https://kth.diva-portal.org/smash/get/diva2:748464/FULLTEXT01.pdf - # - nr_sq,mr_sq = np.shape(B) - if nr_sq != mr_sq : - print ( 'ERROR' ) - exit (1) - N = mr_sq - res , nvisi, s, NN, ndx, C = [], [], [], [], [], 0 - res .append(0) - for i in range(N) : - nvisi.append(i+1) - res.append(0); res.append(0) - ndx.append(i) - while ( len(ndx)>0 ) : - i = ndx[-1] ; ndx = ndx[:-1] - NN = [] - if ( nvisi[i]>0 ) : - C-=1 - for j in range(N) : - if ( B[i,j]0 ) : - # back pop_back - k = NN[-1]; NN = NN[:-1] - nvisi[k] = C - for j in range(N): - if ( B[j,k]mvs+0.5*svs - # - xe_,ye_ = 0.5*(xe[:1]+xe[1:]) , 0.5*(ye[:1]+ye[1:]) - idx = np.where(hits); xi,yj = idx[0],idx[1] - centroids = [ (xe[ri],ye[rj]) for (ri,rj) in zip(xi,yj) ] - if self.nclusters == -1 : - self.nclusters = len ( centroids ) - if self.nclusters < len ( centroids ) : - import heapq - from scipy.spatial import distance as distance_ - a = distance_.cdist ( centroids, centroids, 'euclidean' ) - cent_idx = heapq.nlargest ( self.nclusters, range(len(a)), a.reshape(-1).__getitem__ ) - centroids = [ centroids[ idx ] for idx in cent_idx ] - - kmeans = self.KMeans(len(centroids),init=np.array(centroids)) - kmeans.fit(self.pca_f.components_.T) - centers = np.array(kmeans.cluster_centers_).T - self.labels_ = kmeans.labels_ - self.centroids_ = centers - self.analyte_dict_ = { c:[] for c in self.labels_ } - [self.analyte_dict_[self.labels_[i]].append(df.index[i]) for i in range(len(self.labels_)) ] - return ( self.analyte_dict_ ) - - def write_gmt(self, filename = './cluster_file.gmt' ) : - with open(filename,'w') as of : - for k,v in self.analyte_dict_.items() : - print ( 'CLU-'+str(k),'\tDESCRIPTION\t'+'\t'.join(v), file=of ) - - -class ManifoldClustering ( Cluster ) : - def __init__( self , nbins=50 ) : - from sklearn.cluster import KMeans - from sklearn.manifold import MDS, TSNE - from numpy import histogram2d - from scipy.stats import rankdata - self.nbins = nbins - self.histogram2d = histogram2d - self.KMeans = KMeans - self.rankdata = rankdata - self.mds = MDS ( n_components=2 ) - self.tsne = TSNE ( n_components=2 ) - self.man = None - self.centroids_ = None - self.labels_ = None - self.df_ = None - self.num_index_ = None - self.components_ = None - - def approximate_embedding( self, df, nbins=None , use_tsne=True ) : - self.man = self.tsne - if not use_tsne : - self.man = self.mds - print ( 'WARNING::SLOW AND WASTEFUL' ) - if nbins is None : - nbins = self.nbins - self.df_= df - frac_df = df.apply( lambda x:self.rankdata( x , method='average' )/float(len(x)) ) - self.components_ = np.array(self.man.fit_transform(frac_df.values)).T - vals,xe,ye = self.histogram2d(self.components_[0],self.components_[1],bins=nbins) - mvs, svsx, svsy = np.mean(vals),np.std(vals,0),np.std(vals,1) - svs = np.sqrt( svsx**2 + svsy**2 ) - # - # IS THERE A DENSITY PEAK SEPARABLE FROM THE MEAN - # SHOULD DO GRADIENT REJECTION BASED ON TTEST PVALUES - hits = vals>mvs+0.5*svs - #print(hits,vals) - xe_,ye_=0.5*(xe[:1]+xe[1:]),0.5*(ye[:1]+ye[1:]) - idx = np.where(hits); xi,yj = idx[0],idx[1] - centroids = [ (xe[ri],ye[rj]) for (ri,rj) in zip(xi,yj) ] - # - kmeans = self.KMeans(len(centroids),init=np.array(centroids)) - kmeans.fit(self.components_.T) 
- centers = np.array(kmeans.cluster_centers_).T - self.labels_ = kmeans.labels_ - self.centroids_ = centers - self.analyte_dict_ = { c:[] for c in self.labels_ } - [self.analyte_dict_[self.labels_[i]].append(df.index[i]) for i in range(len(self.labels_)) ] - return ( self.analyte_dict_ ) - - -def run_clustering_and_write_gmt( df , ca , filename = './approx_cluster_file.gmt' ) : - labels = ca.fit_predict(df.values) - llabs = [ l for l in labels ]; ulabs=set(llabs) - with open(filename,'w') as of : - for ulab in ulabs : - analytes = df.iloc[llabs==ulab].index.values - print ( 'CLU-'+str(ulab),'\tDESCRIPTION\t'+'\t'.join(analytes), file=of ) - - -def projection_knn_assignment ( projected_coords , df , NMaxGuess=-1 , n_dimensions=2 ) : - coords_s = projected_coords.dropna( 0 ) - centroid_coordinates = [] - for row in df.T : - guess = sorted ( [ (v,i) for (v,i) in zip( df.loc[row].values,df.loc[row].index ) ] ) [::-1][:NMaxGuess] - maxWeights = [ i[1] for i in guess ] - use = df.loc[row,maxWeights] - S = np.sum ( use.values ) - S = 1. if S==0 else S - crd = np.dot(use.values,coords_s.loc[use.index.values].values)/S - centroid_coordinates.append(crd) - - centroids_df = pd.DataFrame ( centroid_coordinates , index=df.index , columns=[ 'C'+str(i) for i in range(n_dimensions) ] ) - labels , centroids = seeded_kmeans( coords_s.values,centroids_df.values ) - coords_s.loc[:,'owner'] = centroids_df.iloc[labels].index.values - for i in range(len(centroids.T)) : - centroids_df.loc[:,'E'+str(i) ] = (centroids.T)[i] - return ( centroids_df , coords_s ) - - -def make_clustering_visualisation_df ( CLUSTER , df=None , add_synonyms = False , - output_name = 'feature_clusters_output.csv' - ) : - x_pc1 = CLUSTER.components_[0] - y_pc2 = CLUSTER.components_[1] - L_C = len(CLUSTER.centroids_[0]) - # - # MAKE CLUSTER COLORS - make_hex_colors = lambda c : '#%02x%02x%02x' % (c[0]%256,c[1]%256,c[2]%256) - C0 = [255,255,255] ; cluster_colors = [] - # - for i in CLUSTER.labels_ : - C0_ = C0 ; C0_[i%3] = int(np.floor(C0[i%3]-(i/float(L_C))*255)) - cluster_colors.append(make_hex_colors(C0_)) - - if not df is None : - if add_synonyms : - synonyms = [ ens2sym[df.index.values[i]][0] if df.index.values[i] in ens2sym \ - else ens2sym_2[df.index.values[i]] if df.index.values[i] in ens2sym_2 \ - else df.index.values[i] for i in range(len(px))] - else : - synonyms = df.index.values - data = [] - for (x,y,t,cl,co) in zip( x_pc1,y_pc2,synonyms , [cl for cl in CLUSTER.labels_] , - [cluster_colors[cl] for cl in CLUSTER.labels_] ) : - data.append([x,y,t,cl,co]) - clustering_df = pd.DataFrame( data , columns = ['X','Y','Type','Cluster','Color']) - if not df is None : - clustering_df.index = df.index.values - clustering_df.to_csv( output_name , '\t' ) - return ( clustering_df ) - -def backprojection_clustering ( analyte_df , bRanked=False , n_dimensions=2 , - bDoFeatures=True , bDoSamples=True ) : - from scipy.stats import rankdata - if bRanked : - rana_df = analyte_df .apply( lambda x:(rankdata(x,'average')-0.5)/len(x) ) - else : - rana_df = analyte_df - - dimcrdnames = [ 'd'+str(i) for i in range(n_dimensions) ] - # - # Do backprojection clustering with backprojection - cluster_coords_f = None - if bDoFeatures : - # - dM1 = absolute_coordinates_to_distance_matrix( rana_df.values ) - pd.DataFrame(dM1,index=rana_df.index,columns=rana_df.index).to_csv('../data/dM1.tsv','\t') - # - # Project it back onto first two components - max_var_projection = distance_matrix_to_absolute_coordinates ( dM1 , n_dimensions=n_dimensions ) - 
cluster_coords_f = pd.DataFrame( max_var_projection , - columns = rana_df.index , - index = dimcrdnames ).T - cluster_coords_s = None - if bDoSamples : - # - # And again for all the samples - dM2 = absolute_coordinates_to_distance_matrix( rana_df.T.values ) - pd.DataFrame(dM2,index=rana_df.columns,columns=rana_df.columns).to_csv('../data/dM2.tsv','\t') - # - # This algorithm is exact but scales somewhere between n^2 and n log n - max_var_projection = distance_matrix_to_absolute_coordinates ( dM2 , n_dimensions=n_dimensions ) - cluster_coords_s = pd.DataFrame( max_var_projection , - columns = rana_df.columns , - index = dimcrdnames ).T - cluster_coords_s.to_csv('../data/conclust_s.tsv','\t') - - return ( cluster_coords_f,cluster_coords_s ) - - -if __name__ == '__main__' : - - if False : - # - # TEST DEPENDS ON THE DIABETES DATA FROM BROAD INSTITUTE - filename = './Diabetes_collapsed_symbols.gct' - df_ = pd.read_csv(filename,'\t',index_col=0,header=2) - ddf = df_.loc[:,[ col for col in df_.columns if '_' in col ]] - ddf .index = [idx.split('/')[0] for idx in ddf.index] - run_clustering_and_write_gmt( ddf , clustering_algorithm ) - # - CLU = Cluster( ) - CLU .approximate_density_clustering(ddf) - CLU .write_gmt() - - if False : - A = np.array( [ [0.00, 0.10, 0.10, 9.00, 9.00, 9.00], - [0.10, 0.00, 0.15, 9.00, 9.00, 9.00], - [0.10, 0.15, 0.00, 9.00, 9.00, 9.00], - [9.00, 9.00, 9.00, 0.00, 0.10, 0.10], - [9.10, 9.00, 9.00, 0.10, 0.00, 0.15], - [9.10, 9.00, 9.00, 0.10, 0.15, 0.00] ] ) - print( connectivity(A,0.01) ) - diff --git a/build/lib/impetuous/convert.py b/build/lib/impetuous/convert.py deleted file mode 100644 index 576b388950d701cfcea73b07392cdb5dbe3ba055..0000000000000000000000000000000000000000 --- a/build/lib/impetuous/convert.py +++ /dev/null @@ -1,232 +0,0 @@ -""" -Copyright 2019 RICHARD TJÖRNHAMMAR - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
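A short sketch of the seeded k-means routine from clustering.py above, which refines user supplied seed centroids instead of random initial guesses. The 2D data here are synthetic and the function is assumed importable from impetuous.clustering.

```python
import numpy as np
from impetuous.clustering import seeded_kmeans

np.random.seed(1)
dat  = np.vstack( [ np.random.randn(30, 2) ,
                    np.random.randn(30, 2) + [6., 6.] ] )   # two well separated blobs
cent = np.array( [ [0.5, 0.5] , [5.5, 5.5] ] )              # one seed centroid per blob

labels , centroids = seeded_kmeans( dat , cent )
print ( labels[:5] , labels[-5:] )                          # first blob -> 0 , second -> 1
print ( centroids )                                         # refined centroid coordinates
```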
-""" - -import pandas as pd -import numpy as np -import networkx as nx -from networkx.readwrite import json_graph -import json -import sys - -def drop_duplicate_indices( df ): - df_ = df.loc[~df.index.duplicated(keep='first')] - return df_ - -def write_tree( tree , outfile='tree.json' ): - root = [ eid for eid,ancestor in tree.in_degree() if ancestor == 0 ][ 0 ] - o_json = json_graph.tree_data( tree , root ) - if not outfile is None: - with open(outfile, 'w') as o_file: - json.dump(o_json, o_file ) - return( o_json ) - -def add_attributes_to_tree ( p_df , tree ): - id_lookup = { rid:lid for (lid,rid) in nx.get_node_attributes(tree,'source').items() } - add_attributes = p_df.columns.values - for attribute in add_attributes: - propd = { id_lookup[idx]:{attribute:val} for (idx,val) - in zip(p_df.index.values,p_df.loc[:,attribute]) if idx in set(id_lookup.keys()) } - nx.set_node_attributes( tree , propd ) - return( tree ) - -def parent_child_to_dag ( - relationship_file = './PCLIST.txt' , - i_p = 0 , i_c = 1 - ) : - n_df = pd .read_csv ( relationship_file , '\t' ) - pair_tuples = [ (p,c) for (p,c) in zip(n_df.iloc[:,i_p],n_df.iloc[:,i_c]) ] - children_of = {} ; all_names = set([]) - for ( p,c ) in pair_tuples : - if not 'list' in str(type(c)): - C = [c] - all_names = all_names | set([p]) | set(C) - if p in children_of : - children_of[p] .append(c) - else : - children_of[p] = [c] - G = nx .DiGraph() - G .add_nodes_from ( all_names ) - G .add_edges_from ( pair_tuples ) - tree = nx.algorithms.dag.dag_to_branching( G ) - root = [ eid for eid,ancestor in tree.in_degree() if ancestor == 0 ][ 0 ] - descendants = [ ( idx , nx.algorithms.dag.descendants(G,idx) ) for idx in all_names ] - ancestors = [ ( idx , nx.algorithms.dag.ancestors(G,idx) ) for idx in all_names ] - return ( tree,ancestors,descendants ) - -def make_pathway_ancestor_data_frame(ancestors): - p_df = None - for k,v in ancestors : - t_df = pd.DataFrame([[','.join(list(v)),len(v)]],index=[k],columns=['DAG,ancestors','DAG,level']) - if p_df is None : - p_df = t_df - else : - p_df = pd.concat([p_df,t_df]) - return( p_df ) - -def normalise_for_apples_and_oranges_stats( X , method='ordinal' ): - X_ = rankdata( X , method=method )/len(X) - return(X_) - -def make_group_analytes_unique( grouping_file , delimiter='\t' ): - uniqe_grouping_file = '/'.join(grouping_file.split('/')[:-1]) + '/unique_'+grouping_file.split('/')[-1] - with open( uniqe_grouping_file , 'w' ) as of: - with open( grouping_file ) as input : - for line in input : - vline = line.replace('\n','').split(delimiter) - gid, gdesc, analytes_ = vline[0], vline[1], list(set(vline[2:])) - nvec = [gid,gdesc] ; [ nvec.append(a) for a in analytes_ ] - print ( delimiter.join(nvec) , file = of ) - -def read_conversions(file_name) : - gene2ens = {} ; non_unique = [] - with open( file_name , 'r' ) as infile: - if sys.version_info[0] < 3: - infile.next() - else : - next(infile) - for line in infile: - words = line.strip().split('\t') - if len(words)==2 : - ens, gene = words - if gene in gene2ens: - gene2ens[gene].append(ens) - else : - gene2ens[gene] = [ens] - return gene2ens - -def read_gene_ensemble_conversion(file_name): - gene2ens = {} ; non_unique = [] - with open(file_name,'r') as infile: - if sys.version_info[0] < 3: - infile.next() - else: - next(infile) - for line in infile: - words = line.strip().split('\t') - if len(words)==2 : - ens, gene = words - if gene in gene2ens: - non_unique.append((gene,ens,gene2ens[gene])) - else : - gene2ens[gene] = ens - if len(non_unique)>0: - print(' 
WARNING ' ) - print( 'FOUND ', len(non_unique), ' NON UNIQUE ENTRIES' ) - return gene2ens - -def create_synonyms( convert_file , unique_mapping=False ): - # CREATE SYNONYMS - ens2sym , sym2ens = {} , {} - if unique_mapping: - sym2ens = read_gene_ensemble_conversion( convert_file ) - ens2sym = { v:k for k,v in sym2ens.items() } - else : - sym2ens_list = read_conversions( convert_file ) - ens2sym_list = {} - for s,L in sym2ens_list.items() : - for e in L: - if e in ens2sym_list: - ens2sym_list.append(s) - else: - ens2sym_list[e] = [s] - ens2sym = ens2sym_list - sym2ens = sym2ens_list - return ( ens2sym , sym2ens ) - -def flatten_dict( s2e ) : - # FORCES FIRST ELEMENT - ndict = {} - for (s,e) in s2e.items() : - if 'list' in str(type(e)) : - ndict[s] = e[0] - else : - ndict[s] = e - return ( ndict ) - -def convert_rdata_to_dataframe ( filename ) : - # - from rpy2.robjects import r as R - from rpy2.robjects.packages import importr - from rpy2.robjects import pandas2ri - from rpy2.robjects.conversion import localconverter - import rpy2.robjects as ro - # - print ( 'WARNING THIS PROGRAM NEED VALUE ERROR CHECKING' ) - rd_ = R.load( filename ) - if 'matrix' in str( type( R[rd_[0]] ) ).lower() : - column_names = [ R[rd_[0]].colnames ] - index_names = [ R[rd_[0]].rownames ] - else : - column_names = [ [r for r in _rd_.colnames] for _rd_ in R[rd_[0]]] - index_names = [ [r for r in _rd_.rownames] for _rd_ in R[rd_[0]]] - # - pandas2ri.activate() - # - # SMALL HELPER FUNCTION THAT TRANSFORMS A RDATA OBJECT INTO - # A PANDAS DATAFRAME. CURRENTLY THERE IS NO VALUE ERROR CHECKING - # - rd = R.load( filename ) - raw_df_l = [] - if 'ndarray' in str( type( R[rd[0]] ) ).lower() : - [ raw_df_l.append( R[rd[0]] ) ] - else : - [ raw_df_l.append( rdf ) for rdf in ro.vectors.DataFrame(R[rd[0]]) ] - full_df_dict = {} ; i_ = 0 - for raw_df,colnames,rownames in zip( raw_df_l,column_names,index_names ) : - pdf = pd.DataFrame( raw_df , columns=colnames , index=rownames ) - full_df_dict[i_] = pdf - i_ = i_ + 1 - pandas2ri.deactivate() - return ( full_df_dict ) - -import os -if __name__ == '__main__' : - # - bMOFA_data = True - if bMOFA_data : - import os - # os.system('mkdir ../data') - # os.system('wget https://github.com/bioFAM/MOFAdata/blob/master/data/CLL_data.RData') - # os.system('mv CLL_data.RData ../data/.') - - df_dict = convert_rdata_to_dataframe( filename = '../data/CLL_data.RData' ) - pruned_df = df_dict[2].T.dropna().T - journal_df = df_dict[3].loc[['IGHV'],:] - mask = [ v>=0 for v in journal_df.loc['IGHV'].values ] - journal_df = journal_df.iloc[ :,mask ] - use = list(set(pruned_df.columns)&set(journal_df.columns)) - analyte_df = pruned_df .loc[ :,use ].apply( lambda x:np.log2(1+x) ) - journal_df = journal_df .loc[ :,use ] - - print ( analyte_df,journal_df ) - from impetuous.quantification import * - qdf = quantify_groups ( analyte_df , journal_df , 'anova~C(IGHV)' , '~/Work/data/naming_and_annotations/reactome/reactome.gmt' ) - print(qdf) - exit(1) - - base = '../../../data/' - convert_file = base + 'naming_and_annotations/conv.txt' - ens2sym , sym2ens = create_synonyms( convert_file ) - s2e = { **flatten_dict(sym2ens) } - - name,col,sep = 'Networks.nfo',0,'\t' - with open(name,'r') as input: - for line in input: - gname = line.split(sep)[col].replace('\n','') - if gname in s2e: - print(gname) - else: - print('MISSING:',gname) diff --git a/build/lib/impetuous/fit.py b/build/lib/impetuous/fit.py deleted file mode 100644 index 77a5a7792df5937f13c3c38afcfbc36b02f871d1..0000000000000000000000000000000000000000 
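The convert.py helpers above chain together into a small hierarchy workflow: a parent-child list becomes a DAG, per-node ancestor annotations are derived, attached to the branching and written out as JSON. A hedged sketch, where the input file name and its two-column tab separated layout are assumptions.

```python
from impetuous.convert import ( parent_child_to_dag , make_pathway_ancestor_data_frame ,
                                add_attributes_to_tree , write_tree )

# PCLIST.txt is assumed to be tab separated with a parent column and a child column
tree , ancestors , descendants = parent_child_to_dag( './PCLIST.txt' , i_p=0 , i_c=1 )

p_df   = make_pathway_ancestor_data_frame( ancestors )   # 'DAG,ancestors' and 'DAG,level' per node
tree   = add_attributes_to_tree( p_df , tree )           # attach them to the branching nodes
o_json = write_tree( tree , outfile='./tree.json' )      # networkx tree_data dumped to json
```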
--- a/build/lib/impetuous/fit.py +++ /dev/null @@ -1,174 +0,0 @@ -""" -Copyright 2020 RICHARD TJÖRNHAMMAR - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import numpy as np -import pandas as pd - -def read_xyz(name='data/naj.xyz',header=2,sep=' '): - mol_str = pd.read_csv(name,header=header) - P=[] - for i_ in range(len(mol_str.index)): - line = mol_str.iloc[i_,:].values[0] - lsp = [l.replace(' ','') for l in line.split(sep) if len(l)>0] - P.append(lsp) - pdf = pd.DataFrame(P); pdf.index=pdf.iloc[:,0].values ; pdf=pdf.iloc[:,1:4] - return(pdf.apply(pd.to_numeric)) - -def KabschAlignment( P,Q ): - # - # https://en.wikipedia.org/wiki/Kabsch_algorithm - # C++ VERSION: https://github.com/richardtjornhammar/RichTools/blob/master/src/richfit.cc - # IN VINCINITY OF LINE 524 - # - N,DIM = np.shape( P ) - M,DIM = np.shape( Q ) - if DIM>N or not N==M : - print( 'MALFORMED COORDINATE PROBLEM' ) - exit( 1 ) - - q0 , p0 = np.mean(Q,0) , np.mean(P,0) - cQ , cP = Q - q0 , P - p0 - - H = np.dot(cP.T,cQ) - I = np.eye( DIM ) - - U, S, VT = np.linalg.svd( H, full_matrices=False ) - Ut = np.dot( VT.T,U.T ) - I[DIM-1,DIM-1] = 2*(np.linalg.det(Ut) > 0)-1 - ROT = np.dot( VT.T,np.dot(I,U.T) ) - B = np.dot(ROT,P.T).T + q0 - np.dot(ROT,p0) - - return ( B ) - - -def WeightsAndScoresOf( P , bFA=False ) : - p0 = np.mean( P,0 ) - U, S, VT = np.linalg.svd( P-p0 , full_matrices=False ) - weights = U - if bFA : - scores = np.dot(S,VT).T - return ( weights , scores ) - scores = VT.T - return ( weights , scores ) - -def ShapeAlignment( P, Q , - bReturnTransform = False , - bShiftModel = True , - bUnrestricted = False ) : - # - # [*] C++ VERSION: https://github.com/richardtjornhammar/RichTools/blob/master/src/richfit.cc - # FIND SHAPE FIT FOR A SIMILIAR CODE IN THE RICHFIT REPO - # - description = """ - A NAIVE SHAPE FIT PROCEDURE TO WHICH MORE SOPHISTICATED - VERSIONS WRITTEN IN C++ CAN BE FOUND IN MY C++[*] REPO - - HERE WE WORK UNDER THE ASSUMPTION THAT Q IS THE MODEL - SO THAT WE SHOULD HAVE SIZE Q < SIZE P WITH UNKNOWN - ORDERING AND THAT THEY SHARE A COMMON SECOND DIMENSION - - IN THIS ROUTINE THE COARSE GRAINED DATA ( THE MODEL ) IS - MOVED TO FIT THE FINE GRAINED DATA ( THE DATA ) - """ - - N,DIM = np.shape( P ) - M,DIM = np.shape( Q ) - W = (N=M)*M - - if (DIM>W or N 0)-1 - ROT = np.dot( VT.T,np.dot(I,U.T) ) - if bReturnTransform : - return ( ROT,q0,p0 ) - - if bShiftModel :# SHIFT THE COARSE GRAINED DATA - B = np.dot(ROT,Q.T).T +p0 - np.dot(ROT,q0) - else : # SHIFT THE FINE GRAINED DATA - B = np.dot(ROT,P.T).T +q0 - np.dot(ROT,p0) - - return ( B ) - - -def low_missing_value_imputation ( fdf , fraction = 0.9 , absolute = 'True' ) : - # THIS SVD BASED IMPUTATION METHOD WAS FIRST WRITTEN FOR THE RANKOR PACKAGE - # ORIGINAL CODE IN https://github.com/richardtjornhammar/rankor/blob/master/src/rankor/imputation.py - # - import numpy as np - # - # fdf is a dataframe with NaN values - # fraction is the fraction of information that should be kept - # absolute is used if the data is positive - # - V = 
fdf.apply(pd.to_numeric).fillna(0).values - u,s,vt = np.linalg.svd(V,full_matrices=False) - s = np.array( [ s[i_] if i_ 0 : - used_analytes[node] = ','.join( group.index.values ) - pv,odds = group_significance( group , AllAnalytes=AllAnalytes, SigAnalytes=SigAnalytes , tolerance = threshold , alternative=alternative ) - node_sig[node] = pv ; marked_ = set( group.index.values ) - ancestors = df.loc[node,ancestors_id_label].replace('\n','').replace(' ','').split(item_delimiter) - if alexa_elim and pv > threshold : # USE ALEXAS ELIM ALGORITHM : IS NOT DEFAULT - continue - for u in ancestors : - if u in marked_analytes : - us = marked_analytes[u] - marked_analytes[u] = us | marked_ - else : - marked_analytes[u] = marked_ - df['Hierarchal,p'] = [ node_sig[idx] if idx in node_sig else 1. for idx in df.index.values ] - df['Included analytes,ids'] = [ used_analytes[idx] if idx in used_analytes else '' for idx in df.index.values ] - df = df.dropna() - return ( df ) diff --git a/build/lib/impetuous/impetuous.py b/build/lib/impetuous/impetuous.py deleted file mode 100644 index 6aa73728dfefd65f78c13ec62c5bb5bd365aa785..0000000000000000000000000000000000000000 --- a/build/lib/impetuous/impetuous.py +++ /dev/null @@ -1,21 +0,0 @@ -""" -Copyright 2020 RICHARD TJÖRNHAMMAR - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -from impetuous.clustering import * -from impetuous.quantification import * -import sys - -if __name__ == '__main__' : - inventors__ = "Richard Tjörnhammar and Edward Tjörnhammar" diff --git a/build/lib/impetuous/pathways.py b/build/lib/impetuous/pathways.py deleted file mode 100644 index 4f43df744497929f9b1f5346e6e498980ca53b23..0000000000000000000000000000000000000000 --- a/build/lib/impetuous/pathways.py +++ /dev/null @@ -1,370 +0,0 @@ -""" -Copyright 2019 RICHARD TJÖRNHAMMAR - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
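A small check of the Kabsch alignment defined in fit.py above: a rotated and translated copy of a synthetic point cloud is aligned back onto it, and the rigid fit should be essentially exact. The function is assumed importable from impetuous.fit.

```python
import numpy as np
from impetuous.fit import KabschAlignment

np.random.seed(2)
P = np.random.randn(50, 3)                                # reference coordinates

theta = 0.3
R = np.array( [ [ np.cos(theta), -np.sin(theta), 0. ] ,
                [ np.sin(theta),  np.cos(theta), 0. ] ,
                [ 0.           ,  0.           , 1. ] ] )
Q = np.dot( P , R.T ) + np.array( [ 1., -2., 0.5 ] )      # rotated and translated copy

B = KabschAlignment( P , Q )                              # P expressed in the frame of Q
print ( np.allclose( B , Q , atol=1e-6 ) )                # the rigid fit recovers Q
```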
-""" -import pandas as pd -import numpy as np -import re -from impetuous.convert import read_conversions, read_gene_ensemble_conversion, create_synonyms - -def find_compartment( istr ) : - return ( re.findall( r'\[(.*?)\]', istr ) ) - -class Group( object ) : - def get_next(self): - pass - -## PATHWAY SECTION -class GenericPathway( Group ) : - def __init__( self , path , gene_name_start = "ENSG0" , gene_mapping = None, # GENE ID MAPPING - is_own_pathway = False, list_like = False , add_pathway_prefix='', - gene_pos=0 , pathway_pos=1 , pathway_desc_pos=3, seperator='\t' ) : - self.file = path - self.prefered_genes = gene_name_start - self.pathways , self.pathway_names = {},{} - self.sep = seperator - self.is_own_pathway = is_own_pathway - self.gene_mapping = gene_mapping - self.gene_pos = gene_pos - self.pathway_pos = pathway_pos - self.pathway_desc_pos = pathway_desc_pos - self.list_like = list_like - self.add_pathway_prefix = add_pathway_prefix - self.replace_pair = None - self.internal_table = None - self.generator = None - self.active_read = False - self.restrict_id_to = None - if 'str' in str(type(self.file)): - self.read_and_parse() - else : - self.parse_df() - - def add_pathway_synonyms ( self , synonym_dict , prefix='' ) : - for pathway,pathway_name,genes in self.get_generator() : - synonyms = [] - for g in genes : - if g in synonym_dict or prefix+g in synonym_dict : - k = prefix * ( not g in synonym_dict ) + g - sdg = synonym_dict[k] - if 'list' in str(type(sdg)) : - [ synonyms.append(s) for s in sdg ] - if 'str' in str(type(sdg)) : - synonyms.append(sdg) - [ self.pathways[pathway].append(s) for s in synonyms ] - - def make_gmt_pathway_file ( self , filename , verbose=False , delimiter='\t', gene_mapping=None ): - #if 'None' in str(type( self.generator )) : - # self.generator = self.get_generator() - if not gene_mapping is None: - self.gene_mapping = gene_mapping - if 'str' in str(type(filename)) : - if 'str' in str( type( filename ) ) : - with open( filename, 'w' ) as o_file : - for pathway, pathway_name, genes in self.get_generator() : - row = list() ; row . append ( pathway ) - row . append ( pathway_name ) - genes_loc = genes - if 'dict' in str(type(self.gene_mapping)) : - genes_loc = [ self.gene_mapping[gene] if gene in self.gene_mapping.keys() else gene for gene in genes ] - [ row.append ( gene ) for gene in list(set(genes_loc)) ] - row_str = delimiter.join(row) - print(row_str) - o_file . write( row_str+'\n' ) - else : - print ( 'MUST SUPPLY A FILENAME' ) - - def make_internal_table(self, verbose=False, delimiter='\t', output_file=None ) : - self.internal_table = list() - generator = self.get_generator( ) - for pathway, pathway_name, genes in generator : - row = list() ; row . 
append ( pathway ) - row .append ( pathway_name ) - [ row .append ( gene ) for gene in genes ] - row_str = delimiter.join(row) - self.internal_table.append( row ) - - def parse_df(self): - if not self.is_own_pathway: - print('ERROR: OPERATION NOT SUPPORTED') - exit(1) - sfcv = set( self.file.columns.values ) - if 'gene' in sfcv and 'pathway' in sfcv and 'description' in sfcv: - print ( self.file.columns ) - else: - genes = self.file.index.values - for gene in genes : - pathway = self.add_pathway_prefix + gene - self.pathways[pathway] = [gene] - if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys() : - self.pathway_names[pathway] = self.gene_mapping[gene] - else : - self.pathway_names[pathway] = gene - - def get_generator_from_df(self): - self.parse_df() - for key in self.pathways.keys(): - yield(key,self.pathway_names[key],self.pathways[key]) - - def read_and_parse(self): - with open(self.file) as input: - pathway, pathway_name, genes = "", "", [] - for line in input: - lspl = line.split('\t') - if not 'None' in str(type(self.replace_pair)): - pathway = ( self.add_pathway_prefix + lspl[self.pathway_pos] ).replace( self.replace_pair[0],self.replace_pair[1] ) - else: - pathway = ( self.add_pathway_prefix + lspl[self.pathway_pos] ) - pathway_name = lspl[self.pathway_desc_pos] - if self.list_like : - # LIST LIKE PATHWAY INVENTORY CANNOT HAVE SELF MAPPING - # HERE WE ASSUME ALL GENES FOLLOW THE FIRST GENE_POS - genes = [ lspl[ir].replace('\n','') for ir in range(self.gene_pos,len(lspl)) ] - if not 'None' in str(type(self.gene_mapping)) : - renamed_genes = [ self.gene_mapping[gene] if gene in self.gene_mapping.keys() else gene for gene in genes ] - genes = renamed_genes - if pathway in self.pathways : - [ self.pathways[pathway].append(gene) for gene in genes ] - else : - self.pathways[pathway] = genes - self.pathway_names[pathway] = pathway_name - else : - if not line.startswith(self.prefered_genes) or len(line)==0: - continue - gene = lspl[ self.gene_pos ] - if self.is_own_pathway : - if gene in self.pathways : - continue; - else: - pway=self.add_pathway_prefix + gene - self.pathways[pway] = [gene] - if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys(): - self.pathway_names[pway] = self.gene_mapping[gene] - else: - self.pathway_names[pway] = gene - else : - if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys(): - gene = self.gene_mapping[gene] - if pathway in self.pathways: - self.pathways[pathway].append(gene) - else: - self.pathways[pathway] = [gene] - self.pathway_names[pathway] = pathway_name - - def dump_pathways(self): - return ( self.pathways,self.pathway_names ) - - def get_generator( self, verbose=False ): - if self.active_read : - if not self.file is None : - if 'str' in str(type(self.file)): - self.read_and_parse() - else: - self.parse_df() - if verbose : - print( self.dump_pathways() ) - for key in self.pathways.keys(): - yield( key , self.pathway_names[key] , self.pathways[key] ) - - -class Reactome( GenericPathway ) : - def __init__( self , path , gene_name_start = 'ENSG0' ,pathway_desc_pos=3, - gene_mapping = None, lexical_restrictions = None, # GENE ID MAPPING - is_own_pathway = False, restrict_id_to = None ) : - self.file = path - self.prefered_genes = gene_name_start - self.pathways , self.pathway_names ,self.pathway_compartments = {},{},{} - self.is_own_pathway = is_own_pathway - self.gene_mapping = gene_mapping - self.gene_pos = 0 - self.pathway_pos = 1 - self.pathway_desc_pos = 
pathway_desc_pos - self.list_like = False - self.add_pathway_prefix = '' - self.replace_pair = None - self.lexical_restrictions = lexical_restrictions - self.lexical_restriction_category_pos=None - self.skipstr = None - self.pathway_tag = None - self.restrict_id_to = restrict_id_to - self.active_read = False - if 'str' in str(type(self.file)): - self .read_and_parse(keep_str='ENSG0') - else: - self .parse_df() - - def read_and_parse( self , keep_str=None ) : - with open( self.file ) as input : - pathway, pathway_name, genes = "", "", [] - for line in input : - if not self.skipstr is None : - if line.startswith(self.skipstr): - continue - if not keep_str is None : - if not line.startswith(keep_str): - continue - lspl = line.split('\t') - if ('[' in line) and (']' in line) : - compartment = find_compartment( line )[0] - else : - compartment = '' - if not 'None' in str(type(self.replace_pair)) : - pathway = ( self.add_pathway_prefix + lspl[self.pathway_pos] ).replace( self.replace_pair[0],self.replace_pair[1] ) - else : - pathway = ( self.add_pathway_prefix + lspl[self.pathway_pos] ) - if not self.restrict_id_to is None : - if not pathway in self.restrict_id_to: - continue - pathway_name = lspl[ self.pathway_desc_pos ] - if not self.lexical_restrictions is None: - if not np.sum( [ int(lr in pathway_name) for lr in self.lexical_restrictions] )>0 : #or len(compartment)<4 - continue - if self.lexical_restriction_category_pos is None : - lex_restrict = pathway_name - else : - lex_restrict = lspl[ self.lexical_restriction_category_pos ] - if self.list_like : - # LIST LIKE PATHWAY INVENTORY CANNOT HAVE SELF MAPPING - # HERE WE ASSUME ALL GENES FOLLOW THE FIRST GENE_POS - genes = [ lspl[ir].replace('\n','') for ir in range(self.gene_pos,len(lspl)) ] - if not 'None' in str(type(self.gene_mapping)) : - renamed_genes = [ self.gene_mapping[gene] if gene in self.gene_mapping.keys() else gene for gene in genes ] - genes = renamed_genes - if pathway in self.pathways : - [ self.pathways[pathway].append(gene) for gene in genes ] - else : - self.pathways[pathway] = genes - self.pathway_names[pathway] = pathway_name - self.pathway_compartments[pathway] = compartment - else : - if not line.startswith(self.prefered_genes) or len(line)==0: - continue - gene = lspl[ self.gene_pos ] - if self.is_own_pathway : - if gene in self.pathways : - continue; - else: - pway = self.add_pathway_prefix + gene - self.pathways[pway] = [gene] - if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys(): - self.pathway_names[pway] = self.gene_mapping[gene] - else: - self.pathway_names[pway] = gene - else : - if not self.pathway_tag is None: - if not self.pathway_tag in pathway: - continue - if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys(): - gene = self.gene_mapping[gene] - if pathway in self.pathways: - self.pathways[pathway].append(gene) - else : - if True : # self.lexical_restrictions is None : - self.pathways[pathway] = [gene] - self.pathway_names[pathway] = pathway_name - else : - if len(set(self.lexical_restrictions)-set(lex_restrict.split()))0: - self.pathways[pathway] = [gene] - self.pathway_names[pathway] = pathway_name - - def add_pathway_synonyms( self , synonym_dict, prefix='' ) : - for pathway, pathway_name, genes in self.get_generator() : - synonyms = [] - for g in genes: - if g in synonym_dict or prefix+g in synonym_dict : - k = prefix * ( not g in synonym_dict ) + g - sdg = synonym_dict[ k ] - if 'list' in str(type(sdg)) : - [ synonyms.append(s) for s in sdg 
] - if 'str' in str(type(sdg)) : - synonyms.append(sdg) - [ self.pathways[pathway].append(s) for s in synonyms ] - - - def read_extra_information ( self , file_name , valpos = 0 , keypos = 1 , - add_value_prefix = '' , required_content = 'HSA' , - add_desc_pos = None , comp_pos = None ) : - - with open ( file_name ) as input : - pathway, genes = "", [] - for line in input : - lspl = line.split( '\t' ) - pw = lspl[ keypos ] - if required_content in pw : - gn = add_value_prefix + lspl[ valpos ] - if not self.restrict_id_to is None : - if not pw in self.restrict_id_to : - continue - if pw in self.pathways : - self.pathways[pw] .append(gn) - else : - self.pathways[pw] = [gn] - self.pathway_names[ pw ] = '' - if not add_desc_pos is None : - if lspl[add_desc_pos] not in self.pathway_names[pw] : - self.pathway_names[pw] += lspl[add_desc_pos] + ', ' - if not comp_pos is None : - if self.pathway_compartments is None : - self.pathway_compartments = {} - if pw not in self.pathway_compartments : - self.pathway_compartments[pw] = [ ''.join(find_compartment(lspl[comp_pos])) ] - else : - self.pathway_compartments[pw] .append( ''.join(find_compartment(lspl[comp_pos])) ) - - def add_extra_information_from_dictionary( self , dictionary, map_onto_genes=True ): - for pathway in self.pathways : - genes = self.pathways[pathway] - add_these = [] - [ [ add_these.append(v) for v in dictionary[g] ] for g in genes if g in dictionary.keys() ] - if len(add_these)>0: - [ self.pathways[pathway].append(a) for a in add_these] - -def print_generator( generator , show_max_nr=3 , find_str=None ): - for pathway,pathway_name,genes in generator: - if not find_str is None: - if not find_str in pathway: - continue - print('Pathway: ' , pathway ) - print('Pathway name: ', pathway_name ) - print('Gene amount : ', len(genes), ' \t Gene name of first: ', genes[0] ) - print('Gene dump : ', genes ) - show_max_nr-=1 - if show_max_nr == 0: - break; - -def create_listdictionary_from_file( filename , delimiter = '\t' ,add_key_prefix='' ): - wanted_dictionary = dict() - with open( filename ) as input : - value, keyval, descriptor = "", "", [] - for line in input: - all_vals = line.replace('\n','').replace(' ','').split( delimiter ) - keyv = add_key_prefix+all_vals[0] - valv = all_vals[1:] - wanted_dictionary[keyv] = valv - return ( wanted_dictionary ) - -def flatten_generator(pathway_generator_object): - for pathway,genes in pathway_generator_object.pathways.items() : - ngenes = [] - [ ngenes.append(g) if 'str' in str(type(g)) else [ ngenes.append(q) for q in g ] for g in genes ] - pathway_generator_object.pathways[pathway] = list(set(ngenes)) - if pathway in pathway_generator_object.pathway_compartments: - pathway_generator_object.pathway_names[pathway] = '[' + \ - ','.join(list(set( pathway_generator_object.pathway_compartments[pathway] )) ) + \ - '] ' + pathway_generator_object.pathway_names[pathway] - -if __name__ == '__main__' : - print('ADD BACK REACTOME TEST') - diff --git a/build/lib/impetuous/probabilistic.py b/build/lib/impetuous/probabilistic.py deleted file mode 100644 index 2199fcf9073efba547e4d20d30ed7318f4275280..0000000000000000000000000000000000000000 --- a/build/lib/impetuous/probabilistic.py +++ /dev/null @@ -1,125 +0,0 @@ -""" -Copyright 2020 RICHARD TJÖRNHAMMAR - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
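The GenericPathway reader above can turn a flat gene-to-pathway mapping into a GMT file. A hedged sketch; the file name and its column layout (gene id in column 0, pathway id in column 1, description in column 3, tab separated) are assumptions matching the default *_pos arguments.

```python
from impetuous.pathways import GenericPathway

pw = GenericPathway( './gene2pathway.tsv' ,            # hypothetical mapping file
                     gene_name_start = 'ENSG0' ,       # keep only Ensembl gene lines
                     gene_pos = 0 , pathway_pos = 1 , pathway_desc_pos = 3 )
pw.make_gmt_pathway_file( './pathways.gmt' )           # one row per pathway: id , name , genes
```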
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import numpy as np -import pandas as pd - -def hcurve ( x ) : - return ( x ) - -def D ( d , bLinear=False ) : - if bLinear : - return ( int(np.floor(3.5*d)) ) - if d==0 : - return ( 0 ) - else : - return ( D(d-1) + 2**d ) - -def cluster_probability_of_x ( X , - evidence , labels , centroids , - K = None , bLinear = False ) : - - ispanda = lambda P: 'pandas' in str(type(P)).lower() - BustedPanda = lambda R : R.values if ispanda(R) else R - - desc_ = """ - DOING THE SIMPLE PROBABILISTIC MODELLING FOR THE DATA - YOU CAN USE THIS TO DO VORONOI TESSELATION WITH K = 1 IN 2D - X IS A LIST OF COORDINATE POINTS TO EVALUATE THE FUNCTION ON - EVIDENCE IS A LIST OF ACTUAL EXPERIMENTAL DATA COORDINATES - LABELS ARE THE IMPUTED CLUSTER LABELS OF THE EXPERIMENTAL DATA POINTS - CENTROIDS ARE THE CENTROID COORDINATES - K IS THE NEIGHBOUR DIMENSION - BLINEAR SPECIFIES THAT ... - - labels , centroids = impq.knn_clustering_alignment ( P , Q ) - evidence = P.values - - THIS IS NOT THE SAIGA YOU ARE LOOKING FOR -RICHARD TJÖRNHAMMAR - """ - ev_ = BustedPanda ( evidence ) - N , DIM = np.shape( ev_ ) - if K is None : - K = D ( DIM , bLinear ) - # - # INPUT TYPE CHECKING - if True : - bFailure = False - if not ( 'array' in str(type(labels)) and len(np.shape(labels)) == 2 ) : - print("THE LABELS SHOULD BE 2D ARRAY OF DIMENSION N,1 "); bFailure=True - if not ( 'array' in str(type(ev_)) and len(np.shape(ev_)) == 2 ) : - print("EV_ SHOULD BE 2D ARRAY OF DIMENSION N,DIM "); bFailure=True - if not ( 'array' in str(type(centroids)) and len(np.shape(centroids)) == 2 ) : - print("CENTROIDS SHOULD BE 2D ARRAY OF DIMENSION M,DIM "); bFailure=True - if not ( 'array' in str(type(X)) and len(np.shape(X)) == 2 ) : - print("X SHOULD BE 2D ARRAY OF DIMENSION P,DIM "); bFailure=True - - if bFailure : - print( desc_ ) - exit(1) - # - all_cluster_names = sorted(list(set( [ l[0] for l in labels] ))) - print ( all_cluster_names ) - # - Rdf = None - for x in X : - S = sorted([ (s,i) for (s,i) in zip( np.sum( (x - ev_)**2 , 1 ) , range(len(ev_)) ) ])[:K] - present_clusters = [ labels[s[1]][0] for s in S ] - pY = [] - for yi in all_cluster_names: - pY.append( np.sum(yi==present_clusters)/K ) - tdf = pd.DataFrame( [[*x,*pY]] , - columns = [ *['d'+str(i) for i in range(len(x))] , - *all_cluster_names ] ) - if Rdf is None : - Rdf = tdf - else : - Rdf = pd.concat([Rdf,tdf]) - - Rdf.index = range(len(X)) - probability_df = Rdf.copy() - - return ( probability_df ) - -import impetuous.quantification as impq -if __name__ == '__main__' : - # - # IF YOU REQUIRE THE DATA THEN LOOK IN : - # https://github.com/richardtjornhammar/RichTools - # WHERE YOU CAN FIND THE FILES USED HERE - # - if True : - print( D(1),D(2),D(3),D(4) ) - print( D(1,True),D(2,True),D(3,True),D(4,True) ) - - colors = {'H':'#777777','C':'#00FF00','N':'#FF00FF','O':'#FF0000','P':'#FAFAFA'} - Q = read_xyz( name='data/naj.xyz' , header=2 , sep=' ' ) - P = read_xyz( name='data/cluster0.xyz' , header=2 , sep='\t' ) # IF IT FAILS THEN CHECK THE FORMATING OF THIS FILE - - if True : - labels , centroids = impq.knn_clustering_alignment ( P , Q ) - pd.DataFrame( labels ) .to_csv( 
'labels.csv' ,'\t' ) - pd.DataFrame( centroids).to_csv( 'centroids.csv','\t' ) - else : - labels = pd.read_csv('labels.csv','\t',index_col=0).values - centroids = pd.read_csv('centroids.csv','\t',index_col=0).values - - p = cluster_probability_of_x ( X = np.array( [ [-20.5, 24.6 , 84] , [10,10,10] ] ) , - evidence = P.values , - labels = labels.reshape(-1,1) , - centroids = centroids , - K = None ) - print ( p ) diff --git a/build/lib/impetuous/quantification.py b/build/lib/impetuous/quantification.py deleted file mode 100644 index 5acc07a37e1c46bfcdfe44a18e7209c827d68370..0000000000000000000000000000000000000000 --- a/build/lib/impetuous/quantification.py +++ /dev/null @@ -1,1081 +0,0 @@ -""" -Copyright 2020 RICHARD TJÖRNHAMMAR - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import numpy as np -import pandas as pd -from impetuous.convert import create_synonyms , flatten_dict -from scipy.stats import rankdata -from scipy.stats import ttest_rel , ttest_ind , mannwhitneyu -from scipy.stats.mstats import kruskalwallis as kruskwall -from sklearn.decomposition import PCA -import itertools - -def SubArraysOf ( Array,Array_=None ) : - if Array_ == None : - Array_ = Array[:-1] - if Array == [] : - if Array_ == [] : - return ( [] ) - return( SubArraysOf(Array_,Array_[:-1]) ) - return([Array]+SubArraysOf(Array[1:],Array_)) - -def permuter( inputs , n ) : - # permuter( inputs = ['T2D','NGT','Female','Male'] , n = 2 ) - return( [p[0] for p in zip(itertools.permutations(inputs,n))] ) - -def grouper ( inputs, n ) : - iters = [iter(inputs)] * n - return zip ( *iters ) - -def whiten_data ( Xdf ) : - # REMEMBER BOYS AND GIRLS THIS IS SOMETHING YOU MIGHT NOT WANT TO DO :) - mean_center = lambda x: x-np.mean(x,0) - X = Xdf.values - u , s , v = np.linalg.svd( mean_center(X),full_matrices=False ) - X_white = np.dot(X,np.dot( np.diag(s**-1),np.abs(v) )) # we don't know the sign - return ( pd.DataFrame( X_white,index=Xdf.index.values,columns=Xdf.columns ) ) - -import re -def find_category_variables( istr ) : - return ( re.findall( r'C\((.*?)\)', istr ) ) - -def encode_categorical( G = ['Male','Male','Female'] ): - # - # CREATES AN BINARY ENCODING MATRIX FROM THE SUPPLIED LIST - # USES A PANDAS DATAFRAME AS INTERMEDIATE FOR ERROR CHECKING - # THIS PUTS THE O IN OPLS (ORTHOGONAL) - # - ugl = list(set(G)) ; n = len(ugl) ; m = len(G) - lgu = { u:j for u,j in zip(ugl,range(n)) } - enc_d = pd.DataFrame( np.zeros(m*n).reshape(-1,n),columns=ugl ) - for i in range ( m ) : - j = lgu[G[i]] - enc_d.iloc[i,j] = 1 - return ( enc_d ) - -def create_encoding_journal( use_categories, journal_df ) : - encoding_df = None - for category in use_categories : - catvals = journal_df.loc[category].to_list() - cat_encoding = encode_categorical( catvals ) - cat_encoding.index = journal_df.columns.values - if encoding_df is None : - encoding_df = cat_encoding.T - else : - encoding_df = pd.concat([encoding_df,cat_encoding.T]) - return ( encoding_df ) - -def quantify_density_probability ( rpoints , cutoff = None ) : - # - # DETERMINE P VALUES - loc_pdf = 
lambda X,mean,variance : [ 1./np.sqrt(2.*np.pi*variance)*np.exp(-((x-mean)/(2.*variance))**2) for x in X ] - from scipy.special import erf as erf_ - loc_cdf = lambda X,mean,variance : [ 0.5*( 1. + erf_( (x-mean)/np.sqrt( 2.*variance ) ) ) for x in X ] - loc_Q = lambda X,mean,variance : [ 1. - 0.5*( 1. + erf_( (x-mean)/np.sqrt( 2.*variance ) ) ) for x in X ] - M_,Var_ = np.mean(rpoints),np.std(rpoints)**2 - # - # INSTEAD OF THE PROBABILTY DENSITY WE RETURN THE FRACTIONAL RANKS - # SINCE THIS ALLOWS US TO CALCULATE RANK STATISTICS FOR THE PROJECTION - corresponding_density = rankdata (rpoints,'average') / len(rpoints) # loc_pdf( rpoints,M_,Var_ ) - corresponding_pvalue = loc_Q ( rpoints,M_,Var_ ) - # - # HERE WE MIGHT BE DONE - if not cutoff is None : - resolution = 10. ; nbins = 100. - # - # ONLY FOR ASSESING - h1,h2 = np.histogram(rpoints,bins=int(np.ceil(len(rpoints)/resolution))) - bin_radius = 0.5 * ( h2[1:] + h2[:-1] ) - radial_density = np.cumsum( h1 )/np.sum( h1 ) # lt - # - # NOW RETRIEVE ALL DENSITIES OF THE RADII - tol = 1./nbins - corresponding_radius = np.min( bin_radius[radial_density > cutoff/nbins] ) - return ( corresponding_pvalue , corresponding_density, corresponding_radius ) - return ( corresponding_pvalue , corresponding_density ) - -def find_category_interactions ( istr ) : - all_cats = re.findall( r'C\((.*?)\)', istr ) - interacting = [ ':' in c for c in istr.split(')') ][ 0:len(all_cats) ] - interacting_categories = [ [all_cats[i-1],all_cats[i]] for i in range(1,len(interacting)) if interacting[i] ] - return ( interacting_categories ) - -def interpret_problem ( analyte_df , journal_df , formula , bVerbose=False ) : - # - # THE JOURNAL_DF IS THE COARSE GRAINED DATA (THE MODEL) - # THE ANALYTE_DF IS THE FINE GRAINED DATA (THE DATA) - # THE FORMULA IS THE SEMANTIC DESCRIPTION OF THE PROBLEM - # - interaction_pairs = find_category_interactions ( formula.split('~')[1] ) - add_pairs = [] - if len( interaction_pairs )>0 : - for pair in interaction_pairs : - journal_df.loc[ ':'.join(pair) ] = [ p[0]+'-'+p[1] for p in journal_df.loc[ pair,: ].T.values ] - add_pairs.append(':'.join(pair)) - use_categories = list(set(find_category_variables(formula.split('~')[1]))) - use_categories = [u for u in use_categories if 'C('+u+')' in set(formula.replace(' ','').split('~')[1].split('+'))] - use_categories = [ *use_categories,*add_pairs ] - # - if len( use_categories )>0 : - encoding_df = create_encoding_journal ( use_categories , journal_df ).T - else : - encoding_df = None - # - if bVerbose : - print ( [ v for v in encoding_df.columns.values ] ) - print ( 'ADD IN ANY LINEAR TERMS AS THEIR OWN AXIS' ) - # - # THIS TURNS THE MODEL INTO A MIXED LINEAR MODEL - add_df = journal_df.loc[ [c.replace(' ','') for c in formula.split('~')[1].split('+') if not 'C('in c],: ] - if len(add_df)>0 : - if encoding_df is None : - encoding_df = add_df.T - else : - encoding_df = pd.concat([ encoding_df.T , - journal_df.loc[ [ c.replace(' ','') for c in formula.split('~')[1].split('+') if not 'C(' in c] , : ] ]).T - return ( encoding_df ) - - -def calculate_alignment_properties ( encoding_df , quantx, quanty, scorex, - analyte_df = None , journal_df = None , - bVerbose = False , synonyms = None , - blur_cutoff = 99.8 , exclude_labels_from_centroids = [''] , - study_axii = None , owner_by = 'tesselation' ): - - if bVerbose : - print ( np.shape(encoding_df) ) - print ( np.shape(analyte_df) ) - print ( 'qx:',np.shape(quantx) ) - print ( 'qy:',np.shape(quanty) ) - print ( 'sx:',np.shape(scorex) ) - print ( 'WILL 
ASSIGN OWNER BY PROXIMITY TO CATEGORICALS' ) - - if analyte_df is None or journal_df is None: - print ( 'USER MUST SUPPLY ANALYTE AND JOURNAL DATA FRAMES' ) - exit(1) - # - # THESE ARE THE CATEGORICAL DESCRIPTORS - use_centroid_indices = [ i for i in range(len(encoding_df.columns.values)) if ( - encoding_df.columns.values[i] not in set( exclude_labels_from_centroids ) - ) ] - # - use_centroids = list( quanty[use_centroid_indices] ) - use_labels = list( encoding_df.columns.values[use_centroid_indices] ) - # - if owner_by == 'tesselation' : - transcript_owner = [ use_labels[ np.argmin([ np.sum((xw-cent)**2) for cent in use_centroids ])] for xw in quantx ] - sample_owner = [ use_labels[ np.argmin([ np.sum((yw-cent)**2) for cent in use_centroids ])] for yw in scorex ] - # - if owner_by == 'angle' : - anglular_proximity = lambda B,A : 1 - np.dot(A,B) / ( np.sqrt(np.dot(A,A))*np.sqrt(np.dot(B,B)) ) - transcript_owner = [ use_labels[ np.argmin([ anglular_proximity(xw,cent) for cent in use_centroids ])] for xw in quantx ] - sample_owner = [ use_labels[ np.argmin([ anglular_proximity(yw,cent) for cent in use_centroids ])] for yw in scorex ] - # - # print ( 'PLS WEIGHT RADIUS' ) - radius = lambda vector:np.sqrt(np.sum((vector)**2)) # radii - # - # print ( 'ESTABLISH LENGTH SCALES' ) - xi_l = np.max(np.abs(quantx),0) - # - rpoints = np.array( [ radius( v/xi_l ) for v in quantx ] ) # HERE WE MERGE THE AXES - xpoints = np.array( [ radius((v/xi_l)[0]) for v in quantx ] ) # HERE WE USE THE X AXES - ypoints = np.array( [ radius((v/xi_l)[1]) for v in quantx ] ) # HERE WE USE THE Y AXES - # - # print ( 'ESTABLISH PROJECTION OF THE WEIGHTS ONTO THEIR AXES' ) - proj = lambda B,A : np.dot(A,B) / np.sqrt( np.dot(A,A) ) - # - # ADDING IN ADDITIONAL DIRECTIONS - # THAT WE MIGHT BE INTERESTED IN - if 'list' in str( type( study_axii ) ): - for ax in study_axii : - if len( set( ax ) - set( use_labels ) ) == 0 and len(ax)==2 : - axsel = np.array([ use_centroids[i] for i in range(len(use_labels)) if use_labels[i] in set(ax) ]) - axis_direction = axsel[0]-axsel[1] - use_labels .append( '-'.join(ax) ) - use_centroids .append( np.array(axis_direction) ) - - proj_df = pd.DataFrame( [ [ np.abs(proj(P/xi_l,R/xi_l)) for P in quantx ] for R in use_centroids ] , - index = use_labels , columns=analyte_df.index.values ) - # - # print ( 'P VALUES ALIGNED TO PLS AXES' ) - for idx in proj_df.index : - proj_p,proj_rho = quantify_density_probability ( proj_df.loc[idx,:].values ) - proj_df = proj_df.rename( index = {idx:idx+',r'} ) - proj_df.loc[idx+',p'] = proj_p - proj_df.loc[idx+',rho'] = proj_rho - # - # print ( 'THE EQUIDISTANT 1D STATS' ) - corresponding_pvalue , corresponding_density , corresponding_radius = quantify_density_probability ( rpoints , cutoff = blur_cutoff ) - # - # print ( 'THE TWO XY 1D STATS' ) - corr_pvalue_0 , corr_density_0 = quantify_density_probability ( xpoints ) - corr_pvalue_1 , corr_density_1 = quantify_density_probability ( ypoints ) - # - bOrderedAlphas = False - if True : - # DO ALPHA LEVELS BASED ON DENSITY - bOrderedAlphas = True - use_points = rpoints > corresponding_radius - ordered_alphas = [ float(int(u))*0.5 + 0.01 for u in use_points ] - - result_dfs = [] - # - # print ( 'COMPILE RESULTS FRAME' ) - for ( lookat,I_ ) in [ ( quantx , 0 ) , - ( scorex , 1 ) ] : - lookat = [ [ l[0],l[1] ] for l in lookat ] - if I_ == 1 : - aidx = journal_df.columns.values - else : - aidx = analyte_df.index.values - qdf = pd.DataFrame( [v[0] for v in lookat] , index=aidx , columns = ['x'] ) - qdf['y'] = [ v[1] for 
v in lookat ] - names = aidx - if I_ == 0 : - qdf[ 'owner' ] = transcript_owner - qdf['Corr,p' ] = corresponding_pvalue - qdf['Corr,r' ] = corresponding_density - qdf['Corr0,p'] = corr_pvalue_0 - qdf['Corr0,r'] = corr_density_0 - qdf['Corr1,p'] = corr_pvalue_1 - qdf['Corr1,r'] = corr_density_1 - qdf = pd.concat([qdf.T,proj_df]).T - if bOrderedAlphas : - qdf[ 'alpha' ] = ordered_alphas - else : - qdf['alpha'] = [ '0.3' for a in transcript_owner ] - else : - qdf['owner'] = sample_owner # The default should be the aligned projection weight - qdf['alpha'] = [ '0.2' for n in names ] - if synonyms is None : - qdf['name'] = names - else : - qdf['name'] = [ synonyms[v] if v in synonyms else v for v in names ] - result_dfs.append(qdf.copy()) - return ( result_dfs ) - - -def run_rpls_regression ( analyte_df , journal_df , formula , - bVerbose = False , synonyms = None , blur_cutoff = 99.8 , - exclude_labels_from_centroids = [''] , pls_components = 2, - bDeveloperTesting = False , - study_axii = None , owner_by = 'tesselation' - ) : - inventors__ = "Richard Tjörnhammar (RT) and Edward Tjörnhammar" - NOTE__ = "Edward Tjörnhammar (ET) early major contributor to this method. Inventors: "+inventors__+". RT is the main developer." - - encoding_df = interpret_problem ( analyte_df , journal_df , formula , bVerbose = bVerbose ) - from sklearn.cross_decomposition import PLSRegression as PLS - - if not bDeveloperTesting : - pls_components = 2 - - rpls = PLS( pls_components ) - rpls_res = rpls.fit( X = analyte_df.T.values , - Y = encoding_df .values ) - quantx,quanty = rpls_res.x_weights_ , rpls_res.y_weights_ - scorex = rpls_res.x_scores_ - - res_df = calculate_alignment_properties ( encoding_df , quantx, quanty, scorex, - journal_df = journal_df, analyte_df = analyte_df , blur_cutoff = blur_cutoff , - bVerbose = bVerbose, exclude_labels_from_centroids = exclude_labels_from_centroids , - study_axii = study_axii , owner_by = owner_by ) - - return ( res_df ) - - -import impetuous.fit as ifit -import impetuous.clustering as icluster -def run_shape_alignment_clustering ( analyte_df , journal_df , formula, bVerbose = False ) : - NOTE_ = "This is just a kmeans in arbitrary dimensions that start out with centroids that have been shape aligned" - encoding_df = interpret_problem ( analyte_df , journal_df , formula , bVerbose = bVerbose ) - - Q = encoding_df.T.apply( lambda x:(rankdata(x,'average')-0.5)/len(x) ).values - P = analyte_df .apply( lambda x:(rankdata(x,'average')-0.5)/len(x) ).values - - centroids = ifit.ShapeAlignment( P, Q , - bReturnTransform = False , - bShiftModel = True , - bUnrestricted = True ) - # - # FOR DIAGNOSTIC PURPOSES - centroids_df = pd.DataFrame ( centroids , - index = encoding_df.columns , - columns = encoding_df.index ) - lookup_ = {i:n for n,i in zip( centroids_df.index,range(len(centroids_df.index)) ) } - - labels , centroids = icluster.seeded_kmeans( P , centroids ) - - res_df = pd.DataFrame( [labels] , columns=analyte_df.index , index=['cluster index'] ) - nam_df = pd.DataFrame( [ lookup_[l] for l in labels ] , - columns=['cluster name'] , - index = analyte_df.index ).T - - res_df = pd.concat( [ res_df , nam_df ] ) - clusters_df = pd.concat( [ centroids_df, pd.DataFrame( res_df.T.groupby('cluster name').apply(len),columns=['size']) ] ,axis=1 ) - - return ( res_df , clusters_df ) - -def knn_clustering_alignment( P , Q ) : - - NOTE_ = "This is just a standard kmeans in arbitrary dimensions that start out with centroids that have been shape aligned" - ispanda = lambda P: 'pandas' in 
str(type(P)).lower() - BustedPanda = lambda R : R.values if ispanda(R) else R - P_ = BustedPanda ( P ) - Q_ = BustedPanda ( Q ) - - centroids = ifit .ShapeAlignment ( P_ , Q_ , - bReturnTransform = False , - bShiftModel = True , - bUnrestricted = True ) - - if ispanda ( Q ) : - # - # FOR DIAGNOSTIC PURPOSES - centroids_df = pd.DataFrame ( centroids , - index = Q.index , - columns = Q.columns ) - lookup_ = {i:n for n,i in zip( centroids_df.index,range(len(centroids_df.index)) ) } - - labels , centroids = icluster.seeded_kmeans( P_ , centroids ) - - if ispanda ( Q ) and ispanda ( P ) : - # - # MORE DIAGNOSTICS - res_df = pd.DataFrame( [labels] , columns=P.index , index=['cluster index'] ) - res_df .loc[ 'cluster name' ] = [ lookup_[l] for l in res_df.loc['cluster index'].values ] - print ( res_df ) - - return ( np.array(labels), np.array(centroids) ) - - - - -crop = lambda x,W:x[:,:W] -def run_shape_alignment_regression( analyte_df , journal_df , formula , - bVerbose = False , synonyms = None , blur_cutoff = 99.8 , - exclude_labels_from_centroids = [''] , - study_axii = None , owner_by = 'tesselation' , - transform = crop ) : - - NOTE__ = "Richard Tjörnhammars method that evolved as a synthesis of the work done together with Edward Tjörnhammar on the rpls method" - print ( 'WARNING: STILL UNDER DEVELOPMENT' ) - print ( 'WARNING: DEFAULT IS TO CROP ALIGNED FACTORS!!') - - encoding_df = interpret_problem ( analyte_df , journal_df , formula , bVerbose = bVerbose ) - - Q = encoding_df.T.apply( lambda x:(rankdata(x,'average')-0.5)/len(x) ).copy().values - P = analyte_df .apply( lambda x:(rankdata(x,'average')-0.5)/len(x) ).copy().values - - centroids = ifit.ShapeAlignment( P, Q , - bReturnTransform = False , - bShiftModel = True , - bUnrestricted = True ) - # - # FOR DIAGNOSTIC PURPOSES - centroids_df = pd.DataFrame ( centroids , - index = encoding_df.columns , - columns = encoding_df.index ) - - xws = ifit.WeightsAndScoresOf( P ) - yws = ifit.WeightsAndScoresOf( centroids ) - - W = np.min( [*np.shape(xws[0]),*np.shape(yws[0])] ) - - quantx = transform( xws[0],W ) - quanty = transform( yws[0],W ) - scorex = transform( xws[1],W ) - - res_df = calculate_alignment_properties ( encoding_df , quantx, quanty, scorex, - analyte_df = analyte_df.copy() , journal_df = journal_df.copy() , - blur_cutoff = blur_cutoff , bVerbose = bVerbose, - exclude_labels_from_centroids = exclude_labels_from_centroids , - study_axii = study_axii , owner_by = owner_by, synonyms=synonyms ) - return ( res_df ) - - -def add_foldchanges ( df, information_df , group='', fc_type=0 , foldchange_indentifier = 'FC,') : - all_vals = list(set(information_df.loc[group].values)) - pair_values = [all_vals[i] for i in range(len(all_vals)) if i<2 ] - group1 = df.iloc[:,[n in pair_values[0] for n in information_df.loc[group].values] ].T - group2 = df.iloc[:,[n in pair_values[1] for n in information_df.loc[group].values] ].T - if fc_type == 0: - FC = np.mean(group1.values,0) - np.mean(group2.values,0) - if fc_type == 1: - FC = np.log2( np.mean(group1.values,0) - np.mean(group2.values,0) ) - FCdf = pd.DataFrame(FC,index=df.index,columns=[foldchange_indentifier+'-'.join(pair_values) ] ) - df = pd.concat([df.T,FCdf.T]).T - return ( df ) - - -from statsmodels.stats.multitest import multipletests -def adjust_p ( pvalue_list , method = 'fdr_bh' , alpha = 0.05, - check_r_bh = False , is_sorted = False , - returnsorted = False - ) : - """ WRAPPER FOR MULTIPLE HYPOTHESIS TESTING - pvalue_list = 
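`add_foldchanges` above appends a per-analyte contrast column built from the column-wise means of two journal groups; note that its `fc_type == 1` branch takes `log2` of a difference rather than of a ratio, which presumably assumes suitably transformed input. A rough plain-pandas equivalent of the default (`fc_type == 0`) behaviour on a hypothetical frame:

```python
import numpy as np
import pandas as pd

# Hypothetical analyte matrix (rows: analytes, columns: samples) plus one journal row.
df  = pd.DataFrame(np.random.rand(4, 6),
                   index=['g1', 'g2', 'g3', 'g4'],
                   columns=[f's{i}' for i in range(6)])
grp = pd.Series(['Sick'] * 3 + ['Healthy'] * 3, index=df.columns, name='disease')

m1 = df.loc[:, grp == 'Sick'].mean(axis=1)      # mean per analyte in group 1
m2 = df.loc[:, grp == 'Healthy'].mean(axis=1)   # mean per analyte in group 2
df['FC,Sick-Healthy'] = m1 - m2                  # fc_type == 0 : difference of means
print(df.head())
```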
[0.00001,0.01,0.0002,0.00005,0.01,0.1,0.2,0.4,0.5,0.6,0.7,0.8,0.9,0.99,0.0114,0.15,0.23,0.20] - """ - available_methods = set( [ 'bonferroni' , 'sidak', - 'holm-sidak' , 'holm' , 'simes-hochberg' , - 'hommel' , 'fdr_bh' , 'fdr_by' , 'fdr_tsbh' , - 'fdr_tsbky' ] ) - if method not in available_methods : - print ( available_methods ) - r_equiv = { 'fdr_bh':'BH' } - if check_r_bh and method in r_equiv : - from rpy2.robjects.packages import importr - from rpy2.robjects.vectors import FloatVector - r_stats = importr('stats') - p_adjust = r_stats.p_adjust ( FloatVector(pvalue_list), method = r_equiv[method] ) - else : - p_adjust_results = multipletests ( pvalue_list, alpha=alpha, method=method, - is_sorted = is_sorted , returnsorted = returnsorted ) - p_adjust = [ p_adj for p_adj in p_adjust_results[1] ] - return ( p_adjust ) - -def qvalues ( p_values_in , pi0 = None ) : - p_s = p_values_in - if pi0 is None : - pi0 = 1. - qs_ = [] - m = float(len(p_s)) ; itype = str( type( p_s[0] ) ) ; added_info = False - if 'list' in itype or 'tuple' in itype : - added_info = True - ps = [ p[0] for p in p_s ] - else : - ps = p_s - frp_ = rankdata( ps,method='ordinal' )/m - ifrp_ = [ ( (p<=f)*f + p*(p>f) ) for p,f in zip(ps,frp_) ] - for ip in range(len(ps)) : - p_ = ps[ ip ] ; f_ = frp_[ip] - q_ = pi0 * p_ / ifrp_[ip] - qs_.append( (q_,p_) ) - if added_info : - q = [ tuple( [qvs[0]]+list(pinf) ) for ( qvs,pinf ) in zip(qs_,p_s) ] - qs_ = q - return qs_ - - -from scipy import stats -from statsmodels.stats.anova import anova_lm as anova -import statsmodels.api as sm -import patsy -def anova_test ( formula, group_expression_df, journal_df, test_type = 'random' ) : - type_d = { 'paired':1 , 'random':2 , 'fixed':1 } - formula = formula.replace(' ','') - tmp_df = pd.concat([ journal_df, group_expression_df ]) - gname = tmp_df.index.tolist()[-1] - formula_l = formula.split('~') - rename = { gname:formula_l[0] } - tmp_df.rename( index=rename, inplace=True ) - tdf = tmp_df.T.iloc[ :,[ col in formula for col in tmp_df.T.columns] ].apply( pd.to_numeric ) - y, X = patsy.dmatrices( formula, tdf, return_type='dataframe') - model = sm.OLS(endog=y,exog=X).fit() - model .model.data.design_info = X.design_info - table = sm.stats.anova_lm(model,typ=type_d[test_type]) - return table.iloc[ [(idx in formula) for idx in table.index],-1] - - -def glm_test ( formula , df , jdf , distribution='Gaussian' ) : - tmp_df = pd.concat([ jdf, df ]) - family_description = """ - Family(link, variance) # The parent class for one-parameter exponential families. - Binomial([link]) # Binomial exponential family distribution. - Gamma([link]) # Gamma exponential family distribution. - Gaussian([link]) # Gaussian exponential family distribution. - InverseGaussian([link]) # InverseGaussian exponential family. - NegativeBinomial([link, alpha]) # Negative Binomial exponential family. - Poisson([link]) # Poisson exponential family. - Tweedie([link, var_power, eql]) # Tweedie family. 
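`adjust_p` above is a thin wrapper around `statsmodels`' `multipletests`, optionally cross-checking `fdr_bh` against R's `p.adjust` through `rpy2`. The direct statsmodels call looks like this (example p values only):

```python
from statsmodels.stats.multitest import multipletests

pvals = [0.00001, 0.01, 0.0002, 0.00005, 0.03, 0.1, 0.2, 0.4, 0.5, 0.9]
reject, p_adj, _, _ = multipletests(pvals, alpha=0.05, method='fdr_bh')
print(list(zip(pvals, p_adj.round(4), reject)))   # raw p, BH-adjusted p, reject flag
```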
- """ - if distribution == 'Gaussian' : - family = sm.families.Gaussian() - if distribution == 'Binomial' : - family = sm.families.Binomial() - if distribution == 'Gamma' : - family = sm.families.Gamma() - if distribution == 'InverseGaussian' : - family = sm.families.InverseGaussian() - if distribution == 'NegativeBinomial' : - family = sm.families.NegativeBinomial() - if distribution == 'Poisson' : - family = sm.families.Poisson() - - formula = formula.replace( ' ','' ) - gname = tmp_df.index.tolist()[-1] - formula_l = formula.split('~') - rename = { gname:formula_l[0] } - tmp_df .rename( index=rename, inplace=True ) - - tdf = tmp_df.T.iloc[ :,[ col in formula for col in tmp_df.T.columns] ].apply( pd.to_numeric ) - y , X = patsy.dmatrices( formula, tdf, return_type='dataframe') - distribution_model = sm.GLM( y, X, family=family ) - glm_results = distribution_model.fit() - if False: - print('Parameters: ', glm_results.params ) - print('T-values : ', glm_results.tvalues ) - print('p-values : ', glm_results.pvalues ) - table = glm_results.pvalues - - return table.iloc[ [( idx.split('[')[0] in formula) for idx in table.index]] - -def t_test ( df , endogen = 'expression' , group = 'disease' , - pair_values = ('Sick','Healthy') , test_type = 'independent', - equal_var = False , alternative = 'greater' , - bDeprecated = False ) : - - if bDeprecated : - print ( 'WILL BE REMOVED IN FUTURE VERSIONS' ) - group1 = df[df[group] == pair_values[0]][endogen].astype(float) - group2 = df[df[group] == pair_values[1]][endogen].astype(float) - else : - group1 = df.iloc[:,[n in pair_values[0] for n in df.loc[group,:].values] ].loc[endogen,:].astype(float) - group2 = df.iloc[:,[n in pair_values[1] for n in df.loc[group,:].values] ].loc[endogen,:].astype(float) - - if test_type == 'independent' : - pv = ttest_ind ( group1, group2 , equal_var = equal_var ) - if test_type == 'related' : - pv = ttest_rel ( group1, group2 ) - try : - p_mannu = mannwhitneyu( group1, group2, alternative=alternative )[1] - except ValueError as err: - print(err.args) - p_mannu = 1.0 - pvalue = pv[1] ; statistic=pv[0] - return ( pvalue , p_mannu, statistic ) - -def mycov( x , full_matrices=0 ): - x = x - x.mean( axis=0 ) - U, s, V = np.linalg.svd( x , full_matrices = full_matrices ) - C = np.dot(np.dot(V.T,np.diag(s**2)),V) - return C / (x.shape[0]-1) - -from scipy.special import chdtrc as chi2_cdf -def p_value_merger ( pvalues_df , p_label=',p' , axis = 0 ) : - # - print( " REQUIRED READING: doi: 10.1093/bioinformatics/btw438" ) - print( " ALSO MAKE SURE TO ADD THAT ARTICLE AS ADDITIONAL CITATION" ) - print( " IF THIS METHOD IS EMPLOYED" ) - # - pdf_ = pvalues_df.loc[:,[c for c in pvalues_df.columns.values if p_label in c]] - psi_df = pdf_.apply( lambda x:-2.0*np.log10(x) ) - if axis == 1 : - pdf_ = pdf .T.copy( ) ; psi_df = psi_df.T.copy( ) - - covar_matrix = mycov(psi_df.values) - m = int(covar_matrix.shape[0]) ; K = 2.*m - df_fisher , expectation = K,K - for i in range(m) : - covar_matrix[ i,i ] = 0 - covar_2sum = np.sum( covar_matrix ) - - var = 4.0*m + covar_2sum - c = var / (2.0*expectation) - df_brown = expectation/c - - if df_brown > df_fisher : - df_brown = df_fisher - c = 1.0 - p_values = pvalues_df - - x = 2.0*np.sum ( p_values.apply(lambda X:-np.log10(X)) , 1 ).values - p_brown = chi2_cdf ( df_brown , 1.0*x/c ) - p_fisher = chi2_cdf ( df_fisher, 1.0*x ) - result_df = pd.DataFrame( np.array([p_brown,p_fisher]) , - columns = pvalues_df.index , - index=['Brown,p','Fisher,p'] ).T - return ( result_df ) - -def parse_test ( 
statistical_formula, group_expression_df , journal_df , test_type = 'random' ) : - # - # THE FALLBACK IS A TYPE2 ANOVA - ident = False - if 'glm' in statistical_formula.lower() : - if not test_type in set(['Gaussian','Binomial','Gamma','InverseGaussian','NegativeBinomial','Poisson']): - test_type = 'Gaussian' - print('ONLY GAUSSIAN TESTS ARE SUPPORTED') - print('THIS TEST IS NO LONGER SUPPORTED') - result = glm_test( statistical_formula, group_expression_df , journal_df , distribution = test_type ) - ident = True - - if 'ttest' in statistical_formula.lower() : - ident = True ; result = None - # - # WE CONDUCT SEPARATE TESTS FOR ALL THE UNIQUE PAIR LABELS PRESENT - check = [ idx for idx in journal_df.index if idx in statistical_formula ] - df = pd.concat( [journal_df,group_expression_df],axis=0 ).T - for c in check : - if test_type in set([ 'related' , 'fixed' , 'paired' ]): - test_type = 'related' - else : - test_type = 'independent' - - for pair in permuter( list(set(journal_df.loc[c].values)),2) : - result_ = t_test( df, endogen = df.columns.values[-1], group = c, - pair_values = pair, test_type = test_type, equal_var = False ) - hdr = ' '.join( [c,' '.join([str(p) for p in pair])] ) - tdf = pd.Series( result_, index = [ hdr, hdr+' mwu', hdr+' stat,s' ] ) - if result is None : - result = tdf - else : - result = pd.concat([result,tdf]) - result.name = 'PR>t' - - if not ident : - result = anova_test( statistical_formula, group_expression_df , journal_df , test_type=test_type ) - - return ( result ) - -def prune_journal ( journal_df , remove_units_on = '_' ) : - journal_df = journal_df.loc[ [ 'label' in idx.lower() or '[' in idx for idx in journal_df.index.values] , : ].copy() - bSel = [ ('label' in idx.lower() ) for idx in journal_df.index.values] - bool_dict = { False:0 , True:1 , 'False':0 , 'True':1 } - str_journal = journal_df.iloc[ bSel ] - journal_df = journal_df.replace({'ND':np.nan}) - nmr_journal = journal_df.iloc[ [ not b for b in bSel ] ].replace(bool_dict).apply( pd.to_numeric ) - if not remove_units_on is None : - nmr_journal.index = [ idx.split(remove_units_on)[0] for idx in nmr_journal.index ] - journal_df = pd.concat( [nmr_journal,str_journal] ) - return( journal_df ) - -def merge_significance ( significance_df , distance_type='euclidean' ) : - # TAKES P VALUES OR Q VALUES - # TRANSFORMS INTO A MERGED P OR Q VALUE VIA - # THE DISTANCE SCORE - # THE DATA DOMAIN SIGNIFICANCE IS ALONG COLUMNS AND - # GROUPS ALONG INDICES - # EX: pd.DataFrame( np.random.rand(20).reshape(5,4) , columns=['bio','cars','oil','money']).apply( lambda x: -1.*np.log10(x) ).T.apply( lambda x: np.sqrt(np.sum(x**2)) ) - # - distance = lambda x : np.sqrt(np.sum(x**2)) - if distance_type == 'euclidean' : # ESTIMATE - distance = lambda x : np.sqrt(np.sum(x**2)) - if distance_type == 'extreme' : # ANTI-CONSERVATIVE ESTIMATE - distance = lambda x : np.max(x) - if distance_type == 'mean' : # MEAN ESTIMATE - distance = lambda x : np.mean(x) - get_pvalue = lambda x : 10**(-x) - return ( significance_df.apply( lambda x: -1.*np.log10(x) ).T.apply(distance).apply(get_pvalue) ) - -def group_significance( subset , all_analytes_df = None , - tolerance = 0.05 , significance_name = 'pVal' , - AllAnalytes = None , SigAnalytes = None, - alternative = 'two-sided' ) : - # FISHER ODDS RATIO CHECK - # CHECK FOR ALTERNATIVE : - # 'greater' ( ENRICHMENT IN GROUP ) - # 'two-sided' ( DIFFERENTIAL GROUP EXPERSSION ) - # 'less' ( DEPLETION IN GROUP ) - if AllAnalytes is None : - if all_analytes_df is None : - AllAnalytes = set( 
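`group_significance` above scores a group of analytes with Fisher's exact test on the 2×2 table of (significant vs not) × (in group vs not); since part of its body is truncated in this hunk, the sketch below only shows one plausible way of building that table with hypothetical sets, not the function's exact bookkeeping:

```python
from scipy.stats import fisher_exact

AllAnalytes = set(f'g{i}' for i in range(1000))    # background
SigAnalytes = set(f'g{i}' for i in range(60))      # significant overall
group       = set(f'g{i}' for i in range(40, 90))  # analytes in the group

a = len(group & SigAnalytes)                       # significant, in group
b = len(group - SigAnalytes)                       # not significant, in group
c = len(SigAnalytes - group)                       # significant, outside group
d = len(AllAnalytes - SigAnalytes - group)         # neither
odds, p = fisher_exact([[a, b], [c, d]], alternative='greater')   # enrichment
print(odds, p)
```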
all_analytes_df.index.values ) - if SigAnalytes is None : - if all_analytes_df is None : - SigAnalytes = set( all_analytes_df.iloc[(all_analytes_df 0 : - pv , odds = group_significance( group , AllAnalytes=AllAnalytes, SigAnalytes=SigAnalytes , alternative=alternative ) - rdf = pd.DataFrame( [[pv]], columns = [ group_prefix + 'Fisher_' + p_label ] , index = [ gid ] ) - rdf .columns = [ col+',p' if ',p' not in col else col for col in rdf.columns ] - rdf[ 'description' ] = gdesc+',' + str(L_) ; rdf['analytes'] = str_analytes - rdf[ group_prefix + 'NGroupAnalytes' ] = L_ - rdf[ group_prefix + 'AllFracFilling' ] = L_ / float( len(analytes_) ) - present_sig = set(group.index.values)&SigAnalytes - rdf[ group_prefix + 'SigFracGroupFill' ] = float ( len ( present_sig ) ) / float( len(analytes_) ) - ndf = rdf - if eval_df is None : - eval_df = ndf - else : - eval_df = pd.concat( [ eval_df,ndf ] ) - edf = eval_df.T - for col in eval_df.columns : - if ',p' in col : - q = [q_[0] for q_ in qvalues(eval_df.loc[:,col].values)]; l=col.split(',')[0]+',q' - edf.loc[l] = q - return ( edf.T ) - - - -class APCA ( object ) : - # - # THIS CLASS PERFORMS A SPARSE PCA IF REQUESTED - # IT THEN USES THE SPARSE SVD ALGORITHM FOUND IN SCIPY - # THE STANDARD IS TO USE THE NUMPY SVD - # - def __init__ ( self , X = None , k =-1 , fillna = None , transcending = True , not_sparse = True ) : - from scipy.sparse import csc_matrix - from scipy.sparse.linalg import svds - self.svds_ , self.smatrix_ = svds , csc_matrix - self.components_ = None - self.F_ = None - self.U_ , self.S_, self.V_ = None,None,None - self.evr_ = None - self.var_ = None - self.fillna_ = fillna - self.X_ = self.interpret_input(X) - self.k_ = k - self.transcending_ = transcending - self.not_sparse = not_sparse - - def interpret_input ( self,X ) : - if 'pandas' in str(type(X)) : - for idx in X.index : - X.loc[idx] = [ np.nan if 'str' in str(type(v)) else v for v in X.loc[idx].values ] - if 'float' in str(type(self.fillna_)) or 'int' in str(type(self.fillna_)) : - X = X.fillna(self.fillna_) - self.X_ = X.values - else : - self.X_ = X - return ( self.X_ ) - - def fit ( self , X=None ) : - self.fit_transform( X=X ) - - def fit_transform ( self , X=None ) : - if X is None: - X = self.X_ - if not X is None : - X = self.interpret_input(X) - Xc = X - np.mean( X , 0 ) - if self.k_<=0 : - k_ = np.min( np.shape(Xc) ) - 1 - else: - k_ = self.k_ - - if self.not_sparse : - u, s, v = np.linalg.svd( Xc , full_matrices = False ) - self.transcending_ = False - else : - u, s, v = self.svds_ ( self.smatrix_(Xc, dtype=float) , k=k_ ) - - if self.transcending_ : - u, s, v = self.transcending_order(u,s,v) - S = np.diag( s ) - self.F_ = np.dot(u,S) - self.var_ = s ** 2 / Xc.shape[0] - self.explained_variance_ratio_ = self.var_/self.var_.sum() - self.U_ , self.S_ , self.V_ = u,s,v - self.components_ = self.V_ - return ( self.F_ ) - - def transcending_order(self,u,s,v) : - return ( u[:,::-1],s[::-1],v[::-1,:] ) - - def apply_matrix( self , R ) : - self.U_ = np.dot( self.U_,R.T ) - self.V_ = np.dot( self.V_.T,R.T ).T - self.F_ = np.dot( self.F_,R.T ) - self.components_ = self.V_ - return ( self.F_ ) - - -dimred = PCA() - -def quantify_groups ( analyte_df , journal_df , formula , grouping_file , synonyms = None , - delimiter = '\t' , test_type = 'random' , - split_id = None , skip_line_char = '#' - ) : - statistical_formula = formula - if not split_id is None : - nidx = [ idx.split(split_id)[-1].replace(' ','') for idx in analyte_df.index.values ] - analyte_df.index = nidx - sidx = 
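The `APCA` class above wraps either a dense numpy SVD or scipy's sparse `svds` on the column-centred matrix, reversing the component order in the sparse case because `svds` returns singular values in ascending order. A stripped-down sketch of that factorisation on a random stand-in matrix:

```python
import numpy as np
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds

X  = np.random.rand(20, 8)
Xc = X - X.mean(axis=0)                      # column-centred, as in APCA.fit_transform

u, s, vt = svds(csc_matrix(Xc, dtype=float), k=3)
u, s, vt = u[:, ::-1], s[::-1], vt[::-1, :]  # descending order ("transcending_order")

F   = u @ np.diag(s)                         # factor scores
var = s**2 / Xc.shape[0]
evr = var / var.sum()                        # explained-variance ratio of the kept k
print(F.shape, evr)
```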
set( analyte_df.index.values ) ; nidx=len(sidx) - eval_df = None - with open ( grouping_file ) as input: - for line in input: - if line[0] == skip_line_char : - continue - vline = line.replace('\n','').split(delimiter) - gid,gdesc,analytes_ = vline[0],vline[1],vline[2:] - if not synonyms is None : - [ analytes_.append(synonyms[a]) for a in analytes_ if a in synonyms ] - try : - group = analyte_df.loc[[a for a in analytes_ if a in sidx] ].dropna( axis=0, how='any', thresh=analyte_df.shape[1]/2 ).drop_duplicates() - except KeyError as e : - continue - L_ = len( group ) ; str_analytes=','.join(group.index.values) - if L_>0 : - dimred.fit(group.values) - group_expression_df = pd.DataFrame([dimred.components_[0]],columns=analyte_df.columns.values,index=[gid]) - rdf = pd.DataFrame( parse_test( statistical_formula, group_expression_df , journal_df , test_type=test_type )).T - rdf .columns = [ col+',p' if (not ',s' in col) else col+',s' for col in rdf.columns ] - rdf['description'] = gdesc+','+str(L_) - rdf['analytes'] = str_analytes - rdf.index = [ gid ] ; ndf = pd.concat([rdf.T,group_expression_df.T]).T - if eval_df is None : - eval_df = ndf - else : - eval_df = pd.concat([eval_df,ndf]) - edf = eval_df.T - for col in eval_df.columns : - if ',p' in col : - q = [q_[0] for q_ in qvalues(eval_df.loc[:,col].values)]; l=col.split(',')[0]+',q' - edf.loc[l] = q - return ( edf.T ) - -from scipy.stats import combine_pvalues - -def quantify_by_dictionary ( analyte_df , journal_df , formula , split_id=None, - grouping_dictionary = dict() , synonyms = None , - delimiter = ':' ,test_type = 'random', tolerance = 0.05, - supress_q = False , analyte_formula = None, - use_loc_pca=False , k=-1 ) : - - if use_loc_pca : - dimred = APCA(X=analyte_df,k=k) - - if not 'dict' in str(type(grouping_dictionary)) : - print ( 'INVALID GROUPING' ) - return - statistical_formula = formula - if not split_id is None : - nidx = [ idx.split(split_id)[-1].replace(' ','') for idx in analyte_df.index.values ] - analyte_df.index = nidx - sidx = set( analyte_df.index.values ) ; nidx = len(sidx) - eval_df = None - if True : - for line in grouping_dictionary.items() : - gid,analytes_ = line[0],line[1:][0] - gdesc = line[0].split(delimiter)[0] - if not synonyms is None : - [ analytes_.append(synonyms[a]) for a in analytes_ if a in synonyms ] - try : - group = analyte_df.loc[[a for a in analytes_ if a in sidx] ].dropna( axis=0, how='any', thresh=analyte_df.shape[1]/2 ).drop_duplicates() - except KeyError as e : - continue - L_ = len( group ) ; str_analytes=','.join(group.index.values) - if L_>0 : - dimred .fit( group.values ) - ddf = None - for ic in range(len( dimred.components_ )) : - group_expression_df = pd.DataFrame([dimred.components_[ic]],columns=analyte_df.columns.values,index=[gid]) - rdf = pd.DataFrame( parse_test( statistical_formula, group_expression_df , journal_df , test_type=test_type )).T - rdf .columns = [ col+',p' if (not ',s' in col) else col+',s' for col in rdf.columns ] - if ddf is None: - ddf = rdf - else: - ddf = pd.concat([ddf,rdf]) - rdf = pd.DataFrame([ [ combine_pvalues( ddf[c].values )[1] for c in ddf.columns if ',p' in c ] ] , columns=ddf.columns ) - rdf [ 'description' ] = gdesc + ',' + str( L_ ) - rdf [ 'analytes' ] = str_analytes - rdf .index = [ gid ] - if not analyte_formula is None : - group_analytes_pos_neg_ind_d = dict() - qad = quantify_analytes( group , journal_df , analyte_formula , bRegular=False ) - loc_q = qad .loc[ :,[c for c in qad.columns.values if not 'mwu' in c and ',p' in c ] ] - metrics = [ 
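`quantify_groups` above walks a tab-separated `.gmt`-style grouping file, keeps the member analytes present in `analyte_df`, summarises each group by its first principal component, and sends that group expression through `parse_test`. The per-group preparation can be sketched as follows (hypothetical file line and frame):

```python
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

analyte_df = pd.DataFrame(np.random.rand(5, 6),
                          index=['A', 'B', 'C', 'D', 'E'],
                          columns=[f's{i}' for i in range(6)])

line  = "GROUP:0001\tToy pathway\tA\tC\tE\tZZZ"       # gid, description, members...
vline = line.rstrip('\n').split('\t')
gid, gdesc, analytes_ = vline[0], vline[1], vline[2:]

group  = analyte_df.loc[[a for a in analytes_ if a in set(analyte_df.index)]]
dimred = PCA()
dimred.fit(group.values)                               # first component = group "eigengene"
group_expression = pd.DataFrame([dimred.components_[0]],
                                columns=analyte_df.columns, index=[gid])
print(group_expression)
```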
c.split(',')[0] for c in loc_q.columns] - for metric in metrics: - statistic = qad.loc[ :, [c for c in qad.columns if metric in c and ',s' in c] ] - group_analytes_pos_neg_ind_d[ metric + ',N_positive' ] = np.sum ( - [ 1 if p0 else 0 for (p,s) in zip(loc_q.loc[:,[metric+',p']].values,statistic.values) ] - ) - group_analytes_pos_neg_ind_d[ metric + ',N_negative' ] = np.sum ( - [ 1 if ptolerance else 0 for (p,s) in zip(loc_q.loc[:,[metric+',p']].values,statistic.values) ] - ) - group_analytes_pos_neg_ind_d[ metric + ',N_tot' ] = len(statistic) - - loc_a_df = pd.DataFrame(group_analytes_pos_neg_ind_d.items(),columns=['name',gid] ) - loc_a_df.index = loc_a_df['name']; del loc_a_df['name'] - rdf = pd.concat([rdf.T,loc_a_df ]).T - if False : # SUPRESS GROUP EXPRESSION - ndf = pd.concat([rdf.T,group_expression_df.T]).T - else : - ndf = rdf - if eval_df is None : - eval_df = ndf - else : - eval_df = pd.concat([eval_df,ndf]) - edf = eval_df.T - if not supress_q : - for col in eval_df.columns : - if ',p' in col : - q = [q_[0] for q_ in qvalues(eval_df.loc[:,col].values)]; l=col.split(',')[0]+',q' - edf.loc[l] = q - return ( edf.T ) - - - -def quantify_analytes( analyte_df , journal_df , formula , - delimiter = '\t' , test_type = 'random', - verbose = True , only_include = None , - bRegular = True ) : - statistical_formula = formula - sidx = set(analyte_df.index.values) ; nidx=len(sidx) - eval_df = None ; N_ = len(analyte_df) - for iline in range ( len( analyte_df ) ) : - group = analyte_df.iloc[ [iline],: ] - if 'str' in str(type(only_include)) : - if not only_include in group.index : - continue - L_ = len ( group ) ; str_analytes = '.'.join( group.index.values ) - if L_>0 : - gid = group.index.values[0].split('.')[-1].replace(' ','') ; gdesc = group.index.values[0].split('.')[0] - group_expression_df = pd.DataFrame([group.values[0]], columns=analyte_df.columns.values, index=[gid] ) - rdf = pd.DataFrame(parse_test( statistical_formula, group_expression_df, journal_df, test_type=test_type )).T - rdf .columns = [ col+',p' if (not ',s' in col) else col+',s' for col in rdf.columns ] - if bRegular : - rdf['description'] = gdesc+','+str(L_) - rdf['analytes'] = str_analytes - rdf .index = [ gid ] ; ndf = rdf - if bRegular : - ndf = pd.concat([rdf.T,group_expression_df.T]).T - if eval_df is None : - eval_df = ndf - else : - eval_df = pd.concat([eval_df,ndf]) - if verbose : - print ( 'Done:', str(np.floor(float(iline)/N_*1000.)/10.)+'%' , end="\r") - edf = eval_df.T - if not bRegular : - return ( edf.T ) - for col in eval_df.columns : - if ',p' in col : - q = [q_[0] for q_ in qvalues(eval_df.loc[:,col].values)]; l=col.split(',')[0]+',q' - edf.loc[l] = q - return ( edf.T ) - -def group_counts( analyte_df, grouping_file, delimiter = '\t', - tolerance = 0.05 , p_label = 'C(Status),p' , - group_prefix = '' - ) : - AllAnalytes = list( analyte_df.index.values ) - AllAnalytes = set( AllAnalytes ) ; nidx = len( AllAnalytes ) - SigAnalytes = set( analyte_df.iloc[ (analyte_df.loc[:,p_label].values < tolerance), : ].index.values ) - eval_df = None - with open( grouping_file ) as input : - for line in input : - vline = line.replace('\n','').split(delimiter) - gid, gdesc, analytes_ = vline[0], vline[1], vline[2:] - useSigAnalytes = [ a for a in analytes_ if a in SigAnalytes ] - try : - group = analyte_df.loc[ useSigAnalytes ].dropna( axis=0, how='any', thresh=analyte_df.shape[1]/2 ).drop_duplicates() - except KeyError as e : - continue - L_ = len( group ) ; str_analytes=','.join(group.index.values) - if L_ > 0 : - rdf = 
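`group_counts` above only tallies, per grouping-file entry, how many analytes already flagged as significant (at `tolerance` on `p_label`) fall inside the group, together with the corresponding fraction. A toy equivalent with a hypothetical results frame:

```python
import pandas as pd

results = pd.DataFrame({'C(Status),p': [0.001, 0.2, 0.04, 0.5, 0.01]},
                       index=['A', 'B', 'C', 'D', 'E'])
tolerance   = 0.05
SigAnalytes = set(results.index[results['C(Status),p'] < tolerance])

group_members = ['A', 'B', 'C', 'X']                   # one grouping-file line
n_sig = len([a for a in group_members if a in SigAnalytes])
print(n_sig, n_sig / float(len(group_members)))        # NsigFrom / FsigFrom
```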
pd.DataFrame( [[L_,L_/float(len(analytes_))]], columns = [ - group_prefix + 'NsigFrom' + p_label , - group_prefix + 'FsigFrom' + p_label ] , index = [ gid ] ) - ndf = rdf - if eval_df is None : - eval_df = ndf - else : - eval_df = pd.concat ( [eval_df,ndf] ) - return ( eval_df.sort_values( group_prefix + 'FsigFrom' + p_label ) ) - - -def retrieve_genes_of ( group_name, grouping_file, delimiter='\t', identifier='ENSG', skip_line_char='#' ): - all_analytes = [] - with open ( grouping_file ) as input: - for line_ in input : - if skip_line_char == line_[0] : - continue - if group_name is None : - line = line_.replace( '\n','' ) - dline = line.split( delimiter ) - if identifier is None : - [ all_analytes.append(d) for d in dline ] - else : - [ all_analytes.append(d) for d in dline if identifier in d ] - else: - if group_name in line_: - line=line_.replace('\n','') - dline = line.split(delimiter) - if identifier is None : - return ( [ d for d in dline ] ) - else: - return ( [ d for d in dline if identifier in d ] ) - return ( list(set(all_analytes)) ) - -import math -def differential_analytes ( analyte_df , cols = [['a'],['b']] ): - adf = analyte_df.loc[ :,cols[0] ].copy().apply(pd.to_numeric) - bdf = analyte_df.loc[ :,cols[1] ].copy().apply(pd.to_numeric) - regd_l = ( adf.values - bdf.values ) - regd_r = -regd_l - ddf = pd.DataFrame( np.array( [ regd_l.reshape(-1) , regd_r.reshape(-1) , regd_l.reshape(-1)**2 ] ).T , - columns=['DiffL','DiffR','Dist'] , index=adf.index ) - for col in adf.columns : - ddf.loc[:,col] = adf.loc[:,col] - for col in bdf.columns : - ddf.loc[:,col] = bdf.loc[:,col] - ddf = ddf.sort_values('Dist', ascending = False ) - return ( ddf ) - - -def add_kendalltau( analyte_results_df , journal_df , what='M' , sample_names = None, ForceSpearman=False ) : - # ADD IN CONCORDANCE WITH KENDALL TAU - if what in set(journal_df.index.values) : - from scipy.stats import kendalltau,spearmanr - concordance = lambda x,y : kendalltau( x,y ) - if ForceSpearman : - concordance = lambda x,y : spearmanr( x,y ) - K = [] - if sample_names is not None : - patients = sample_names - else : - patients = [ c for c in analyte_results_df.columns if '_' in c ] - patients = [ p for p in sample_names if p in set(patients) & - set( analyte_results_df.columns.values) & - set( journal_df.loc[[what],:].T.dropna().T.columns.values ) ] - for idx in analyte_results_df.index : - y = journal_df.loc[ what,patients ].values - x = analyte_results_df.loc[ [idx],patients ].values[0] # IF DUPLICATE GET FIRST - k = concordance( x,y ) - K .append( k ) - analyte_results_df['KendallTau'] = K - return ( analyte_results_df ) - - -if __name__ == '__main__' : - - test_type = 'random' - - path_ = './' - analyte_file = path_ + 'fine.txt' - journal_file = path_ + 'coarse.txt' - grouping_file = path_ + 'groups.gmt' - - analyte_df = pd.read_csv(analyte_file,'\t' , index_col=0 ) - journal_df = prune_journal( pd.read_csv(journal_file,'\t', index_col=0 ) ) - - print ( quantify_groups( analyte_df, journal_df, 'Group ~ Var + C(Cat) ', grouping_file ) ) diff --git a/build/lib/impetuous/reducer.py b/build/lib/impetuous/reducer.py deleted file mode 100644 index 240693d05d9e741553ee100deb77aad3285213d9..0000000000000000000000000000000000000000 --- a/build/lib/impetuous/reducer.py +++ /dev/null @@ -1,211 +0,0 @@ -""" -Copyright 2020 RICHARD TJÖRNHAMMAR - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
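`add_kendalltau` above appends a concordance value between each analyte's row and one journal variable, using Kendall's tau (or Spearman's rho when `ForceSpearman` is set). The core calls on toy vectors:

```python
import numpy as np
from scipy.stats import kendalltau, spearmanr

y = np.array([1.0, 2.0, 2.5, 3.0, 4.5])   # journal variable across patients
x = np.array([0.2, 0.1, 0.4, 0.5, 0.9])   # one analyte's values for the same patients

tau, p_tau = kendalltau(x, y)
rho, p_rho = spearmanr(x, y)
print(f'kendall tau={tau:.3f} (p={p_tau:.3f}), spearman rho={rho:.3f} (p={p_rho:.3f})')
```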
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -# -# MOVED OVER FROM RANKOR (reducer.py) AS OF COMMIT : -# https://github.com/richardtjornhammar/rankor/commit/f04a83091a92130b9b73439b4ad65b5be3056cf9 -# -import pandas as pd -import numpy as np -from scipy.stats import rankdata -from impetuous.quantification import qvalues, permuter -# -# REGULAR CONTRAST -contrast = lambda A,B : ( A-B )/( A+B ) -# -# OTHER -e_flatness = lambda x : np.exp(np.mean(np.log(x),0))/np.mean(x,0) -e_contrast = lambda x : 1 - e_flatness(x) - -pi0 = lambda pvs : 1. - -def padded_rolling_average( tv , tau ) : - # AS OF THIS PANDAS VERSION ( 1.1.0 ) - # WINDOW CALCULATION WAS NOT PADDED SO - # THIS IS A NUMPY ONLY IMPLEMENTATION - if tau==1 : - return ( tv ) - if len(tv)0)*((i-w)%N),int(i+w=N)*(N-1)] - idx = [ centered( jid(i,w,N) ) for i in range(N) ] - mvalues = [ np.mean(tv[i[0]:i[1]]) for i in idx ] - return ( mvalues ) - -def svd_reduced_mean ( x,axis=0,keep=[0] ) : - if True : - sk = set ( keep ) - if len ( np.shape(x) ) > 1 : - u , s , vt = np .linalg .svd( x , full_matrices=False ) - xred = np.mean( np.dot(u*[s[i_] if i_ in sk else 0 for i_ in range(len(s))],vt) , axis) - if 'pandas' in str(type(x)) : - if not 'series' in str(type(x)) : - xname = x.index.values[0] - return ( pd.DataFrame( [xred] , index=[xname] , columns=x.columns ) ) - else : - xname = x.name - return ( pd.Series( xred , name=xname , index=x.columns ) ) - else : - return ( xred ) - return ( x ) - -from sklearn.decomposition import PCA -dimred = PCA ( n_components = 1 ) -def pca_reduced_mean( x ) : - if True : - if len ( np.shape(x) ) > 1 : - Xnew = dimred.fit_transform( x.T ) - xred = Xnew . T [0] + np.mean(np.mean(x)) - if 'pandas' in str(type(x)) : - if not 'series' in str(type(x)) : - xname = x.index.values[0] - return ( pd.DataFrame( [xred] , index=[xname] , columns=x.columns ) ) - else : - xname = x.name - return ( pd.Series( xred , name=xname , index=x.columns ) ) - return ( x ) - -def reduction ( a , power , centered=-1 ) : - if centered>0 : - a = ( a.T-np.mean(a,1) ).T - return( np.linalg.svd ( a**power , full_matrices=False ) ) - -def hyper_params ( df_ , label = 'generic' , sep = ',' , power=1., centered=-1 ): - # - idx_ = df_.index.values - N_s = len ( df_.columns ) - u,s,vt = reduction( df_.values , power , centered=centered ) - rdf_ = pd.Series ( np.sum(u**2,1) , index=idx_ , name = label+sep+"u" ) - rdf_ = pd.concat ( [ pd.DataFrame(rdf_) , - pd.DataFrame( pd.Series( np.mean( df_.values,1 ) , - index = idx_ , name=label+sep+"m") ) ] , axis=1 ) - w_ = rdf_ .loc[ :,label+sep+"u" ].values - r_ = rankdata ( [ w for w in w_ ] ,'average' ) - N = len ( r_ ) - # - df0_ = pd.DataFrame( [ [a for a in range(N)],w_,r_ ],index=['idx','w_','r_'], columns=idx_ ).T - # - from scipy.special import erf as erf_ - loc_pval = lambda X , mean , stdev : [ 1. - 0.5*( 1. + erf_( ( x - mean )/stdev/np.sqrt(2.) ) ) for x in X ] - lvs = np.log( df0_.loc[ :,'w_'].values ) - # - return ( np.mean(lvs) , np.std(lvs) ) - - -def hyper_rdf ( df_ , label = 'generic' , sep = ',' , power=1. 
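`svd_reduced_mean` above replaces a small matrix by the mean over a rank-reduced reconstruction that keeps only the singular components listed in `keep`. The numerical core on a random matrix:

```python
import numpy as np

x    = np.random.rand(2, 10)
keep = {0}                                          # keep only the leading component

u, s, vt = np.linalg.svd(x, full_matrices=False)
s_masked = [s[i] if i in keep else 0.0 for i in range(len(s))]
xred = np.mean(np.dot(u * s_masked, vt), axis=0)    # mean over rows of the rank-1 reconstruction
print(xred.shape)                                    # (10,)
```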
, - diagnostic_output = False , MEAN=None , STD=None , centered=-1 ) : - # - idx_= df_.index.values - N_s = len ( df_.columns ) - u,s,vt = reduction ( df_.values , power , centered=centered ) - rdf_ = pd.Series ( np.sum( ( u )**2,1 ) , index=idx_ , name = label+sep+"u" ) - rdf_ = pd.concat ( [ pd.DataFrame( rdf_ ) , - pd.DataFrame( pd.Series( np.mean( df_.values,1 ) , - index = idx_ , name=label+sep+"m") ) ] , axis=1 ) - w_ = rdf_ .loc[ :,label+sep+"u" ].values - r_ = rankdata ( [ w for w in w_] ,'average' ) - N = len ( r_ ) - # - # HERE WE CONSTRUCT A DIANGOSTICS AND INTERMITTENT CALCULATION - # DATAFRAME FOR EASIER DEBUGGING AND INSPECTION - df0_ = pd.DataFrame( [ [a for a in range(N)],w_,r_ ],index=['idx','w_','r_'], columns=idx_ ) - df0_ = df0_.T.sort_values ( by='r_' ) - df0_ .loc[ : , 'd_' ] = [ v for v in ( df0_.loc[:, 'w_' ] * df0_.loc[:,'r_'] ) ] - df0_ .loc[ : , 'da_' ] = np.cumsum ( df0_ .loc[ : , 'd_' ].values ) - # - # HOW MANY EQUALLY OR MORE EXTREME VALUES ARE THERE? ( RANK BASED ) - df0_ .loc[ : , 'dt_' ] = np.cumsum ( df0_ .loc[ : , 'd_' ].values[::-1] )[::-1] - df0_ .loc[ : , 'rank_ps' ] = df0_ .loc[ :,'dt_' ] / np.sum( df0_ .loc[ :,'d_' ] ) - # - # DRAW HISTOGRAM TO SEE DISTRIBUTION OF THE DISTANCES - from scipy.special import erf as erf_ - loc_pval = lambda X , mean , stdev : [ 1. - 0.5*( 1. + erf_( ( x - mean )/stdev/np.sqrt(2.) ) ) for x in X ] - lvs = np.log( df0_.loc[ :,'w_'].values ) - if MEAN is None or STD is None: - if len(lvs)>3 : - ps = loc_pval( lvs , np.mean( lvs ) , np.std( lvs ) ) - else : - ps = df0_ .loc[ : , 'rank_ps' ] - else : - ps = loc_pval( lvs , MEAN , STD ) - # - if diagnostic_output : - import scipy.stats as scs - NB = 100 - lv = np.log( rdf_.loc[:,[label+sep+"u"]].values ) - y , x = np.histogram( lv , bins=NB , density=True ) - skw = scs.skew(lv)[0] - kur = scs.kurtosis(lv)[0] - shape_stats = "kurtosis: " + "{:.2f} ".format( kur ) + "skewness: "+ "{:.2f}".format( skw ) - locd = lambda x,M,s : (1./s/np.sqrt(2.*np.pi))*np.exp(-0.5*((x-M)/s)**2 ) - lin_x = 0.5 * ( x[1:] + x[:-1] ) - his_y = y - rsy = sorted( [ (y_,x_) for (y_,x_) in zip(y,lin_x) ] )[::-1] - hm , hs = np.mean([rsy[i][1] for i in range(5)]) , np.mean([rsy[i][0] for i in range(5)]) - h_mod_y = locd( lin_x , hm , 1.0/(hs*np.sqrt(2.*np.pi)) ) - d_mod_y = locd( lv , np.mean(lv) , np.std(lv) ) - rem_y = [ (y_-m_) for (y_,m_) in zip(y, locd(0.5*(x[1:]+x[:-1]),np.mean(lv),np.std(lv))) ] - prc_y = [ 100.*np.abs( contrast(y_,m_) ) for (y_,m_) in zip(y, locd(0.5*(x[1:]+x[:-1]),np.mean(lv),np.std(lv))) ] - RMSD = np.sqrt(np.sum([ ry**2 for ry in rem_y ])) - PMAE = np.mean(prc_y) - # - df0_ .loc[ : , 'pvalues' ] = ps - # - # ASSIGN THE P VALUES - rdf_.loc[df0_.index.values,label+sep+"p"] = df0_.loc[:,'pvalues'] - rdf_.loc[df0_.index.values,label+sep+"q"] = [ qvs[0] for qvs in qvalues( df0_.loc[:,'pvalues'].values , pi0 = pi0(df0_.loc[:,'pvalues'].values) ) ] - rdf_.loc[df0_.index.values,label+sep+"r"] = df0_ .loc[ : , 'rank_ps' ] - # - # AND RETURN THE RESULTS - if diagnostic_output : - return ( rdf_ , RMSD , PMAE , kur , skw , rem_y ) - else : - return ( rdf_ ) - - -if __name__ == '__main__' : - # - print ( 'REDUCER :: TESTS ' ) - # - a = 2*np.random.rand(10) - b = 4*np.random.rand(10) - X = [ [*(a[:5]+1),*a[5:]],[*(b[:5]+3),*(b[5:])] ] - Xdf = pd.DataFrame( X , columns=['a','b','s','c','d','e','f','g','h','i'] , index=['i0','i1']) - # - print ( Xdf ) - print ( svd_reduced_mean ( X ) ) - print ( svd_reduced_mean ( Xdf ) ) - v0 = [1.9,1.8,2.1,1.1,8.,1.2,2.2,3.5,2.0,2.0,3.1,2.1,2.9] - a2 = 
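`hyper_rdf` above converts the log of each row's squared-`u` weight into an upper-tail p value under a fitted normal model via the error function, falling back on rank-based tail fractions when too few values are available. The tail formula on its own, with stand-in log-weights:

```python
import numpy as np
from scipy.special import erf

def upper_tail_p(values, mean=None, std=None):
    """P(X >= x) under a normal model fitted to (or supplied for) the log-weights."""
    values = np.asarray(values, dtype=float)
    mean = np.mean(values) if mean is None else mean
    std  = np.std(values)  if std  is None else std
    return 1.0 - 0.5 * (1.0 + erf((values - mean) / std / np.sqrt(2.0)))

lvs = np.log(np.random.rand(100) + 1e-9)   # stand-in for log of the SVD weights
print(upper_tail_p(lvs)[:5])
```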
np.array([[2,2,2,1,1,1,2,3,2,2,3,2,3],v0]) - NW = 100 - if False: - for i in range(NW): - for w in range(1,i-1): - dat = np.random.rand(i) - pra = padded_rolling_average( dat,tau=w ) - str_pra = ''.join([ str(p) for p in pra ]) - if 'nan' in str_pra: - print ( pra,len(pra),'[',i,w ,']') - else: - if not len(pra)==len(dat): - print ( len(pra)==len(dat),'[',i,w ,']',len(pra),len(dat) ) - - print ( padded_rolling_average( np.array( svd_reduced_mean( Xdf ).values ).reshape(-1,1) ,tau=4 ) ) - print ( padded_rolling_average( np.array( svd_reduced_mean( Xdf ).values ).reshape(1,-1) ,tau=4 ) ) - print ( padded_rolling_average( np.array( svd_reduced_mean( Xdf ).values ) ,tau=4 ) ) - print ( padded_rolling_average( v0,tau=4 ) ) - print ( v0 ) diff --git a/build/lib/impetuous/spectral.py b/build/lib/impetuous/spectral.py deleted file mode 100644 index a8676ca6b4736b75633c4b614393c41bf5ea89fb..0000000000000000000000000000000000000000 --- a/build/lib/impetuous/spectral.py +++ /dev/null @@ -1,63 +0,0 @@ -""" -Copyright 2020 RICHARD TJÖRNHAMMAR - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import pandas as pd -import numpy as np -from scipy.stats import rankdata - -beta2M = lambda b : math.log2( (b/(1-b)) ) -M2beta = lambda M : 2**M/(2**M+1) -# -# FRACTIONAL RANKS ON [0,1] ARE RECAST TO +- INFINITY -map_df_to_spectral_order_domain = lambda df: \ - df .apply ( lambda x:(rankdata(x,'average')-0.5)/len(x) ) \ - .apply ( lambda B:[ beta2M(b) for b in B] ) - -power = lambda s : np.dot( s.T,np.conj(s) ) -# -# THE SPECTRUM IS TRANSFORMED INTO A POWER SPECTRUM -spectre_to_power = lambda sdf: \ - pd.DataFrame ( \ - power ( sdf.T.apply(np.fft.fft) ) , \ - index = sdf.index , columns = sdf.index \ - ) -# -# THE BELOW METHOD IS HIGHLY EXPERIMENTAL -def transform_to_resonances( spectrum ) : - # assumes spectrum is a square symmetric matrix - f_ls = np.mean( np.abs(spectrum.iloc[0,:].quantile([0.01,0.99])) ) - reso = spectrum .apply(lambda ser: np.real(ser.to_numpy())) \ - .apply(lambda X:[x/f_ls for x in X]) .apply(M2beta) \ - .apply(lambda X:[ 2.*x-1. 
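`beta2M` and `M2beta` above are the usual beta-value ↔ M-value transforms, and `map_df_to_spectral_order_domain` first recasts each column as fractional ranks on (0, 1) and then maps them through `beta2M` so they span the whole real line. Note the deleted module calls `math.log2` without importing `math`; the numpy variant below avoids that:

```python
import numpy as np
from scipy.stats import rankdata

beta2M = lambda b: np.log2(b / (1.0 - b))
M2beta = lambda M: 2.0**M / (2.0**M + 1.0)

x  = np.random.rand(7)
fr = (rankdata(x, 'average') - 0.5) / len(x)   # fractional ranks in (0, 1)
M  = beta2M(fr)                                 # recast to +/- infinity
print(np.allclose(M2beta(M), fr))               # True: the transforms are inverses
```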
for x in X] ) - return ( reso ) -# -# TIME AUTOCORRELATIONS ARE NOT THE SAME THING -# AS PEARSON AUTOCORRELATIONS -def calc_autocorrelation( tv , dt=1 ,bMeanCentered=True ) : - # If you studied statistical mechanics you would - # know about the Wiener Kinchin theorem - if bMeanCentered : - # So for stocks you might want - # to remove the mean - tv-=np.mean(tv) - ftv = np.fft.fft(tv) - rt = np.fft.ifft(ftv*np.conj(ftv)) - rt = rt/rt[0] - rt = rt[:int(np.floor(len(rt)*0.5))] - return( [dt*t for t in range(len(rt))], [np.real(r) for r in rt] ) - -if __name__ == '__main__' : - print ( 'SORRY: NO TESTS HERE' ) - print ( 'DEVELOPMENTAL VERSION') diff --git a/build/lib/impetuous/visualisation.py b/build/lib/impetuous/visualisation.py deleted file mode 100644 index ce946b57d74169d1903c7d6ab6431175b862f80b..0000000000000000000000000000000000000000 --- a/build/lib/impetuous/visualisation.py +++ /dev/null @@ -1,101 +0,0 @@ -""" -Copyright 2020 RICHARD TJÖRNHAMMAR -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import numpy as np -import pandas as pd -import bokeh - -description="""Rudimentary plotting wrappers for impetuous using bokeh""" - -run_in_notebook=""" - from bokeh.plotting import output_notebook, show - output_notebook()""" - -typecheck = lambda x,typ: typ in str(type(x)) -list_typecheck = lambda xl,typ, func: func( [ typecheck(x,typ) for x in xl ] ) - -nice_colors = list( set( [ '#c5c8c6' , '#1d1f21' , '#282a2e' , '#373b41' , '#a54242' , '#cc6666' , - '#8c9440' , '#b5bd68' , '#de935f' , '#f0c674' , '#5f819d' , '#81a2be' , - '#85678f' , '#b294bb' , '#5e8d87' , '#8abeb7' , '#707880' ] ) ) - -make_hex_colors = lambda c : '#%02x%02x%02x' % (c[0]%256,c[1]%256,c[2]%256) - -def bscatter ( X , Y , additional_dictionary=None , title='' , color='#ff0000' , p=None, legend_label = None , alpha=1 , axis_labels = None ) : - from bokeh.plotting import figure, output_file, ColumnDataSource - from bokeh.models import HoverTool, Range1d, Text - from bokeh.models import Arrow, OpenHead, NormalHead, VeeHead, Line - # - if 'str' in str(type(color)): - colors_ = [ color for v in X ] - else : - colors_ = color - - if 'list' in str(type(alpha)): - alphas_ = alpha - else : - alphas_ = [ alpha for v in X ] - - data = { **{'x' : X , 'y' : Y , - 'color': colors_ , - 'alpha': alphas_ } } - ttips = [ ("index " , "$index" ) , - ("(x,y) " , "(@x, @y)" ) ] - - if not additional_dictionary is None : - if 'dict' in str(type(additional_dictionary)): - data = {**data , **additional_dictionary } - for key in additional_dictionary.keys() : - ttips.append( ( str(key) , '@'+str(key) )) - - source = ColumnDataSource ( data = data ) - hover = HoverTool ( tooltips = ttips ) - # - if p is None : - p = figure ( plot_width=600 , plot_height=600 , - tools = [hover,'box_zoom','wheel_zoom','pan','reset','save'], - title = title ) - - if legend_label is None : - p.circle( 'x' , 'y' , size=12, source=source , color='color', alpha='alpha' ) - else : - p.circle( 'x' , 'y' , size=12, source=source , color='color', alpha='alpha' , legend_label=legend_label ) - - 
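`calc_autocorrelation` above takes the Wiener–Khinchin route: FFT the (optionally mean-centred) series, multiply by its conjugate, inverse-FFT, normalise by the zero-lag value and keep the first half. The same steps in isolation on a synthetic signal:

```python
import numpy as np

tv = np.sin(np.linspace(0, 8 * np.pi, 256)) + 0.1 * np.random.randn(256)
tv = tv - tv.mean()                        # mean-centre (as with bMeanCentered=True)

ftv = np.fft.fft(tv)
rt  = np.fft.ifft(ftv * np.conj(ftv))      # autocorrelation via the power spectrum
rt  = np.real(rt / rt[0])                  # normalise to 1 at zero lag
rt  = rt[: len(rt) // 2]                   # keep lags up to half the record length
lags = np.arange(len(rt)) * 1.0            # dt = 1
print(rt[:5])
```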
p.xaxis.axis_label = axis_labels[ 0] if not axis_labels is None else 'x' - p.yaxis.axis_label = axis_labels[-1] if not axis_labels is None else 'y' - p.output_backend = 'webgl' - - return( p ) - - -def plotter ( x = np.random.rand(10) , y = np.random.rand(10) , colors = '#ff0000' , - legends=None, axis_labels = None, bSave = False, name='scatter.html' ): - - from bokeh.plotting import output_file, show, save - - output_file( name ) - outp = lambda x: save if bSave else show - - if list_typecheck([x,y,colors],'list',all) : - p = bscatter( [0] , [0] , color = "#ffffff" , alpha=0 ) - for i in range(len(x)) : - x_ , y_ , color = x[i] , y[i] , colors[i] - if list_typecheck([legends,axis_labels],'list',all): - label = legends[i] - p = bscatter( x_ , y_ , color = color , p = p , legend_label = label , axis_labels=axis_labels ) - else : - p = bscatter( x_ , y_ , color = color , p = p ) - outp ( p ) - else : - p = bscatter( x,y, color=colors ) - outp ( p ) - return( p ) diff --git a/dist/impetuous-gfa-0.36.3.tar.gz b/dist/impetuous-gfa-0.36.3.tar.gz deleted file mode 100644 index 23ce042d6551b98255e31c4a334833529d1bc688..0000000000000000000000000000000000000000 Binary files a/dist/impetuous-gfa-0.36.3.tar.gz and /dev/null differ diff --git a/dist/impetuous_gfa-0.36.3-py3-none-any.whl b/dist/impetuous_gfa-0.36.3-py3-none-any.whl deleted file mode 100644 index 18e6ee5a2f9d633df781be5fe907d61ac7388461..0000000000000000000000000000000000000000 Binary files a/dist/impetuous_gfa-0.36.3-py3-none-any.whl and /dev/null differ diff --git a/impetuous_gfa.egg-info/PKG-INFO b/impetuous_gfa.egg-info/PKG-INFO deleted file mode 100644 index abb2a8ceb1bb08cceaab1066744a6962f590ef61..0000000000000000000000000000000000000000 --- a/impetuous_gfa.egg-info/PKG-INFO +++ /dev/null @@ -1,97 +0,0 @@ -Metadata-Version: 2.1 -Name: impetuous-gfa -Version: 0.36.3 -Summary: Impetuous Quantification, a Statistical Learning library for Humans : Alignments, Clustering, Enrichments and Group Analysis -Home-page: https://github.com/richardtjornhammar/impetuous -Author: Richard Tjörnhammar -Author-email: richard.tjornhammar@gmail.com -License: UNKNOWN -Description: # A Statistical Learning library for Humans - Decomposes a set of expressions into a group expression. The toolkit currently offers enrichment analysis, hierarchical enrichment analysis, PLS regression, Shape alignment or clustering as well as rudimentary factor analysis. - - The expression regulation can be studied via a statistical test that relates it to the observables in the journal file. The final p values are then FDR corrected and the resulting adjusted p values are produced. - - Visit the active code via : - https://github.com/richardtjornhammar/impetuous - - Visit the published code : - https://doi.org/10.5281/zenodo.2594690 - - Cite using : - DOI: 10.5281/zenodo.2594690 - - # Pip installation with : - ``` - pip install impetuous-gfa - ``` - - # Version controlled installation of the Impetuous library - - The Impetuous library - - In order to run these code snippets we recommend that you download the nix package manager. Nix package manager links from Oktober 2020: - - https://nixos.org/download.html - - ``` - $ curl -L https://nixos.org/nix/install | sh - ``` - - If you cannot install it using your Wintendo then please consider installing Windows Subsystem for Linux first: - - ``` - https://docs.microsoft.com/en-us/windows/wsl/install-win10 - ``` - - In order to run the code in this notebook you must enter a sensible working environment. Don't worry! 
We have created one for you. It's version controlled against python3.7 and you can get the file here: - - https://github.com/richardtjornhammar/rixcfgs/blob/master/code/environments/impetuous-shell.nix - - Since you have installed Nix as well as WSL, or use a Linux (NixOS) or bsd like system, you should be able to execute the following command in a termnial: - - ``` - $ nix-shell impetuous-shell.nix - ``` - - Now you should be able to start your jupyter notebook locally: - - ``` - $ jupyter-notebook impetuous_finance.ipynb - ``` - - and that's it. - - # Usage example 1 : elaborate informatics example - - code: https://gitlab.com/stochasticdynamics/eplsmta-experiments - docs: https://arxiv.org/pdf/2001.06544.pdf - - # Usage example 2 : simple code example - - Now while in a good environment: In your Jupyter notebook or just in a dedicated file.py you can write the following: - - ``` - import pandas as pd - import numpy as np - - import impetuous.quantification as impq - - adf = pd.read_csv( 'analytes.csv' , '\t' , index_col=0 ) - jdf = pd.read_csv( 'journal.csv' , '\t' , index_col=0 ) - - res_dfs = impq.run_rpls_regression ( adf , jdf , 'S ~ C(industry)' , owner_by = 'angle' ) - - print ( res_dfs ) - ``` - - # Manually updated code backups for this library : - - GitLab: https://gitlab.com/richardtjornhammar/impetuous - CSDN: https://codechina.csdn.net/m0_52121311/impetuous - - -Platform: UNKNOWN -Classifier: Programming Language :: Python :: 3 -Classifier: License :: OSI Approved :: Apache Software License -Classifier: Operating System :: OS Independent -Description-Content-Type: text/markdown diff --git a/impetuous_gfa.egg-info/SOURCES.txt b/impetuous_gfa.egg-info/SOURCES.txt deleted file mode 100644 index 122341187c5cd826c3e93e8fe8169039c32306bb..0000000000000000000000000000000000000000 --- a/impetuous_gfa.egg-info/SOURCES.txt +++ /dev/null @@ -1,18 +0,0 @@ -README.md -setup.py -impetuous_gfa.egg-info/PKG-INFO -impetuous_gfa.egg-info/SOURCES.txt -impetuous_gfa.egg-info/dependency_links.txt -impetuous_gfa.egg-info/top_level.txt -src/impetuous/__init__.py -src/impetuous/clustering.py -src/impetuous/convert.py -src/impetuous/fit.py -src/impetuous/hierarchal.py -src/impetuous/impetuous.py -src/impetuous/pathways.py -src/impetuous/probabilistic.py -src/impetuous/quantification.py -src/impetuous/reducer.py -src/impetuous/spectral.py -src/impetuous/visualisation.py \ No newline at end of file diff --git a/impetuous_gfa.egg-info/dependency_links.txt b/impetuous_gfa.egg-info/dependency_links.txt deleted file mode 100644 index 8b137891791fe96927ad78e64b0aad7bded08bdc..0000000000000000000000000000000000000000 --- a/impetuous_gfa.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/impetuous_gfa.egg-info/top_level.txt b/impetuous_gfa.egg-info/top_level.txt deleted file mode 100644 index 971564ff029e659e83bb64f0d5b7e47478e5f0d5..0000000000000000000000000000000000000000 --- a/impetuous_gfa.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -impetuous