From c0f827b56ddbc0f259d9d617754cac913e0e9998 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Richard=20Tj=C3=B6rnhammar?= <richard.tjornhammar@gmail.com>
Date: Wed, 23 Jun 2021 11:05:49 +0200
Subject: [PATCH] encoding method

---
 README.md                       |  2 +-
 setup.py                        |  2 +-
 src/impetuous/quantification.py | 40 +++++++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8a37957..f7101d4 100755
--- a/README.md
+++ b/README.md
@@ -298,7 +298,7 @@ which will report that
 |Hierarchical,p  |   6.55459e-05 |
 |Name: |  R-HSA-390522 |
 
-is affected or perhaps needs to be compensated for... now perhaps you thought this excerice was a tad tedious? Well you are correct. It is and you could just as well have copied the gene transcripts into [String-db](https://string-db.org/cgi/input?sessionId=beIptQQxF85j&input_page_active_form=multiple_identifiers) and gotten similar results out. But, then you wouldn't have gotten to use the hierarchical enrichment method I invented!
+is affected or perhaps needs to be compensated for... Now, perhaps you thought this exercise was a tad tedious? Well, you are correct. It is, and you could just as well have copied the gene transcripts into [String-db](https://string-db.org/cgi/input?sessionId=beIptQQxF85j&input_page_active_form=multiple_identifiers) and gotten similar results out. But then you wouldn't have gotten to use the hierarchical enrichment method I invented!
 
 These examples were meant as illustrations of some of the codes implemented in the impetuous-gfa package.
 
diff --git a/setup.py b/setup.py
index 367be17..c468678 100755
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
 
 setuptools.setup(
     name = "impetuous-gfa",
-    version = "0.50.1",
+    version = "0.51.0",
     author = "Richard Tjörnhammar",
     author_email = "richard.tjornhammar@gmail.com",
     description = "Impetuous Quantification, a Statistical Learning library for Humans : Alignments, Clustering, Enrichments and Group Analysis",
diff --git a/src/impetuous/quantification.py b/src/impetuous/quantification.py
index 362c3de..1fc4aa6 100755
--- a/src/impetuous/quantification.py
+++ b/src/impetuous/quantification.py
@@ -134,6 +134,46 @@ def find_category_interactions ( istr ) :
     interacting_categories = [ [all_cats[i-1],all_cats[i]] for i in range(1,len(interacting)) if interacting[i] ]
     return ( interacting_categories )
 
+
+def create_encoding_data_frame ( journal_df , formula , bVerbose=False ) :
+    #
+    # BUILD THE ENCODING FRAME FOR THE RIGHT HAND SIDE OF THE FORMULA
+    # THE JOURNAL_DF IS THE COARSE GRAINED DATA (THE MODEL)
+    # THE FORMULA IS THE SEMANTIC DESCRIPTION OF THE PROBLEM
+    # NOTE : INTERACTION LABEL ROWS ARE WRITTEN INTO JOURNAL_DF IN PLACE
+    # RETURNS NONE WHEN NEITHER CATEGORICAL NOR LINEAR TERMS ARE FOUND
+    #
+    rhs = formula.split('~')[1]
+    sjdf = set(journal_df.index)
+    # PREFER THE BARE NAME , FALL BACK ON C(NAME) , ELSE AN EMPTY LABEL
+    def pick ( p ) :
+        return p if p in sjdf else ( 'C('+p+')' if 'C('+p+')' in sjdf else '' )
+    add_pairs = []
+    for pair in find_category_interactions ( rhs ) :
+        upair = [ pick(p) for p in pair ]
+        journal_df.loc[ ':'.join(upair) ] = [ p[0]+'-'+p[1] for p in journal_df.loc[ upair,: ].T.values ]
+        add_pairs.append(':'.join(upair))
+    use_categories = [ pick(u) for u in set(find_category_variables(rhs)) ]
+    use_categories = [ *use_categories,*add_pairs ]
+    if len( use_categories )>0 :
+        encoding_df = create_encoding_journal ( use_categories , journal_df ).T
+    else :
+        encoding_df = None
+    if bVerbose :
+        # THERE ARE NO COLUMNS TO LIST WHEN NOTHING WAS ENCODED
+        if not encoding_df is None :
+            print ( [ v for v in encoding_df.columns.values ] )
+        print ( 'ADD IN ANY LINEAR TERMS AS THEIR OWN AXIS' )
+    # THIS TURNS THE MODEL INTO A MIXED LINEAR MODEL
+    add_df = journal_df.loc[ [c.replace(' ','') for c in rhs.split('+') if not 'C(' in c],: ]
+    if len(add_df)>0 :
+        if encoding_df is None :
+            encoding_df = add_df.T
+        else :
+            encoding_df = pd.concat([ encoding_df.T , add_df ]).T
+    return ( encoding_df )
+
+
 def interpret_problem ( analyte_df , journal_df , formula , bVerbose=False ) :
     #
     # THE JOURNAL_DF IS THE COARSE GRAINED DATA (THE MODEL)
-- 
GitLab