denlogv
diff --git a/‎AMRAnalysis.py
Lines changed: 106 additions & 21 deletions b/‎AMRAnalysis.py
Lines changed: 106 additions & 21 deletions
diff --git a/‎README.md
Lines changed: 57 additions & 5 deletions b/‎README.md
Lines changed: 57 additions & 5 deletions
@@ -21,6 +21,16 @@
 
 
 def save_corpus(path, amr_analysis, concatenation=False):
+    """
+    Saves AMR analysis
+
+    Args:
+        path: Path to store analysis in
+        amr_analysis (instance of AMRAnalysis): AMR analysis to be stored
+        concatenation (bool, optional): If True, analysis of concatenated AMR is stored
+
+    """
+
     Path(path).parent.mkdir(parents=True, exist_ok=True)
     with open(path, 'w') as f:
         if concatenation:
@@ -39,6 +49,16 @@ def save_corpus(path, amr_analysis, concatenation=False):
 
 
 def pprint(l, reified=False, **args):
+    """
+    Pretty print function
+
+    Args:
+        l (dict/list/penman.Graph/penman.Tree/str): Instance to be pretty printed
+        reified (bool, optional): If False, instances are reified before printing
+        **args: additional parameters for the python print function
+
+    """
+
     if isinstance(l, dict):
         print('Key\tValue')
         for k, v in l.items():            
@@ -72,7 +92,8 @@ def pprint(l, reified=False, **args):
     else:
         raise ValueError('Unknown type')
     print(**args)
-    
+ 
+ 
 class AMRAnalysis:
     def __init__(self, amr2text_alingnment_path, keep_meta=True, 
                  extended_meta=False, concat_rel=False):
@@ -91,6 +112,14 @@ def __init__(self, amr2text_alingnment_path, keep_meta=True,
 
     @staticmethod
     def reify_rename_graph_from_string(amr_string):
+        """
+        Reifies graph from AMR string
+
+        Args:
+            amr_string (str): AMR in PENMAN notation to be decoded and reified
+
+        """
+    
 
         g1 = reify_attributes(penman.decode(amr_string))
         t1 = layout.configure(g1)
@@ -101,7 +130,14 @@ def reify_rename_graph_from_string(amr_string):
 
     @staticmethod
     def alignment_labels2mrp_labels(amr_string):
-        """Currently works only on reified graphs"""
+        """
+        Currently works only on reified graphs. The function creates a mapping
+        between structure labels '0.0' and mrp labels such as 'MRPNode-1'
+
+        Args:
+            amr_string (str): AMR in PENMAN notation; it is reified before the labels are mapped
+        """
+
 
         amr_graph = AMRAnalysis.reify_rename_graph_from_string(amr_string)
         epidata, triples = amr_graph.epidata, amr_graph.triples
@@ -155,9 +191,18 @@ def get_alignments_dict_from_string(alignments_string, alignment_pattern, toks,
     @staticmethod
     def get_alignments_dict(nodes_block, labels_dict, alignments_with_toks=False, toks=None):
         """
+        Creates a dictionary of alignments
         This function deals with the problem that was found while using the 
         function above
+
+        Args:
+            nodes_block (list): Block of nodes to get alignments from
+            labels_dict (dict): Dictionary with nodes and corresponding labels
+            alignments_with_toks(bool, optional): If True tokens get aligned
+            toks (list, optional): List with tokens to be aligned, only needed if 'alignments_with_toks=True'
+
         """
+
         nodes_block = [spl_line for spl_line in nodes_block if len(spl_line) == 3]
         alignments_dict = {}
         for spl_line in nodes_block:
@@ -171,7 +216,16 @@ def get_alignments_dict(nodes_block, labels_dict, alignments_with_toks=False, to
 
         return alignments_dict
 
-    def extract_info(self, alignments_with_toks=False):    
+    def extract_info(self, alignments_with_toks=False):
+        """
+        Extracts AMRAnalysis information
+
+
+        Args:
+            alignments_with_toks (bool, optional): If True, the alignments dict contains the tokens
+                themselves instead of the indices of the span (e.g. '2-4')
+        """
+
         with open(self.amr2text_alingnment_path) as f:
             amrs = f.read().strip().split('\n\n')
             amrs = [amr.split('\n') for amr in amrs]
@@ -220,22 +274,29 @@ def extract_info(self, alignments_with_toks=False):
     @staticmethod
     def find_below(labels_dict):
         """
-        Finds nodes below a certain node using a dictionary of the following form
-        (located in 'info_dict[amr_id]['labels_dict']'):
-        
-        Key Value
-        0 MRPNode-0
-        0.0 MRPNode-1
-        0.0.0 MRPNode-2
-        0.0.0.0	MRPNode-3
-        0.0.0.0.0 MRPNode-4
-        0.0.0.0.1 MRPNode-5
-        0.0.1 MRPNode-6
-        0.0.1.0 MRPNode-7
-        
-        Returns a dict where the key is the node label (e.g 'MRPNode-2') and
-        the value is a list with all nodes represented as strings below it.
+        Finds nodes below a certain node
+
+
+        Args:
+            labels_dict (dict): Dictionary of the following form
+                                (located in 'info_dict[amr_id]['labels_dict']'):
+
+                                Key Value
+                                0 MRPNode-0
+                                0.0 MRPNode-1
+                                0.0.0 MRPNode-2
+                                0.0.0.0	MRPNode-3
+                                0.0.0.0.0 MRPNode-4
+                                0.0.0.0.1 MRPNode-5
+                                0.0.1 MRPNode-6
+                                0.0.1.0 MRPNode-7
+
+
+        Returns:
+             nodes_below_dict (dict): Dictionary, where the key is the node label (e.g 'MRPNode-2') and
+                                      the value is a list with all nodes represented as strings below it.
         """
+
         nodes_below_dict = defaultdict(list)
         for key, value in labels_dict.items():
             for k, v in labels_dict.items():
@@ -248,9 +309,16 @@ def full_span(subtree_token_spans):
         """
         Takes a list of token spans of a whole subtree
         and checks, if there are gaps. 
-        
-        Returns a list of indices if a token span is full, else False.
+
+
+        Args:
+            subtree_token_spans (list): List of token spans of a whole subtree
+
+        Returns:
+            toks_indices (list): List of indices if a token span is full
+            None: If the token span is incomplete (i.e. contains gaps)
         """
+
         toks_indices = set()
         for token_span in subtree_token_spans:
             spl = token_span.split('-')
@@ -265,7 +333,15 @@ def full_span(subtree_token_spans):
             return toks_indices
         return None
 
-    def concat_rel(self, rel=':mod'): 
+    def concat_rel(self, rel=':mod'):
+        """
+        Concatenates specified relations in all AMRs available in self.info_dict
+
+        Args:
+            rel (str, optional): Relation to concatenate
+
+        """
+
         if not self.info_dict:
             self.extract_info()
         self.graphs_concat_rel = {}
@@ -334,6 +410,15 @@ def concat_rel(self, rel=':mod'):
 
 
 def do_all_stuff(args):
+    """
+    Executes the functions of AMRAnalysis and saves the output
+
+
+    Args:
+        args (namespace object, e.g. argparse.Namespace): User-specified console arguments, read as attributes (args.concat_rel, args.extended_meta)
+
+    """
+
 
     if (not args.concat_rel) and (not args.extended_meta):
         output_suffix = 'reif'
 
@@ -1,4 +1,3 @@
-
 ## Introduction:
 #### Here is the official repositorium of our team's software project at the **Ruprecht Karl University of Heidelberg**.
 
@@ -18,6 +17,29 @@ Such examples are commonly represented using _:mod_-relation in AMR. The idea is
     - There are no nodes in $`Subtree(A)`$ (except A) that are used outside of $`Subtree(A) =>`$ has no reentrancies.
     - It corresponds to the complete token span in the sentence.
 <br><br>
+   **Examples:**  
+      
+* Conditions are fulfilled → transformation:
+   
+
+![img_2.png](presentation/chinese_lunar_rover.png)    
+<sub> **Fig. 1:** AMR of _"Chinese lunar rover lands on moon"_ before transformation </sub>  
+        
+![img_1.png](presentation/chinese_lunar_rover_concat.png)  
+<sub> **Fig. 2:** AMR of _"Chinese lunar rover lands on moon"_ after transformation </sub>  
+
+* Conditions are violated → no transformation:  
+
+
+![img_3.png](presentation/no_concatenation1_arrows.png)  
+<sub> **Fig. 3:** AMR of _"What more can I do to improve my credit score?"_, which has a reentrancy at MRPNode-5 </sub>
+ 
+ 
+![img_4.png](presentation/no_concatenation_arrows.png)  
+<sub> **Fig. 4:** AMR of _"The final chapter in the trilogy, The Matrix Revolutions, is out in
+  November"_ with incomplete span _"final chapter trilogy the matrix revolutions."_ </sub>  
+  
+  
 
 2. **Remapping** <br>
     One could argue that it may be undesirable to transform graphs in certain cases. To include an option, where this is not necessary we propose a series of steps:
@@ -42,7 +64,7 @@ In order to run this pipeline you'll need to ensure that following criteria are
 ### Pipeline:
 1. Convert a corpus (a _.txt_-file with a SICK dataset or a folder with an STS dataset) to a _.tsv_ (tab-sepated values)-file. <br> <br> **Functionalities:** <br> <br>
     - `sts2tsv.py` converts a folder with STS-dataset to a single easily readable _.tsv_-file. <br> <br>
-    - `sick2tsv.py` filters a file (.txt file which has a tab-separated-values-layout with 12 columns) with a SICK-dataset to create a .tsv with columns "sent1", "sent2", "sick" (i.e. relatedness-score). <br> <br>
+    - `sick2tsv.py` filters a file (_.txt_-file that has a tab-separated-values-layout with 12 columns) with a SICK-dataset to create a .tsv with columns "sent1", "sent2", "sick" (i.e. relatedness-score). <br> <br>
     In our experiments we filtered the dataset to exclude examples, where sentence pairs have entailment label 'CONTRADICTION'.
     ```
     Usage examples:
@@ -78,10 +100,40 @@ In order to run this pipeline you'll need to ensure that following criteria are
     python3 AMRAnalysis.py -i data/amr/SICK2014_corpus_a_aligned.mrp data/amr/SICK2014_corpus_b_aligned.mrp --output_prefix analysis/sick/SICK2014 --extended_meta
     ```
 ---    
-5. Run $`S^2Match`$ on  the resulting _AMR_-files.
-6. Evaluate by computing _Spearman rank_ and _Pearson correlation coefficients_ + Visualise the results. <br> <br>
+5. Run $`S^2Match`$ on  the resulting _AMR_-files. Our modified scripts accept 3 kinds of input without this needing to be explicitly specified: 
+	- Original _AMR_-graphs without any modifications or transformations (associated files in `analysis/sick`, `analysis/sts` have the suffix **_reif.amr** because the graphs are reified – our algorithm needs it for extracting metadata, so we found it more "fair" to compare results, where all initial graphs were the same.)
+	- Graphs, which contain alignment metadata in the typical _AMR_-format (e.g. _# ::labels_dict {"0": "MRPNode-0" ...}_). Associated files in `analysis/sick`, `analysis/sts` have the suffix **_reif_ext.amr**
+	- Transformed graphs with all parent nodes that have a _:mod_-relation merged with their subtree (if there is a corresponding full token span and no reentrancies). Associated files in `analysis/sick`, `analysis/sts` have the suffix **_concat.amr** <br>
+There are 2 relevant files in `amr_suite/py3-Smatch-and-S2match/smatch` that are our modified versions of the original $`S^2Match`$-code (all additions/changes are labeled **#SWP**):<br> <br>
+**Functionalities:** <br><br>
+	- `smatchdev_glove.py` uses GloVe-Embeddings
+	- `smatchdev_sbert.py` uses [sentence-transformers](https://www.sbert.net)
+	```
+	Usage example:
+	
+	python3 amr_suite/py3-Smatch-and-S2match/smatch/s2matchdev_glove.py \
+    -f analysis/sick/SICK2014_corpus_a_reif.amr analysis/sick/SICK2014_corpus_b_reif.amr \
+    -vectors amr_suite/vectors/glove.6B.100d.txt \
+    -diffsense 0.5 -cutoff 0.5 -v --ms \
+    > analysis/sick/s2match_glove_results/SICK2014_orig_results_full.txt
+	```	
+---	
+6. Evaluate by computing _Spearman rank_ and _Pearson correlation coefficients_ and visualising the results. <br> <br>
 **Functionalities:** <br> <br>
-	- for steps 5 and 6 please consult our Jupyter Notebook  [`walkthrough.ipynb`](https://gitlab.com/denlogv/measuring-variation-in-amr/-/blob/master/walkthrough.ipynb). Standalone scripts will be added soon. 
+	- `results2png.py` creates 2 heatmaps of the _Pearson/Spearman-correlation coefficients_ using either a single _.tsv_-file in which all the necessary scores are available, or a _.tsv_-file and 2 folders (one for the $`S^2Match`$-results with the _GloVe_-Embeddings and another for the results using _SBERT_-models) 
+	```
+	Usage examples:
+	
+	python3 results2png.py --dataset STS --gold data/STS2016_full_fix.tsv \
+    --smatch analysis/sts/s2match_glove_results analysis/sts/s2match_sbert_results \
+    --output analysis/sts/s2match_modification_results.png 
+       
+	python3 results2png.py --dataset SICK --gold analysis/SICK2014_full_scores.tsv \
+    --output analysis/sick/s2match_modification_results.png
+	```	
+<br> <br>
+![Our results on SICK:](analysis/sick/s2match_modification_results.png)<sub> **Fig. 5:** Our results on SICK </sub>  
+![Our results on STS:](analysis/sts/s2match_modification_results.png)<sub> **Fig. 6:** Our results on STS </sub> 
 ---
 ### Folders:
 We have been working with a lot of data, so we feel that a good overview would facilitate working with this repository. <br>