Skip to content

Commit 6416e5e

Browse files
committed
Polished the repository, updated Readme, updated walkthrough.ipynb, added some function docstrings in the scripts.
1 parent 5001951 commit 6416e5e

12 files changed

+8874
-339
lines changed

AMRAnalysis.py

Lines changed: 106 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,16 @@
2121

2222

2323
def save_corpus(path, amr_analysis, concatenation=False):
24+
"""
25+
Saves AMR analysis
26+
27+
Args:
28+
path: Path to store analysis in
29+
amr_analysis (instance of AMRAnalysis): AMR analysis to be stored
30+
concatenation (bool, optional): If True, analysis of concatenated AMR is stored
31+
32+
"""
33+
2434
Path(path).parent.mkdir(parents=True, exist_ok=True)
2535
with open(path, 'w') as f:
2636
if concatenation:
@@ -39,6 +49,16 @@ def save_corpus(path, amr_analysis, concatenation=False):
3949

4050

4151
def pprint(l, reified=False, **args):
52+
"""
53+
Pretty print function
54+
55+
Args:
56+
l (dict/list/penman.Graph/penman.Tree/str): Instance to be pretty printed
57+
reified (bool, optional): If False, instances are reified before printing
58+
**args: additional parameters for the python print function
59+
60+
"""
61+
4262
if isinstance(l, dict):
4363
print('Key\tValue')
4464
for k, v in l.items():
@@ -72,7 +92,8 @@ def pprint(l, reified=False, **args):
7292
else:
7393
raise ValueError('Unknown type')
7494
print(**args)
75-
95+
96+
7697
class AMRAnalysis:
7798
def __init__(self, amr2text_alingnment_path, keep_meta=True,
7899
extended_meta=False, concat_rel=False):
@@ -91,6 +112,14 @@ def __init__(self, amr2text_alingnment_path, keep_meta=True,
91112

92113
@staticmethod
93114
def reify_rename_graph_from_string(amr_string):
115+
"""
116+
Reifies graph from AMR string
117+
118+
Args:
119+
amr_string (str): AMR in penman notation to be reified
120+
121+
"""
122+
94123

95124
g1 = reify_attributes(penman.decode(amr_string))
96125
t1 = layout.configure(g1)
@@ -101,7 +130,14 @@ def reify_rename_graph_from_string(amr_string):
101130

102131
@staticmethod
103132
def alignment_labels2mrp_labels(amr_string):
104-
"""Currently works only on reified graphs"""
133+
"""
134+
Currently works only on reified graphs. The function creates a mapping
135+
between structure labels '0.0' and mrp labels such as 'MRPNode-1'
136+
137+
Args:
138+
amr_string (str): AMR in penman notation to be reified
139+
"""
140+
105141

106142
amr_graph = AMRAnalysis.reify_rename_graph_from_string(amr_string)
107143
epidata, triples = amr_graph.epidata, amr_graph.triples
@@ -155,9 +191,18 @@ def get_alignments_dict_from_string(alignments_string, alignment_pattern, toks,
155191
@staticmethod
156192
def get_alignments_dict(nodes_block, labels_dict, alignments_with_toks=False, toks=None):
157193
"""
194+
Creates a dictionary of alignments
158195
This function deals with the problem that was found while using the
159196
function above
197+
198+
Args:
199+
nodes_block (list): Block of nodes to get alignments from
200+
labels_dict (dict): Dictionary with nodes and corresponding labels
201+
alignments_with_toks(bool, optional): If True tokens get aligned
202+
toks (list, optional): List with tokens to be aligned, only needed if 'alignments_with_toks=True'
203+
160204
"""
205+
161206
nodes_block = [spl_line for spl_line in nodes_block if len(spl_line) == 3]
162207
alignments_dict = {}
163208
for spl_line in nodes_block:
@@ -171,7 +216,16 @@ def get_alignments_dict(nodes_block, labels_dict, alignments_with_toks=False, to
171216

172217
return alignments_dict
173218

174-
def extract_info(self, alignments_with_toks=False):
219+
def extract_info(self, alignments_with_toks=False):
220+
"""
221+
Extracts AMRAnalysis information
222+
223+
224+
Args:
225+
alignments_with_toks (bool, optional): If True, tokens will be in the alignments dict,
226+
and not the indices of the span like '2-4'
227+
"""
228+
175229
with open(self.amr2text_alingnment_path) as f:
176230
amrs = f.read().strip().split('\n\n')
177231
amrs = [amr.split('\n') for amr in amrs]
@@ -220,22 +274,29 @@ def extract_info(self, alignments_with_toks=False):
220274
@staticmethod
221275
def find_below(labels_dict):
222276
"""
223-
Finds nodes below a certain node using a dictionary of the following form
224-
(located in 'info_dict[amr_id]['labels_dict']'):
225-
226-
Key Value
227-
0 MRPNode-0
228-
0.0 MRPNode-1
229-
0.0.0 MRPNode-2
230-
0.0.0.0 MRPNode-3
231-
0.0.0.0.0 MRPNode-4
232-
0.0.0.0.1 MRPNode-5
233-
0.0.1 MRPNode-6
234-
0.0.1.0 MRPNode-7
235-
236-
Returns a dict where the key is the node label (e.g 'MRPNode-2') and
237-
the value is a list with all nodes represented as strings below it.
277+
Finds nodes below a certain node
278+
279+
280+
Args:
281+
labels_dict (dict): Dictionary of the following form
282+
(located in 'info_dict[amr_id]['labels_dict']'):
283+
284+
Key Value
285+
0 MRPNode-0
286+
0.0 MRPNode-1
287+
0.0.0 MRPNode-2
288+
0.0.0.0 MRPNode-3
289+
0.0.0.0.0 MRPNode-4
290+
0.0.0.0.1 MRPNode-5
291+
0.0.1 MRPNode-6
292+
0.0.1.0 MRPNode-7
293+
294+
295+
Returns:
296+
nodes_below_dict (dict): Dictionary, where the key is the node label (e.g 'MRPNode-2') and
297+
the value is a list with all nodes represented as strings below it.
238298
"""
299+
239300
nodes_below_dict = defaultdict(list)
240301
for key, value in labels_dict.items():
241302
for k, v in labels_dict.items():
@@ -248,9 +309,16 @@ def full_span(subtree_token_spans):
248309
"""
249310
Takes a list of token spans of a whole subtree
250311
and checks, if there are gaps.
251-
252-
Returns a list of indices if a token span is full, else False.
312+
313+
314+
Args:
315+
subtree_token_spans (list): List of token spans of a whole subtree
316+
317+
Returns:
318+
toks_indices (list): List of indices if a token span is full
319+
None: If token span is incomplete
253320
"""
321+
254322
toks_indices = set()
255323
for token_span in subtree_token_spans:
256324
spl = token_span.split('-')
@@ -265,7 +333,15 @@ def full_span(subtree_token_spans):
265333
return toks_indices
266334
return None
267335

268-
def concat_rel(self, rel=':mod'):
336+
def concat_rel(self, rel=':mod'):
337+
"""
338+
Concatenates specified relations in all AMRs available in self.info_dict
339+
340+
Args:
341+
rel (str, optional): Relation to concatenate
342+
343+
"""
344+
269345
if not self.info_dict:
270346
self.extract_info()
271347
self.graphs_concat_rel = {}
@@ -334,6 +410,15 @@ def concat_rel(self, rel=':mod'):
334410

335411

336412
def do_all_stuff(args):
413+
"""
414+
Function to execute functions of AMRAnalysis and save output
415+
416+
417+
Args:
418+
args: Parsed user-specified arguments collected from the console (read as attributes, e.g. args.concat_rel)
419+
420+
"""
421+
337422

338423
if (not args.concat_rel) and (not args.extended_meta):
339424
output_suffix = 'reif'

README.md

Lines changed: 57 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
## Introduction:
32
#### Here is the official repository of our team's software project at the **Ruprecht Karl University of Heidelberg**.
43

@@ -18,6 +17,29 @@ Such examples are commonly represented using _:mod_-relation in AMR. The idea is
1817
- There are no nodes in $`Subtree(A)`$ (except A) that are used outside of $`Subtree(A) =>`$ has no reentrancies.
1918
- It corresponds to the complete token span in the sentence.
2019
<br><br>
20+
**Examples:**
21+
22+
* Conditions are fulfilled → transformation:
23+
24+
25+
![img_2.png](presentation/chinese_lunar_rover.png)
26+
<sub> **Fig. 1:** AMR of _"Chinese lunar rover lands on moon"_ before transformation </sub>
27+
28+
![img_1.png](presentation/chinese_lunar_rover_concat.png)
29+
<sub> **Fig. 2:** AMR of _"Chinese lunar rover lands on moon"_ after transformation </sub>
30+
31+
* Conditions are violated → no transformation:
32+
33+
34+
![img_3.png](presentation/no_concatenation1_arrows.png)
35+
<sub> **Fig. 3:** AMR of _"What more can I do to improve my credit score?"_ has a reentrancy at MRPNode-5 </sub>
36+
37+
38+
![img_4.png](presentation/no_concatenation_arrows.png)
39+
<sub> **Fig. 4:** AMR of _"The final chapter in the trilogy, The Matrix Revolutions, is out in
40+
November"_ with incomplete span _"final chapter trilogy the matrix revolutions."_ </sub>
41+
42+
2143

2244
2. **Remapping** <br>
2345
One could argue that it may be undesirable to transform graphs in certain cases. To include an option, where this is not necessary we propose a series of steps:
@@ -42,7 +64,7 @@ In order to run this pipeline you'll need to ensure that following criteria are
4264
### Pipeline:
4365
1. Convert a corpus (a _.txt_-file with a SICK dataset or a folder with an STS dataset) to a _.tsv_ (tab-separated values)-file. <br> <br> **Functionalities:** <br> <br>
4466
- `sts2tsv.py` converts a folder with STS-dataset to a single easily readable _.tsv_-file. <br> <br>
45-
- `sick2tsv.py` filters a file (.txt file which has a tab-separated-values-layout with 12 columns) with a SICK-dataset to create a .tsv with columns "sent1", "sent2", "sick" (i.e. relatedness-score). <br> <br>
67+
- `sick2tsv.py` filters a file (_.txt_-file that has a tab-separated-values-layout with 12 columns) with a SICK-dataset to create a .tsv with columns "sent1", "sent2", "sick" (i.e. relatedness-score). <br> <br>
4668
In our experiments we filtered the dataset to exclude examples, where sentence pairs have entailment label 'CONTRADICTION'.
4769
```
4870
Usage examples:
@@ -78,10 +100,40 @@ In order to run this pipeline you'll need to ensure that following criteria are
78100
python3 AMRAnalysis.py -i data/amr/SICK2014_corpus_a_aligned.mrp data/amr/SICK2014_corpus_b_aligned.mrp --output_prefix analysis/sick/SICK2014 --extended_meta
79101
```
80102
---
81-
5. Run $`S^2Match`$ on the resulting _AMR_-files.
82-
6. Evaluate by computing _Spearman rank_ and _Pearson correlation coefficients_ + Visualise the results. <br> <br>
103+
5. Run $`S^2Match`$ on the resulting _AMR_-files. Our modified scripts accept 3 kinds of input without this needing to be explicitly specified:
104+
- Original _AMR_-graphs without any modifications or transformations (associated files in `analysis/sick`, `analysis/sts` have the suffix **_reif.amr** because the graphs are reified – our algorithm needs it for extracting metadata, so we found it more "fair" to compare results, where all initial graphs were the same.)
105+
- Graphs, which contain alignment metadata in the typical _AMR_-format (e.g. _# ::labels_dict {"0": "MRPNode-0" ...}_). Associated files in `analysis/sick`, `analysis/sts` have the suffix **_reif_ext.amr**
106+
- Transformed graphs with all parent nodes that have a _:mod_-relation merged with their subtree (if there is a corresponding full token span and no reentrancies). Associated files in `analysis/sick`, `analysis/sts` have the suffix **_concat.amr** <br>
107+
There are 2 relevant files in `amr_suite/py3-Smatch-and-S2match/smatch` that are our modified versions of the original $`S^2Match`$-code (all additions/changes are labeled **#SWP**):<br> <br>
108+
**Functionalities:** <br><br>
109+
- `smatchdev_glove.py` uses GloVe-Embeddings
110+
- `smatchdev_sbert.py` uses [sentence-transformers](https://www.sbert.net)
111+
```
112+
Usage example:
113+
114+
python3 amr_suite/py3-Smatch-and-S2match/smatch/s2matchdev_glove.py \
115+
-f analysis/sick/SICK2014_corpus_a_reif.amr analysis/sick/SICK2014_corpus_b_reif.amr \
116+
-vectors amr_suite/vectors/glove.6B.100d.txt \
117+
-diffsense 0.5 -cutoff 0.5 -v --ms \
118+
> analysis/sick/s2match_glove_results/SICK2014_orig_results_full.txt
119+
```
120+
---
121+
6. Evaluate by computing _Spearman rank_ and _Pearson correlation coefficients_ and visualising the results. <br> <br>
83122
**Functionalities:** <br> <br>
84-
- for steps 5 and 6 please consult our Jupyter Notebook [`walkthrough.ipynb`](https://gitlab.com/denlogv/measuring-variation-in-amr/-/blob/master/walkthrough.ipynb). Standalone scripts will be added soon.
123+
- `results2png.py` creates 2 heatmaps of the _Pearson/Spearman-correlation coefficients_ using either a _.tsv_-file, where all the necessary scores are available, or a _.tsv_-file and 2 folders (one for the $`S^2Match`$-results with the _GloVe_-Embeddings and another for the results using _SBERT_-models)
124+
```
125+
Usage examples:
126+
127+
python3 results2png.py --dataset STS --gold data/STS2016_full_fix.tsv \
128+
--smatch analysis/sts/s2match_glove_results analysis/sts/s2match_sbert_results \
129+
--output analysis/sts/s2match_modification_results.png
130+
131+
python3 results2png.py --dataset SICK --gold analysis/SICK2014_full_scores.tsv \
132+
--output analysis/sick/s2match_modification_results.png
133+
```
134+
<br> <br>
135+
![Our results on SICK:](analysis/sick/s2match_modification_results.png)<sub> **Fig. 5:** Our results on SICK </sub>
136+
![Our results on STS:](analysis/sts/s2match_modification_results.png)<sub> **Fig. 6:** Our results on STS </sub>
85137
---
86138
### Folders:
87139
We have been working with a lot of data, so we feel that a good overview would facilitate working with this repository. <br>

0 commit comments

Comments
 (0)