"""
Evaluation script for RAMP
"""
import math
import time
from dataclasses import dataclass, field
from datetime import timedelta
from pathlib import Path
from typing import Optional

import torch
from accelerate import Accelerator, InitProcessGroupKwargs
from transformers import HfArgumentParser

from eval_graph_src.eval_utils import (
    dump_jsonl,
    load_data,
    DATA_NAME_TO_MAX_NEW_TOKENS,
)
from src import get_model_and_tokenizer, ModelArgs

######### Compute Metrics #########
from sklearn.metrics import classification_report
MAX_POSITION_ID = 256 * 1024  # Determined by the model
TRUNCATE_LEN = 256 * 1024     # Truncation length for over-long inputs
device = torch.device("cuda")
@dataclass
class Args(ModelArgs):
    eval_data: str = field(
        default="activation-beacon:lm/pg19.json",
        metadata={'help': 'The evaluation json data path.'}
    )
    output_dir: str = field(
        default="data/results/lm/",
        metadata={'help': 'Output directory for results and logs.'}
    )
    retokenize: bool = field(
        default=False,
        metadata={'help': 'Whether to retokenize the corpus.'}
    )
    tokenize_max_char: Optional[int] = field(
        default=None,
        metadata={'help': 'The number of characters to truncate to.'}
    )
    batch_size: int = field(
        default=1,
        metadata={'help': 'Evaluation batch size.'}
    )
    padding_side: str = field(
        default="right",
        metadata={'help': 'Which side to pad.'}
    )
    stride: int = field(
        default=2048,
        metadata={'help': 'Streaming stride when evaluating perplexity.'}
    )
    max_sample_num: int = field(
        default=100,
        metadata={'help': 'How many samples to evaluate from eval_data.'}
    )
    min_length: Optional[int] = field(
        default=None,
        metadata={'help': 'Minimum length for input_ids.'}
    )
    ###### RAMP ######
    training_stage: str = field(
        default="finetune",
        metadata={'help': 'Training stage: pretrain or finetune.'}
    )
    mp_layer_num: int = field(
        default=3,
        metadata={'help': 'The number of message-passing (MP) layers.'}
    )
    c_ratio: float = field(
        default=0.1,
        metadata={'help': 'Compression ratio applied to node content.'}
    )
    ###### RAMP ######
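
# Example invocation (a sketch; the data path and model arguments depend on
# your local setup and checkpoints -- HfArgumentParser exposes every field of
# Args above as a CLI flag):
#   python eval_graph_model.py \
#       --eval_data <your_eval_data> \
#       --output_dir data/results/lm/ \
#       --mp_layer_num 3 --c_ratio 0.1
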
def process_input(eg, c_ratio=0.1):
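    """Tokenize one example into a flat token sequence plus graph metadata.

    `eg` is expected to carry: `question`, `answer`, `context` (one text chunk
    per node), `node_labels`, `nbr_idx` (neighbor indices per node), and
    `instance_id`. Returns the concatenated input ids, the gold answer, the
    per-chunk token counts, the neighbor indices, the node labels, the
    instance id, and the per-node beacon sizes derived from `c_ratio`.
    """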
    question = eg["question"]
    answer = eg["answer"]
    context = eg["context"]
    node_labels = eg["node_labels"]
    nbr_idx = eg["nbr_idx"]
    instance_id = eg["instance_id"]
    chunk_size = []
    tokenized_chunks = []
    # Compression by ratio: each node's chunk gets a beacon (memory) budget
    beacon_sizes = []
    for node_context in context:
        tokenized_chunk = tokenizer.encode(node_context, add_special_tokens=False)
        tokenized_chunks.append(tokenized_chunk)  # node tokens
        raw_chunk_length = len(tokenized_chunk)
        chunk_size.append(raw_chunk_length)  # chunk size in tokens
        beacon_sizes.append(math.ceil(raw_chunk_length * c_ratio))  # beacon size
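        # e.g. with c_ratio=0.1, a 100-token node keeps ceil(100 * 0.1) = 10
        # beacon slots and a 7-token node keeps ceil(7 * 0.1) = 1, so every
        # non-empty node retains at least one beacon token.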
    # Encode the question as the final chunk
    tokenized_question = tokenizer.encode(question, add_special_tokens=False)
    tokenized_chunks.append(tokenized_question)
    chunk_size.append(len(tokenized_question))
    # Concatenate the token ids of all chunks
    combined_tokens = []
    for tokenized_chunk in tokenized_chunks:
        combined_tokens.extend(tokenized_chunk)
    # Round-trip sanity check: decoding and re-encoding should be loss-free
    decoded_text = tokenizer.decode(combined_tokens, skip_special_tokens=True)
    inputs = tokenizer(decoded_text)
    length_before = len(combined_tokens)
    length_after = len(inputs["input_ids"])
    if length_before != length_after:  # Inconsistent lengths signal a tokenization bug
        print(f"===== ERROR CASE: {instance_id} =====")
    # Keep the original concatenated ids, not the re-tokenized ones
    inputs["input_ids"] = combined_tokens
    inputs["node_labels"] = node_labels  # Node labels
    inputs["chunk_sizes"] = chunk_size
    inputs["beacon_sizes"] = beacon_sizes
    # Neighbor / edge information
    inputs["nbr_idx"] = nbr_idx
    # length is required for grouping
    inputs["length"] = len(inputs["input_ids"])
    return inputs["input_ids"], answer, inputs["chunk_sizes"], inputs["nbr_idx"], inputs["node_labels"], instance_id, inputs["beacon_sizes"]
def pad_to_equal_length(sequences, pad_value=-1):
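    """Right-pad each list in `sequences` to the length of the longest one.

    >>> pad_to_equal_length([[1, 2], [3]])
    [[1, 2], [3, -1]]
    """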
    max_length = max(len(seq) for seq in sequences)
    padded_sequences = []
    for seq in sequences:
        padding_length = max_length - len(seq)
        padded_sequences.append(seq + [pad_value] * padding_length)
    return padded_sequences
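
# Note: the -1 padding presumably marks absent neighbors; masking those
# entries out is assumed to happen inside the model's message-passing layers
# (not shown in this script).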
parser = HfArgumentParser([Args])
args: Args = parser.parse_args_into_dataclasses()[0]
# Use a large process-group timeout so long evaluations do not hit distributed timeouts
accelerator = Accelerator(cpu=args.cpu, kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=100000))])
model, tokenizer = get_model_and_tokenizer(args, mp_layer_num=args.mp_layer_num, accelerator=accelerator)
model.eval()
if __name__ == "__main__":
    DEBUG = False
    eval_data_dir = "./data_preprocess/cora/processed/"
    print(f"==== args mp_layer_num: {args.mp_layer_num} ====")
    print(f"==== args c_ratio: {args.c_ratio} ====")
    data_names = [args.eval_data]
    for data_name in data_names:
        # Generation budget for this dataset
        max_tokens = DATA_NAME_TO_MAX_NEW_TOKENS[data_name]
        # Data
        result_dir = Path(args.output_dir)
        result_dir.mkdir(exist_ok=True, parents=True)
        examples = load_data(data_name, data_dir=eval_data_dir)
        start_idx = 0
        stop_idx = len(examples)
        output_path = result_dir / f"preds_{data_name}.jsonl"
        preds = []
        print("==== Evaluation ====")
        print(f"# examples: {len(examples)}")
        print(f"Start index: {start_idx}")
        print(f"Stop index: {stop_idx}")
        print(f"Max tokens: {max_tokens}")
        for i in range(start_idx, stop_idx):
            eg = examples[i]
            input_ids, answer, chunk_sizes, nbr_idx, node_labels, instance_id, beacon_sizes = process_input(eg, c_ratio=args.c_ratio)
            print(f"====== Example {i}: {instance_id} ======")
            # Batchify everything to shape [1, ...] and move it to the GPU
            input_ids = torch.tensor(input_ids, device=device).unsqueeze(0)
            chunk_sizes = torch.tensor(chunk_sizes, device=device).unsqueeze(0)
            beacon_sizes = torch.tensor(beacon_sizes, device=device).unsqueeze(0)
            # Neighbor lists are ragged, so pad them before tensorizing
            nbr_idx = torch.tensor(pad_to_equal_length(nbr_idx), dtype=torch.int64, device=device).unsqueeze(0)
            node_labels = torch.tensor(node_labels, dtype=torch.int64, device=device).unsqueeze(0)
            print(f"Prompt has {input_ids.shape[-1]} tokens")
            start_time = time.time()
            # Clear any cached compressed memory from the previous example
            if hasattr(model, "memory") and model.memory is not None:
                model.memory.reset()
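            # chunk_sizes / beacon_sizes / nbr_idx / node_labels are custom
            # kwargs; they are presumably consumed by the RAMP model's
            # generate/forward override in src/, not by vanilla transformers
            # generation.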
            outputs = model.generate(
                input_ids,
                max_new_tokens=max_tokens,
                num_beams=1,
                do_sample=False,  # greedy decoding
                temperature=1.0,
                chunk_sizes=chunk_sizes,
                beacon_sizes=beacon_sizes,
                nbr_idx=nbr_idx,
                node_labels=node_labels,
                pad_token_id=tokenizer.eos_token_id,
            )
            # Strip the prompt; keep only the newly generated tokens
            model_answer = tokenizer.decode(outputs[0, input_ids.shape[1]:].cpu())
            end_time = time.time()
            runtime = end_time - start_time
            # Check output format
            if DEBUG:
                question = eg["question"]
            # node_num must be computed unconditionally (it is logged below);
            # the -1 presumably excludes the question entry from the node count
            node_num = nbr_idx.shape[1] - 1
            preds.append(
                {
                    "id": i,
                    "prediction": model_answer,
                    "ground_truth": answer,
                    "tokens": input_ids.shape[-1],
                    "node_num": node_num,
                    "time": runtime,
                }
            )
            if DEBUG and i % 40 == 39:  # Periodically inspect the output
                print(preds)
                break
        dump_jsonl(preds, output_path)
        ##################
        # Compute metrics: a prediction counts as correct when the gold label
        # (the text after the final ":") appears in the generated answer
        y_pred_list = []
        y_true_list = []
        acc_cnt = 0
        for item in preds:
            y_pred, y_true = item["prediction"], item["ground_truth"]
            y_true_item = y_true.split(":")[-1].lower()
            if y_pred == "<|im_end|>":  # Skip empty generations
                continue
            y_pred_list.append(y_pred)
            y_true_list.append(y_true)
            if y_true_item in y_pred.lower():
                acc_cnt += 1
        print(f"===== Acc: {acc_cnt / len(y_pred_list)} =====")
        ##################