|
| 1 | +import os |
| 2 | +import pickle |
| 3 | +import numpy as np |
| 4 | +import scipy.sparse as sp |
| 5 | +import time |
| 6 | +from core.GraphDataBlock import GraphDataBlock |
| 7 | +from util.graph_utils import neighbor_sampling |
| 8 | + |
| 9 | + |
class EmbeddingDataSet:
    """Loads a pickled graph dataset and packages it into GraphDataBlock batches.

    Attributes:
        name (str): name of dataset
        data_dir (str): path to dataset folder
        train_dir (str): path to training data file
        test_dir (str): path to test data file
        input_dim (int): number of data features per node
        is_labelled (bool): whether underlying class labels are present
        all_data (list[GraphDataBlock]): data inputs packaged into blocks
        all_indices (np.array): input sequence when packaging data into blocks

        inputs (scipy csr matrix): data feature matrix of size n x f
        labels (np.array): data class label matrix of size n x 1
        adj_matrix (scipy csr matrix): adjacency matrix of size n x n
    """

    # Mapping from dataset name to the pickle file that stores it.
    train_dir = {'cora': 'cora_full.pkl'}

    # The same files currently serve as the test split.
    test_dir = train_dir

    # Max neighbor samples per layer when expanding a mini-batch with
    # neighbor_sampling (shared by create_all_data and get_sample_block).
    _D_LAYERS = [9, 14]

    def __init__(self, name, data_dir, train=True):
        """Load the dataset pickle and set up feature/label/adjacency matrices.

        Args:
            name (str): dataset key; must exist in EmbeddingDataSet.train_dir
            data_dir (str): root folder containing one subfolder per dataset
            train (bool): load the training split if True, else the test split

        Raises:
            KeyError: if `name` is not a known dataset
            FileNotFoundError: if the pickle file is missing
        """
        self.name = name
        self.data_dir = data_dir
        # Instance attributes shadow the class-level dicts on purpose: after
        # __init__ they hold the concrete filename for this dataset only.
        self.train_dir = EmbeddingDataSet.train_dir[name]
        self.test_dir = EmbeddingDataSet.test_dir[name]
        self.is_labelled = False

        self.all_data = []

        # Extract data from file contents
        data_root = os.path.join(self.data_dir, self.name)
        if train:
            fname = os.path.join(data_root, self.train_dir)
        else:
            assert self.test_dir is not None
            fname = os.path.join(data_root, self.test_dir)
        # NOTE(review): pickle.load can execute arbitrary code when fed an
        # untrusted file -- only load dataset pickles from trusted sources.
        with open(fname, 'rb') as f:
            file_contents = pickle.load(f)

        # Expected pickle layout: (inputs, labels, adjacency, ...).
        self.inputs = file_contents[0]
        self.labels = file_contents[1]
        self.adj_matrix = file_contents[2]

        # An empty labels container marks an unlabelled dataset.
        self.is_labelled = len(self.labels) != 0
        self.input_dim = self.inputs.shape[1]

        self.all_indices = np.arange(0, self.inputs.shape[0])

        # Normalize both matrices to CSR for efficient row slicing below.
        self.inputs = sp.csr_matrix(self.inputs)
        self.adj_matrix = sp.csr_matrix(self.adj_matrix)

    def create_all_data(self, n_batches=1, shuffle=False, sampling=False, full_path_matrix=None):
        """
        Initialises all_data as a list of GraphDataBlock
        Args:
            n_batches (int): number of blocks to return
            shuffle (bool): whether to shuffle input sequence
            sampling (bool): whether to expand data blocks with neighbor sampling
            full_path_matrix: optional precomputed all-pairs shortest-path
                matrix; each block receives its own sub-matrix
        """
        i = 0
        labels_subset = []
        self.all_data = []

        if shuffle:
            # In-place shuffle of the index order; calling again with
            # shuffle=False restores the natural 0..n-1 order.
            np.random.shuffle(self.all_indices)
        else:
            self.all_indices = np.arange(0, self.inputs.shape[0])

        # Split equally
        # TODO: Another option to split randomly
        chunk_sizes = self.get_k_equal_chunks(self.inputs.shape[0], k=n_batches)

        t_start = time.time()

        for num_samples in chunk_sizes:
            # Sorted indices keep CSR row slicing efficient and deterministic.
            mask = sorted(self.all_indices[i: i + num_samples])

            # Perform sampling to obtain local neighborhood of mini-batch
            if sampling:
                mask = neighbor_sampling(self.adj_matrix, mask, EmbeddingDataSet._D_LAYERS)

            inputs_subset = self.inputs[mask]
            adj_subset = self.adj_matrix[mask, :][:, mask]

            # Unlabelled datasets keep labels_subset = [] (cannot fancy-index
            # an empty labels container).
            if self.is_labelled:
                labels_subset = self.labels[mask]

            # Package data into graph block
            G = GraphDataBlock(inputs_subset, labels=labels_subset, W=adj_subset)

            # Add original indices from the complete dataset
            G.original_indices = mask

            # Add shortest path matrix from precomputed data if needed
            if full_path_matrix is not None:
                G.precomputed_path_matrix = full_path_matrix[mask, :][:, mask]

            self.all_data.append(G)
            i += num_samples

        t_elapsed = time.time() - t_start
        print('Data blocks of length: ', [len(G.labels) for G in self.all_data])
        print("Time to create all data (s) = {:.4f}".format(t_elapsed))

    def summarise(self):
        """Print a short human-readable summary of the loaded dataset."""
        print("Name of dataset = {}".format(self.name))
        print("Input dimension = {}".format(self.input_dim))
        print("Number of training samples = {}".format(self.inputs.shape[0]))
        print("Training labels = {}".format(self.is_labelled))

    def get_k_equal_chunks(self, n, k):
        """Return k chunk sizes that sum to n, as equal as possible.

        Returns n % k sizes of n//k + 1 followed by the rest of size n//k.
        Note: if k > n, the trailing chunks have size 0.
        """
        p, r = divmod(n, k)
        return [p + 1 for _ in range(r)] + [p for _ in range(k - r)]

    def get_current_inputs(self):
        """Return (inputs, labels, adj) restricted/reordered to all_indices."""
        inputs = self.inputs[self.all_indices]
        # Guard the unlabelled case: an empty labels container cannot be
        # fancy-indexed, so return it untouched.
        labels = self.labels[self.all_indices] if self.is_labelled else self.labels
        adj = self.adj_matrix[self.all_indices, :][:, self.all_indices]
        return inputs, labels, adj

    def get_sample_block(self, n_initial, sample_neighbors, verbose=0):
        """
        Returns a subset of data as a GraphDataBlock
        Args:
            n_initial (int): number of samples at the start
            sample_neighbors (bool): whether to expand the sample block with neighbor sampling
            verbose (int): if nonzero, print the expansion size
        Returns:
            G (GraphDataBlock): data subset
        """

        mask = sorted(np.random.choice(self.all_indices, size=n_initial, replace=False))
        if sample_neighbors:
            mask = neighbor_sampling(self.adj_matrix, mask, D_layers=EmbeddingDataSet._D_LAYERS)
        inputs = self.inputs[mask]
        # Guard the unlabelled case (consistent with create_all_data):
        # previously this indexed self.labels unconditionally and raised on
        # unlabelled datasets.
        labels = self.labels[mask] if self.is_labelled else []
        W = self.adj_matrix[mask, :][:, mask]
        G = GraphDataBlock(inputs, labels=labels, W=W)
        G.original_indices = mask
        if verbose:
            print("Initial set of {} points was expanded to {} points".format(n_initial, len(mask)))
        return G
0 commit comments