EnhancerMatcher/OneNucleotideIndexer.py at main · BioinformaticsToolsmith/EnhancerMatcher · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
### Created on Jan 15, 2024
### @author: Dr. Hani Z. Girgis, Bioinformatics Toolsmith Laboratory, Texas A&M University-Kingsville

import numpy as np
import multiprocessing
import math

class OneNucleotideIndexer:
    def __init__(self, max_len, another_indexer=None):
        assert max_len > 0, f'The maximum length ({max_len}) must be positive'

        self.max_len = max_len
        if another_indexer:
            self.index_table = another_indexer.get_index_table()
        else:
            self.index_table = {}

        self.pad = 0
        self.oov = 1

    def get_index_table(self):
        return self.index_table

    def update_table(self, rec_list):
        '''
        Assing each nucleotide a unique index
        '''
        count = 0
        for a_rec in rec_list:
            if count % 100000 == 0:
                print('.', end='')
            count += 1

            for a_token in list(a_rec.seq.upper()):
                if a_token not in self.index_table:
                    self.index_table[a_token] = len(self.index_table) + 2

    def get_vocabulary_size(self):
        '''
        Get the number of tokens
        This number include the padding token and the oov token
        '''
        return len(self.index_table) + 2

    def encode_list(self, a_rec_list):
        '''
        Encode a list of records
        '''
        r = self.pad * np.ones((len(a_rec_list), self.max_len), dtype=np.int8)
        for i, a_rec in enumerate(a_rec_list):
            if i % 100000 == 0:
                print('.', end='')

            l = [self.index_table.get(char, self.oov) for char in a_rec.seq.upper()]
            r[i, 0:len(l)] = np.array(l, dtype=np.int8)
        return r


    def encode_list_parallel(self, a_rec_list):
        '''
        This method is generated by ChatGPT according to HZG's detailed instructions
        '''
        # Function to divide the list into chunks
        def divide_chunks(lst, n):
            for i in range(0, len(lst), n):
                yield lst[i:i + n]

        # Determine the number of available cores
        num_cores = multiprocessing.cpu_count()

        # Divide the list into approximately equal chunks
        chunk_size = math.ceil(len(a_rec_list) / num_cores)
        chunks = list(divide_chunks(a_rec_list, chunk_size))

        # Use multiprocessing Pool
        pool = multiprocessing.Pool(processes=num_cores)

        # Map process_chunk to each chunk
        results = pool.map(self.encode_list, chunks)

        # Closse and wait
        pool.close()
        pool.join()

        # Combine the results
        combined_r = np.concatenate(results, axis=0)

        return combined_r