-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_pyloader_py.py
More file actions
151 lines (134 loc) · 5.53 KB
/
test_pyloader_py.py
File metadata and controls
151 lines (134 loc) · 5.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# -*- coding: utf-8 -*-
from pyloader import DataReader
from pyloader import Dataset
from pyloader import DataCollator
from pyloader import DataLoader
import json
import torch
# Step 1: Inherit the class `DataReader` and implement the method `read_file()`
class JsonLineDataReader(DataReader):
    """Data reader for files in JSON Lines format (one JSON object per line).

    Only the ``"post"`` and ``"response"`` fields of each object are kept.
    """

    def __init__(self, batch_size=4, shuffle=False, random_seed=42):
        super().__init__(
            batch_size,
            shuffle=shuffle,
            random_seed=random_seed,
        )

    def read_file(self, path):
        """
        Read one JSON Lines file from disk.

        Blank (whitespace-only) lines are skipped and not counted, so a
        trailing empty line at the end of the file does not abort loading.

        Args:
            path (string): the file path.

        Returns:
            dict_samples (Dict[str, List]): values collected per kept field
                ("post" / "response"), in file order.
            n_sample (int): the number of JSON objects parsed.

        Raises:
            json.JSONDecodeError: if a non-blank line is not valid JSON.
        """
        print("Reading data from [{}]".format(path))
        kept_fields = ("post", "response")
        dict_samples = {}
        n_sample = 0
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                # json.loads("") / json.loads("\n") raises, so skip empties.
                if not line.strip():
                    continue
                json_sample = json.loads(line)
                n_sample += 1
                for k, v in json_sample.items():
                    if k in kept_fields:
                        dict_samples.setdefault(k, []).append(v)
        print("Total data: {}".format(n_sample))
        return dict_samples, n_sample
# Step 2: Inherit the class `Dataset` and implement the method `batch_encode()`
class TextDataset(Dataset):
    """Dataset that turns whitespace-tokenised text into padded id lists."""

    # NOTE(review): Dataset.__init__ is not invoked here -- confirm the base
    # class does not need initialisation.
    def __init__(self, vocab_file=None, max_seq_len=100, max_vocab_size=30000):
        self.max_seq_len = max_seq_len
        self.max_vocab_size = max_vocab_size
        self.stoi = {}  # token -> id
        self.itos = {}  # id -> token
        if vocab_file is not None:
            self.load_vocab(vocab_file)

    def load_vocab(self, vocab_file):
        """Fill ``stoi`` / ``itos`` from a vocabulary file.

        The token is the first tab-separated field of each line and its id is
        the zero-based line index. Lines at index ``max_vocab_size`` or beyond
        are ignored.
        """
        with open(vocab_file, 'r', encoding='utf-8') as vocab_fp:
            for line_no, raw_line in enumerate(vocab_fp):
                if line_no >= self.max_vocab_size:
                    continue
                token = raw_line.strip().split('\t')[0]
                self.stoi[token] = line_no
                self.itos[line_no] = token
        assert len(self.stoi) == len(self.itos)
        assert len(self.stoi) <= self.max_vocab_size

    def batch_encode(self, batch_data):
        """Encode the "post" and "response" texts of a batch as id lists.

        Each text is split on whitespace. With a loaded vocabulary, tokens are
        looked up in ``stoi`` (unknown tokens become 0); without one, every
        token is parsed with ``int``. Sequences are cut or right-padded with 0
        to ``max_seq_len``, and a parallel 1/0 mask marks real positions.
        Other keys of ``batch_data`` are ignored.
        """
        # One (ids, masks) pair of output lists per recognised input field.
        encoded = {"post": ([], []), "response": ([], [])}
        limit = self.max_seq_len

        for field, texts in batch_data.items():
            if field not in encoded:
                continue
            ids_out, masks_out = encoded[field]
            for text in texts:
                tokens = text.split()
                if self.stoi:
                    token_ids = [self.stoi.get(tok, 0) for tok in tokens]
                else:
                    token_ids = [int(tok) for tok in tokens]
                kept = token_ids[:limit]
                # A negative count yields an empty pad list.
                padding = [0] * (limit - len(kept))
                ids_out.append(kept + padding)
                masks_out.append([1] * len(kept) + padding)

        post_ids, post_mask = encoded["post"]
        resp_ids, resp_mask = encoded["response"]
        return {
            "post_ids": post_ids,
            "post_mask": post_mask,
            "resp_ids": resp_ids,
            "resp_maks": resp_mask,
        }
# Step 3: Inherit the class `DataCollator` and implement the method `collate_batch()`
class PyDataCollator(DataCollator):
    """Simple PyTorch-based data collator."""

    def collate_batch(self, features):
        """Convert a dict of per-feature lists into a dict of tensors.

        Args:
            features (Dict[str, List]): feature name -> list of values for
                the batch. Entries whose value is None are skipped.

        Returns:
            Dict[str, torch.Tensor]: the "label" entry (if present) becomes a
            long tensor when its first element is a plain int and a float
            tensor otherwise; every other entry becomes a long tensor.
        """
        batch = {}

        # Special handling for labels: pick the dtype from the first element.
        if "label" in features and features["label"] is not None:
            label_values = list(features["label"])
            # Exact type check on purpose: bool labels keep going to float.
            if type(label_values[0]) is int:
                label_dtype = torch.long
            else:
                label_dtype = torch.float
            batch["label"] = torch.tensor(label_values, dtype=label_dtype)

        # Handling of all other possible attributes.
        for k, v in features.items():
            # Compare by equality: `k not in ("label")` was a substring test
            # against the string "label" and dropped keys such as "l" or "lab".
            if k != "label" and v is not None:
                batch[k] = torch.tensor(list(v), dtype=torch.long)
        return batch
def main_test():
    """Build the reader/dataset/collator pipeline and print every batch."""
    data_dir = "data/chunk_files/"
    vocab_file = "data/vocab.txt"

    reader = JsonLineDataReader(batch_size=16, shuffle=True)
    text_dataset = TextDataset(vocab_file, max_seq_len=60, max_vocab_size=30000)
    collator = PyDataCollator()

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # define a DataLoader
    train_loader = DataLoader(
        reader, text_dataset, collator, data_dir,
        num_epoch=5, max_queue_size=10, shuffle=True
    )

    # iteratively load data for training
    for batch_step, inputs in enumerate(train_loader):
        for name, tensor in inputs.items():
            moved = tensor.to(device)
            # Re-assigning an existing key is safe while iterating the dict.
            inputs[name] = moved
            # show batch data
            print("step: {}\t{}:\n{}".format(batch_step, name, moved))

        # define your code
        # ...


# Run the demo only when executed as a script, not on import.
if __name__ == "__main__":
    main_test()