from .imports import *
from .torch_imports import *
from .core import *
from .model import *
from .dataset import *
from .learner import *
from .text import *
from .lm_rnn import *

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from torchtext.datasets import language_modeling

class DotProdNB(nn.Module):
    "Naive Bayes classifier with learned per-feature weights, computed as a dot product of embeddings."
    def __init__(self, nf, ny, w_adj=0.4, r_adj=10):
        super().__init__()
        self.w_adj,self.r_adj = w_adj,r_adj
        self.w = nn.Embedding(nf+1, 1, padding_idx=0)
        self.w.weight.data.uniform_(-0.1,0.1)
        self.r = nn.Embedding(nf+1, ny)

    def forward(self, feat_idx, feat_cnt, sz):
        w = self.w(feat_idx)
        r = self.r(feat_idx)
        x = ((w+self.w_adj)*r/self.r_adj).sum(1)
        return F.softmax(x, dim=-1)

class SimpleNB(nn.Module):
    "Plain naive Bayes classifier: sums fixed log-count ratios plus a learned bias."
    def __init__(self, nf, ny):
        super().__init__()
        self.r = nn.Embedding(nf+1, ny, padding_idx=0)
        self.b = nn.Parameter(torch.zeros(ny,))

    def forward(self, feat_idx, feat_cnt, sz):
        r = self.r(feat_idx)
        x = r.sum(1)+self.b
        return F.softmax(x, dim=-1)
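
# Both models above share the same forward signature so they can consume
# BOW_Dataset batches: `feat_idx` is a (batch, max_len) tensor of 1-based
# feature ids (0 is padding), `feat_cnt` the matching counts, and `sz` the
# true number of entries per row. `feat_cnt` and `sz` are unused by these two.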

class BOW_Learner(Learner):
    def __init__(self, data, models, **kwargs):
        super().__init__(data, models, **kwargs)
    def _get_crit(self, data): return F.l1_loss

def calc_pr(y_i, x, y, b):
    "Smoothed feature probabilities over the documents where (y==y_i)==b."
    idx = np.argwhere((y==y_i)==b)
    ct = x[idx[:,0]].sum(0)+1
    tot = ((y==y_i)==b).sum()+1
    return ct/tot

def calc_r(y_i, x, y):
    "Naive Bayes log-count ratio for class y_i."
    return np.log(calc_pr(y_i, x, y, True) / calc_pr(y_i, x, y, False))
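
# A minimal sketch of what calc_r computes (a hypothetical dense example; in
# this module x is a sparse scipy matrix, which is why calc_r(...).A1 is used
# below to flatten the result):
#
#   x = np.array([[1,0],[0,1],[1,1]])  # binarized term-document matrix
#   y = np.array([0,1,1])              # class labels
#   r1 = calc_r(1, x, y)               # log-count ratios for class 1
#
# r1[j] > 0 means feature j is relatively more frequent in class-1 documents.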

class BOW_Dataset(Dataset):
    "Bag-of-words dataset over a sparse term-document matrix `bow` and labels `y`."
    def __init__(self, bow, y, max_len):
        self.bow,self.max_len = bow,max_len
        self.c = int(y.max())+1
        self.n,self.vocab_size = bow.shape
        self.y = one_hot(y,self.c).astype(np.float32)
        x = self.bow.sign()
        # Per-class naive Bayes log-count ratios; .A1 flattens the sparse matrix result
        self.r = np.stack([calc_r(i, x, y).A1 for i in range(self.c)]).T

    def __getitem__(self, i):
        row = self.bow.getrow(i)
        num_row_entries = row.indices.shape[0]
        indices = (row.indices + 1).astype(np.int64)  # shift by 1 so 0 can be padding
        data = (row.data).astype(np.int64)
        if num_row_entries < self.max_len:
            # If short, pad
            indices = np.pad(indices, (self.max_len - num_row_entries, 0), mode='constant')
            data = np.pad(data, (self.max_len - num_row_entries, 0), mode='constant')
        else:
            # If long, truncate
            indices, data = indices[-self.max_len:], data[-self.max_len:]
        return indices, data, min(self.max_len, num_row_entries), self.y[i]

    def __len__(self): return len(self.bow.indptr)-1

class TextClassifierData(ModelData):
    @property
    def c(self): return self.trn_ds.c

    @property
    def r(self):
        # Prepend a zero row so that index 0 stays a no-op padding entry
        return torch.Tensor(np.concatenate([np.zeros((1,self.c)), self.trn_ds.r]))

    def get_model(self, f, **kwargs):
        m = to_gpu(f(self.trn_ds.vocab_size, self.c, **kwargs))
        m.r.weight.data = to_gpu(self.r)
        m.r.weight.requires_grad = False
        model = BasicModel(m)
        return BOW_Learner(self, model, metrics=[accuracy_thresh(0.5)], opt_fn=optim.Adam)

    def dotprod_nb_learner(self, **kwargs): return self.get_model(DotProdNB, **kwargs)
    def nb_learner(self, **kwargs): return self.get_model(SimpleNB, **kwargs)

    @classmethod
    def from_bow(cls, trn_bow, trn_y, val_bow, val_y, sl):
        trn_ds = BOW_Dataset(trn_bow, trn_y, sl)
        val_ds = BOW_Dataset(val_bow, val_y, sl)
        trn_dl = DataLoader(trn_ds, 64, True)
        val_dl = DataLoader(val_ds, 64, False)
        return cls('.', trn_dl, val_dl)
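
# Hedged usage sketch (the tokenizer and variable names here are illustrative,
# not part of this module):
#
#   veczr = CountVectorizer(ngram_range=(1,3), tokenizer=my_tokenizer)
#   trn_bow = veczr.fit_transform(trn_texts)  # sparse term-document counts
#   val_bow = veczr.transform(val_texts)
#   md = TextClassifierData.from_bow(trn_bow, trn_y, val_bow, val_y, sl=2000)
#   learner = md.dotprod_nb_learner()
#   learner.fit(0.02, 1)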

def flip_tensor(x, dim):
    "Reverse tensor `x` along dimension `dim` (used for backwards language models)."
    xsize = x.size()
    dim = x.dim() + dim if dim < 0 else dim
    x = x.view(-1, *xsize[dim:])
    idx = torch.arange(x.size(1)-1, -1, -1)
    if x.is_cuda: idx = idx.cuda()
    x = x.view(x.size(0), x.size(1), -1)[:, idx.long(), :]
    return x.view(xsize)
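
# For example, flipping a 2x3 tensor along dim 0 reverses its rows:
#   flip_tensor(torch.arange(6).view(2,3), 0)
#   # [[0,1,2],[3,4,5]] -> [[3,4,5],[0,1,2]]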

class LanguageModelLoader():
    """Iterates over a torchtext dataset as (input, target) language-model batches,
    with a randomized sequence length around `bptt` at each step."""
    def __init__(self, ds, bs, bptt, backwards=False):
        self.bs,self.bptt,self.backwards = bs,bptt,backwards
        text = sum([o.text for o in ds], [])
        fld = ds.fields['text']
        nums = fld.numericalize([text], device=None if torch.cuda.is_available() else -1)
        self.data = self.batchify(nums)
        self.i,self.iter = 0,0
        self.n = len(self.data)

    def __iter__(self):
        self.i,self.iter = 0,0
        return self

    def __len__(self): return self.n // self.bptt - 1

    def __next__(self):
        if self.i >= self.n-1 or self.iter>=len(self): raise StopIteration
        # Occasionally halve bptt, then jitter the sequence length around it
        bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        res = self.get_batch(self.i, seq_len)
        self.i += seq_len
        self.iter += 1
        return res

    def batchify(self, data):
        nb = data.size(0) // self.bs
        data = data[:nb*self.bs]
        data = data.view(self.bs, -1).t().contiguous()
        if self.backwards: data = flip_tensor(data, 0)
        return to_gpu(data)

    def get_batch(self, i, seq_len):
        source = self.data
        seq_len = min(seq_len, len(source) - 1 - i)
        return source[i:i+seq_len], source[i+1:i+1+seq_len].view(-1)
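
# Each batch from LanguageModelLoader is a pair (x, y): x has shape
# (seq_len, bs), and y is the flattened next-token targets of length
# seq_len*bs, which is what F.cross_entropy expects from the LM decoder.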

class RNN_Learner(Learner):
    def __init__(self, data, models, **kwargs):
        super().__init__(data, models, **kwargs)
    def _get_crit(self, data): return F.cross_entropy
    # Save/load only the encoder (model[0]), e.g. to reuse LM weights in a classifier
    def save_encoder(self, name): save_model(self.model[0], self.get_model_path(name))
    def load_encoder(self, name): load_model(self.model[0], self.get_model_path(name))

class ConcatTextDataset(torchtext.data.Dataset):
    def __init__(self, path, text_field, newline_eos=True, encoding='utf-8', **kwargs):
        fields = [('text', text_field)]
        text = []
        if os.path.isdir(path): paths = glob(f'{path}/*.*')
        else: paths = [path]
        for p in paths:
            for line in open(p, encoding=encoding): text += text_field.preprocess(line)
            if newline_eos: text.append('<eos>')
        examples = [torchtext.data.Example.fromlist([text], fields)]
        super().__init__(examples, fields, **kwargs)

class ConcatTextDatasetFromDataFrames(torchtext.data.Dataset):
    def __init__(self, df, text_field, col, newline_eos=True, **kwargs):
        fields = [('text', text_field)]
        text = []
        text += text_field.preprocess(df[col].str.cat(sep=' <eos> '))
        if newline_eos: text.append('<eos>')
        examples = [torchtext.data.Example.fromlist([text], fields)]
        super().__init__(examples, fields, **kwargs)

    @classmethod
    def splits(cls, train_df=None, val_df=None, test_df=None, keep_nones=False, **kwargs):
        res = (
            cls(train_df, **kwargs),
            cls(val_df, **kwargs),
            map_none(test_df, partial(cls, **kwargs)))  # test set is optional
        return res if keep_nones else tuple(d for d in res if d is not None)

class LanguageModelData():
    """
    This class provides the entry point for dealing with supported NLP tasks.

    Usage:
    1. Use one of the factory constructors (from_dataframes, from_text_files) to
       obtain an instance of the class.
    2. Use the get_model method to return an RNN_Learner instance (a network suited
       for NLP tasks), then proceed with training.

    Example:
        >> TEXT = data.Field(lower=True, tokenize=spacy_tok)
        >> FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
        >> md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=64, bptt=70, min_freq=10)

        >> em_sz = 200  # size of each embedding vector
        >> nh = 500     # number of hidden activations per layer
        >> nl = 3       # number of layers

        >> opt_fn = partial(optim.Adam, betas=(0.7, 0.99))
        >> learner = md.get_model(opt_fn, em_sz, nh, nl,
                                  dropouti=0.05, dropout=0.05, wdrop=0.1, dropoute=0.02, dropouth=0.05)
        >> learner.reg_fn = seq2seq_reg
        >> learner.clip = 0.3
        >> learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)
    """
    def __init__(self, path, field, trn_ds, val_ds, test_ds, bs, bptt, backwards=False, **kwargs):
        """ Constructor for the class. An important thing that happens here is
        that the field's "build_vocab" method is invoked, which builds the vocabulary
        for this NLP model.

        Also, three instances of LanguageModelLoader are constructed; one each
        for the training data (self.trn_dl), the validation data (self.val_dl), and
        the test data (self.test_dl).

        Args:
            path (str): path where model data will be saved
            field (Field): torchtext field object
            trn_ds (Dataset): training dataset
            val_ds (Dataset): validation dataset
            test_ds (Dataset): test dataset
            bs (int): batch size
            bptt (int): back-propagation-through-time sequence length
            backwards (bool): if True, feed the text in reverse order
            kwargs: other arguments
        """
        self.bs = bs
        self.path = path
        self.trn_ds,self.val_ds,self.test_ds = trn_ds,val_ds,test_ds
        if not hasattr(field, 'vocab'): field.build_vocab(self.trn_ds, **kwargs)
        self.pad_idx = field.vocab.stoi[field.pad_token]
        self.nt = len(field.vocab)
        factory = lambda ds: LanguageModelLoader(ds, bs, bptt, backwards=backwards)
        self.trn_dl = factory(self.trn_ds)
        self.val_dl = factory(self.val_ds)
        self.test_dl = map_none(self.test_ds, factory)  # test set is optional

    def get_model(self, opt_fn, emb_sz, n_hid, n_layers, **kwargs):
        """ Returns an RNN_Learner object that wraps an instance of the RNN_Encoder module.

        Args:
            opt_fn (Optimizer): the torch optimizer function to use
            emb_sz (int): embedding size
            n_hid (int): number of hidden activations per layer
            n_layers (int): number of hidden layers
            kwargs: other arguments

        Returns:
            An instance of the RNN_Learner class.
        """
        m = get_language_model(self.nt, emb_sz, n_hid, n_layers, self.pad_idx, **kwargs)
        model = SingleModel(to_gpu(m))
        return RNN_Learner(self, model, opt_fn=opt_fn)

    @classmethod
    def from_dataframes(cls, path, field, col, train_df, val_df, test_df=None, bs=64, bptt=70, **kwargs):
        trn_ds, val_ds, test_ds = ConcatTextDatasetFromDataFrames.splits(
            text_field=field, col=col, train_df=train_df, val_df=val_df, test_df=test_df, keep_nones=True)
        return cls(path, field, trn_ds, val_ds, test_ds, bs, bptt, **kwargs)
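
    # Hedged usage sketch (the dataframes and the 'text' column are illustrative):
    #
    #   TEXT = data.Field(lower=True, tokenize=spacy_tok)
    #   md = LanguageModelData.from_dataframes('.', TEXT, 'text', train_df, val_df,
    #                                          bs=64, bptt=70, min_freq=10)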

    @classmethod
    def from_text_files(cls, path, field, train, validation, test=None, bs=64, bptt=70, **kwargs):
        """ Method used to instantiate a LanguageModelData object that can be used for a
        supported NLP task.

        Args:
            path (str): the absolute path in which temporary model data will be saved
            field (Field): torchtext field
            train (str): file location of the training data
            validation (str): file location of the validation data
            test (str): file location of the test data
            bs (int): batch size to use
            bptt (int): back-propagation-through-time hyper-parameter
            kwargs: other arguments

        Returns:
            a LanguageModelData instance, which most importantly provides the datasets
            for training, validation, and testing

        Note:
            The train, validation, and test paths can point to any file (or folder)
            that contains a valid text corpus.
        """
        trn_ds, val_ds, test_ds = ConcatTextDataset.splits(
            path, text_field=field, train=train, validation=validation, test=test)
        return cls(path, field, trn_ds, val_ds, test_ds, bs, bptt, **kwargs)

class TextDataLoader():
    "Wraps a torchtext iterator so that each batch is a plain (x, y) tensor pair."
    def __init__(self, src, x_fld, y_fld):
        self.src,self.x_fld,self.y_fld = src,x_fld,y_fld

    def __len__(self): return len(self.src)

    def __iter__(self):
        it = iter(self.src)
        for i in range(len(self)):
            b = next(it)
            yield getattr(b, self.x_fld).data, getattr(b, self.y_fld).data

class TextModel(BasicModel):
    def get_layer_groups(self):
        m = self.model[0]
        return [(m.encoder, m.dropouti), *zip(m.rnns, m.dropouths), self.model[1]]

class TextData(ModelData):
    def create_td(self, it): return TextDataLoader(it, self.text_fld, self.label_fld)

    @classmethod
    def from_splits(cls, path, splits, bs, text_name='text', label_name='label'):
        text_fld = splits[0].fields[text_name]
        label_fld = splits[0].fields[label_name]
        if hasattr(label_fld, 'build_vocab'): label_fld.build_vocab(splits[0])
        iters = torchtext.data.BucketIterator.splits(splits, batch_size=bs)
        trn_iter,val_iter,test_iter = iters[0],iters[1],None
        test_dl = None
        if len(iters) == 3:
            test_iter = iters[2]
            test_dl = TextDataLoader(test_iter, text_name, label_name)
        trn_dl = TextDataLoader(trn_iter, text_name, label_name)
        val_dl = TextDataLoader(val_iter, text_name, label_name)
        obj = cls.from_dls(path, trn_dl, val_dl, test_dl)
        obj.bs = bs
        obj.pad_idx = text_fld.vocab.stoi[text_fld.pad_token]
        obj.nt = len(text_fld.vocab)
        obj.c = (len(label_fld.vocab) if hasattr(label_fld, 'vocab')
                 else len(getattr(splits[0][0], label_name)))
        return obj

    def to_model(self, m, opt_fn):
        model = TextModel(to_gpu(m))
        return RNN_Learner(self, model, opt_fn=opt_fn)

    def get_model(self, opt_fn, max_sl, bptt, emb_sz, n_hid, n_layers, dropout, **kwargs):
        m = get_rnn_classifier(bptt, max_sl, self.c, self.nt,
                               layers=[emb_sz*3, self.c], drops=[dropout],
                               emb_sz=emb_sz, n_hid=n_hid, n_layers=n_layers,
                               pad_token=self.pad_idx, **kwargs)
        return self.to_model(m, opt_fn)
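
# Hedged end-to-end sketch for the classifier path (the IMDB splits and field
# setup are illustrative and assume TEXT already has a vocab, e.g. one built
# for a language model):
#
#   TEXT = data.Field(lower=True, tokenize=spacy_tok)
#   LABEL = data.Field(sequential=False)
#   splits = torchtext.datasets.IMDB.splits(TEXT, LABEL, 'data/')
#   md = TextData.from_splits(PATH, splits, bs=64)
#   learner = md.get_model(opt_fn, max_sl=1500, bptt=70, emb_sz=200,
#                          n_hid=500, n_layers=3, dropout=0.1)
#   learner.load_encoder('lm_encoder')  # optionally reuse pretrained LM weights
#   learner.fit(3e-3, 1)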