`pack_padded_sequence` takes a padded batch of sentences (here laid out batch-first) and records their words time step by time step, dropping the padding. The result is a variable-length `PackedSequence`, which is convenient for feeding an RNN and for calculating the loss function.

`pad_packed_sequence` does the reverse: it transforms the structure generated by `pack_padded_sequence` back into the original structure, which is a padded tensor of constant length.

The content of test.txt

``````As they sat in a nice coffee shop,
he was too nervous to say anything and she felt uncomfortable.
"Could you please give me some salt?I'd like to put it in my coffee."``````

See the following code for details

``````import torch
import torch.nn as nn
import numpy as np
import wordfreq

# First pass over the corpus: build the word -> id table and record how many
# tokens each line has.
vocab = {}
token_id = 1  # ids start at 1, so 0 never refers to a real word
lengths = []

with open('test.txt', 'r') as f:
    # Iterate the file handle directly; one sentence per line.
    for line in f:
        tokens = wordfreq.tokenize(line.strip(), 'en')
        lengths.append(len(tokens))
        # Add each unseen word to the vocab and assign it the next free index
        for word in tokens:
            if word not in vocab:
                vocab[word] = token_id
                token_id += 1

# Second pass over the corpus: turn every sentence into a row of word ids.
# The matrix is zero-initialised and has one row per line and one column per
# position of the longest line, so cells past a sentence's end stay 0.
x = np.zeros((len(lengths), max(lengths)))
l_no = 0
# Converting words to numbers
with open('test.txt', 'r') as f:
    for line in f:
        tokens = wordfreq.tokenize(line.strip(), 'en')
        # Fill the leading cells of this row with the ids of its tokens.
        x[l_no, :len(tokens)] = [vocab[word] for word in tokens]
        l_no += 1

# Convert the padded NumPy id matrix into a float tensor.
# No ``Variable`` wrapper: it is not imported in this script, and since
# PyTorch 0.4 plain tensors support autograd on their own.
x = torch.Tensor(x)
print(x)
'''
tensor([[ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  0.,  0.,  0.,  0.,  0.,  0.],
[ 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,  0.,  0.,  0.],
[20.,  9., 21., 22., 23.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
[24., 25., 26., 27., 28., 29., 30., 31., 32., 13., 33., 34.,  4.,  7.]])
'''
lengths = torch.Tensor(lengths)
print(lengths)  # tensor([ 8., 11.,  5., 14.])

# Sort the sentence lengths from longest to shortest and keep the permutation
# (pack_padded_sequence expects this order unless enforce_sorted=False).
sorted_lengths, idx_sort = torch.sort(lengths, dim=0, descending=True)
print(sorted_lengths)  # tensor([14., 11.,  8.,  5.])
print(idx_sort)  # tensor([3, 1, 0, 2])

# Plain Python ints in sorted order: [14, 11, 8, 5]
lengths = sorted_lengths.long().tolist()
# Reorder the rows of x with the same permutation
t = x.index_select(0, idx_sort)
print(t)
'''
tensor([[24., 25., 26., 27., 28., 29., 30., 31., 32., 13., 33., 34.,  4.,  7.],
[ 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,  0.,  0.,  0.],
[ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  0.,  0.,  0.,  0.,  0.,  0.],
[20.,  9., 21., 22., 23.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])
'''
# Pack the length-sorted, batch-first padded tensor. The lengths are coerced
# to plain ints so that both a list of ints and a list of 0-dim tensors work.
x_packed = nn.utils.rnn.pack_padded_sequence(
    t, [int(n) for n in lengths], batch_first=True
)
print(x_packed)
'''
PackedSequence(data=tensor([24.,  9.,  1., 20., 25., 10.,  2.,  9., 26., 11.,  3., 21., 27., 12.,
4., 22., 28., 13.,  5., 23., 29., 14.,  6., 30., 15.,  7., 31., 16.,
8., 32., 17., 13., 18., 33., 19., 34.,  4.,  7.]), batch_sizes=tensor([4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1]))
'''

# Undo the packing: returns a tuple (padded tensor, tensor of lengths).
x_padded = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=True)
print(x_padded)
'''
(tensor([[24., 25., 26., 27., 28., 29., 30., 31., 32., 13., 33., 34.,  4.,  7.],
[ 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,  0.,  0.,  0.],
[ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  0.,  0.,  0.,  0.,  0.,  0.],
[20.,  9., 21., 22., 23.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]]), tensor([14, 11,  8,  5]))
'''
# Sorting the permutation itself yields its inverse: indexing with
# idx_unsort puts rows reordered by idx_sort back in their original order.
_, idx_unsort = idx_sort.sort()