In [None]:
from collections import Counter
import torch
import torch.nn as nn
import torch.nn.functional as F

### I. Representing Text
In NLP, we are dealing with words and phrases which are discrete features. How do we represent them in a way that a neural network can easily process?

#### 1. Bag of Words
Bag-of-words is a conventional way to represent documents before deep learning. The idea is to represent each document as the count / frequency of each word in the vocabulary.

Search for TF-IDF and n-gram models if you want to know more.

In [None]:
# A toy one-sentence corpus; the final period is already separated by a space.
sentence = "the quick brown fox jumps over the lazy dog ."

# Whitespace splitting is enough here because the tokens are space-separated.
# Real applications usually need a more elaborate tokenizer to split raw text.
tokens = str.split(sentence)
print(tokens)

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']


In [None]:
# Vocabulary: every distinct token gets an integer id, namely its position in idx2token.
# NOTE: a set has no guaranteed order, so the ids printed here may change
# when the notebook is re-executed in a fresh interpreter.
idx2token = list(set(tokens))
token2idx = {token: index for index, token in enumerate(idx2token)}
vocab_size = len(idx2token)
print(token2idx)

{'fox': 0, 'quick': 1, 'the': 2, 'lazy': 3, '.': 4, 'dog': 5, 'jumps': 6, 'over': 7, 'brown': 8}


In [None]:
# Bag-of-words vector: entry i is the number of times idx2token[i] occurs in the sentence.
counter = Counter(tokens)
word_counts = [counter[word] for word in idx2token]
bow = torch.tensor(word_counts)
print(bow)

tensor([1, 1, 2, 1, 1, 1, 1, 1, 1])


The drawbacks of a BoW include:
- the representation of a document is huge with a large vocabulary
- each document is a single count vector which is limited in many applications
- words are treated as independent

#### 2. One Hot Encoding
Another approach closely related to BoW is one hot encoding. It represents each word as an indicator vector.

In [None]:
# Look up the vocabulary id of every token, then expand each id into an
# indicator row of length vocab_size (one row per token of the sentence).
token_ids = torch.tensor([token2idx[word] for word in tokens])
one_hot_rows = F.one_hot(token_ids, num_classes=vocab_size)
print(one_hot_rows)

tensor([[0, 0, 1, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 1],
 [1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 0]])


Similar to BoW, the drawbacks are:
- with a large vocabulary, the representation is huge.
- it treats all words as independent with no relations to each other.

#### 3. Dense Word Embeddings
A word-embedding layer keeps a lookup table that stores one dense vector for each word in the vocabulary. Each sentence is then represented as the sequence of its words' embeddings.

The embedding parameters are adjusted together with all other model parameters during training. Alternatively (less commonly), you can freeze them at pretrained values (e.g. word2vec).

In [None]:
embed_size = 8
# Lookup table holding one embed_size-dimensional vector per vocabulary entry.
embeds = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
# Indexing the table with the token ids gives (sequence_length, embedding_size).
embeddings = embeds(token_ids)
print(embeddings.shape)
print(embeddings)

torch.Size([10, 8])
tensor([[ 1.6840, -0.0304, -0.3668, 0.6942, -0.2271, 0.9454, 0.3811, -1.2579],
 [-0.7907, -0.6264, 0.4665, 1.9389, 0.8427, 0.8852, -1.2025, -0.0106],
 [ 1.9983, 0.2476, 0.7603, -1.0315, 0.4166, -0.5394, 1.6762, -1.2370],
 [-0.4871, 1.2124, 0.5749, 0.5276, 0.9060, 0.2522, -0.1403, 0.4153],
 [ 0.6686, -1.2796, 0.2280, 1.4104, 0.7402, -0.2454, 0.5503, 0.0655],
 [-0.4550, 0.1212, -0.0990, 0.9856, -0.4488, 0.0389, 0.2322, 0.0431],
 [ 1.6840, -0.0304, -0.3668, 0.6942, -0.2271, 0.9454, 0.3811, -1.2579],
 [ 0.9182, 0.9667, 0.3924, 0.6065, 1.6190, 0.2829, -2.7679, -0.8704],
 [ 0.1241, 0.2837, -0.9605, -1.1846, 0.1741, 0.9107, -1.0985, 2.3858],
 [ 1.6951, 0.6644, -0.3910, -0.7861, 0.3688, 0.4912, 0.2069, -0.2365]],
 grad_fn=)


#### 4. Representing Batches of Sentences
Sentences are not guaranteed to have the same number of words, so how do we represent them as a batch?

In [None]:
idx2token = ['', ''] + idx2token
token2idx = dict((t, i) for (i, t) in enumerate(idx2token))
print(token2idx)

{'': 0, '': 1, 'fox': 2, 'quick': 3, 'the': 4, 'lazy': 5, '.': 6, 'dog': 7, 'jumps': 8, 'over': 9, 'brown': 10}


In [None]:
sentences = [
 "the quick brown fox jumps over the lazy dog .",
 "the dog is lying beside the fox",
 ]

# For each word in the sentence, it has a part-of-speech (POS) category.
# For the meaning of the common POS label, we can refer to https://www.sketchengine.eu/penn-treebank-tagset/
pos_tags = [
 "DT ADJ ADJ NN VB IN DT ADJ NN .",
 "DT NN VB VB IN DT NN"
]
batch_tokens = [sent.split() for sent in sentences]
batch_labels = [tags.split() for tags in pos_tags]
print(batch_tokens)

[['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.'], ['the', 'dog', 'is', 'lying', 'beside', 'the', 'fox']]


In [None]:
token_ids_0 = [token2idx[token] for token in batch_tokens[0]]

In [None]:
# there might be words not present in the vocabulary
token_ids_1 = [token2idx[token] for token in batch_tokens[1]]

KeyError: ignored

In [None]:
# default unseen words to the token
token_ids_1 = [token2idx.get(token, token2idx['']) for token in batch_tokens[1]]
print(token_ids_1)

[4, 7, 1, 1, 1, 4, 2]


In [None]:
# Right-pad every id sequence with the id stored under token2idx[''] so that
# all rows are as long as the longest sequence in the batch.
id_sequences = [token_ids_0, token_ids_1]
max_length = max(map(len, id_sequences))
filler_id = token2idx['']
batch_ids = torch.tensor(
    [ids + [filler_id] * (max_length - len(ids)) for ids in id_sequences]
)
print(batch_ids)

tensor([[ 4, 3, 10, 2, 8, 9, 4, 5, 7, 6],
 [ 4, 7, 1, 1, 1, 4, 2, 0, 0, 0]])


In [None]:
embed_size = 8
# One embedding row per entry of the extended vocabulary (special tokens included).
embeds = nn.Embedding(num_embeddings=len(idx2token), embedding_dim=embed_size)
# Looking up a (batch_size, seq_length) id tensor adds a trailing embed_size dimension.
batch_embeds = embeds(batch_ids)
print(batch_embeds.shape)
print(batch_embeds)

torch.Size([2, 10, 8])
tensor([[[ 7.8184e-01, -1.3621e+00, 4.2569e-01, -1.8461e+00, 2.1676e+00,
 -9.1152e-01, -9.6761e-01, 2.3834e-01],
 [-6.3344e-01, -4.9714e-01, -4.8708e-01, 6.8581e-01, -1.7923e-01,
 8.5219e-02, 8.8637e-01, -4.0823e-01],
 [-9.2896e-01, -2.6551e-01, -1.7135e+00, -3.6573e-01, -1.0628e+00,
 -1.8727e-01, 1.6141e+00, 1.0059e+00],
 [ 9.2869e-01, -1.9563e+00, 5.5091e-01, -3.0735e-01, -1.6190e+00,
 1.5873e-01, 1.1222e+00, -1.1373e+00],
 [ 2.2582e+00, -2.5128e+00, 6.3548e-01, 1.7982e-01, -3.6781e-01,
 -6.6681e-01, 9.5394e-01, 1.4822e-01],
 [ 6.1321e-01, -1.0435e-01, -2.0886e+00, -1.2996e-01, -4.2351e-02,
 8.4045e-01, -1.5285e+00, 8.9550e-02],
 [ 7.8184e-01, -1.3621e+00, 4.2569e-01, -1.8461e+00, 2.1676e+00,
 -9.1152e-01, -9.6761e-01, 2.3834e-01],
 [ 1.4870e-01, 1.0553e+00, -1.8828e+00, -1.0740e+00, -4.0737e-01,
 -7.8651e-01, 1.4763e+00, 4.4058e-01],
 [-1.0930e+00, -9.9038e-01, -9.3767e-01, -6.1010e-01, -4.0999e-01,
 8.9367e-02, 1.3753e+00, 9.0367e-01],
 [ 1.5305e+00, 4.4224e-

In practice, there are two padding strategies:
- pad or truncate all sentences to the same length
- dynamically pad to the maximum length within a batch

The first strategy is easier to implement and suitable when the variation of lengths is small within the data set.

The second is more efficient and can make training faster if the sentence lengths vary a lot. This strategy can be further optimized by constructing batches from sentences of similar length.

### II. LSTM / Transformer
Given the embeddings, we can feed them into an LSTM or transformer model.

For an LSTM created with `batch_first=True` (as below), the input is a tensor of size (batch_size, seq_length, input_dim), optionally accompanied by an initial hidden state and cell state. If these are omitted, both default to zeros.

doc: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html


In [None]:
hidden_size = 16
# batch_first=True: tensors are laid out as (batch_size, seq_length, feature)
lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, batch_first=True)
# No initial states are passed, so the defaults are used.
output, (hn, cn) = lstm(batch_embeds)

In [None]:
# `output` contains the last layer's output at every time step:
# (batch_size, seq_length, hidden_size)
output.size()

torch.Size([2, 10, 16])

In [None]:
# A bidirectional LSTM only needs the extra flag bidirectional=True.
bilstm = nn.LSTM(
    input_size=embed_size,
    hidden_size=hidden_size,
    batch_first=True,
    bidirectional=True,
)
bioutput, (hn, cn) = bilstm(batch_embeds)

# At each time step the forward and reverse hidden states are concatenated along the last dim:
# (batch_size, seq_length, 2 * hidden_size)
bioutput.size()

torch.Size([2, 10, 32])

In [None]:
# Sequence tagging: make one prediction per token.
# Suppose the labeling task has 4 categories.
num_classes = 4
linear = nn.Linear(in_features=hidden_size, out_features=num_classes)

# The linear layer is applied to the last dim of the LSTM output:
# (batch_size, seq_length, num_classes)
logits = linear(output)
print(logits.shape)

torch.Size([2, 10, 4])


In [None]:
# Sequence classification: collapse the sequence dimension first, then classify once per sentence.
# NOTE(review): this plain mean runs over every time step, padded positions included.
sentence_repr = output.mean(dim=1)
# (batch_size, num_classes)
logits = linear(sentence_repr)
print(logits.shape)

torch.Size([2, 4])


For transformer, PyTorch provides several modules:

* nn.Transformer: encoder + decoder
* nn.TransformerEncoder: a stack of N encoder layers
* nn.TransformerDecoder: a stack of N decoder layers
* nn.TransformerEncoderLayer: self-attn and feedforward network.
* nn.TransformerDecoderLayer: self-attn, multi-head-attn (for encoder-decoder) and feedforward network.

doc: https://pytorch.org/docs/stable/nn.html#transformer-layers

In [None]:
num_heads = 4
hidden_size = 16  # passed below as the feed-forward width of each encoder layer
dropout = 0.1
# A single encoder layer (self-attention + feed-forward) operating on embed_size features.
layers = nn.TransformerEncoderLayer(
    d_model=embed_size,
    nhead=num_heads,
    dim_feedforward=hidden_size,
    dropout=dropout,
    batch_first=True,
)
num_layers = 2
# Stack num_layers copies of that layer.
transformer = nn.TransformerEncoder(layers, num_layers=num_layers)

In [None]:
# Put the encoder in eval mode so dropout is inactive and the forward pass is deterministic.
transformer.eval()
# The encoder keeps the input layout: (batch_size, seq_length, embed_size)
output = transformer(src=batch_embeds)
print(output.shape)

torch.Size([2, 10, 8])


For Transformer models, if the input includes padded tokens, we need to mask those parts out to get an accurate output that only depends on the unpadded tokens.

In [None]:
# Boolean (batch_size, seq_length) mask: True where the id equals token2idx[''],
# i.e. the filler value that was appended when the batch was padded.
padding_mask = batch_ids.eq(token2idx[''])
print(padding_mask)

tensor([[False, False, False, False, False, False, False, False, False, False],
 [False, False, False, False, False, False, False, True, True, True]])


In [None]:
# Same forward pass as before, but now the encoder is told which positions are padding.
output_masked = transformer(batch_embeds, src_key_padding_mask=padding_mask)
print(output_masked.shape)

torch.Size([2, 10, 8])


In [None]:
# Compare element-wise with a floating-point tolerance instead of `==`:
# exact equality on float tensors can fail on harmless rounding differences
# between the two forward passes.
is_equal = torch.isclose(output, output_masked)
# no padding for the first example, so the mask should not change its output
print(is_equal[0].all())
# with padding for the second example
# providing the padding mask leads to completely different output
print(is_equal[1].any())

tensor(True)
tensor(False)


Note the difference between `mask` and `src_key_padding_mask` when using `nn.TransformerEncoder`.

`src_key_padding_mask` is used as illustrated above. The mask is of shape `(batch_size, sequence_length)` indicating whether the input token is padded.

`mask` is completely different. It is used to prevent looking into future tokens in sequence generation tasks. The shape is `(seq_length, seq_length)`.

If we are dealing with a sequence tagging task, we also need to pad the labels.

In [None]:
# Tag vocabulary; the empty string at index 0 is the label used for padded positions.
idx2tag = ["", "DT", "NN", "ADJ", "VB", "IN", "."]
tag2idx = {tag: index for index, tag in enumerate(idx2tag)}

In [None]:
# Convert tags to ids and right-pad each label sequence with the "" tag id
# up to the length of the longest sequence in the batch.
max_length = max(map(len, batch_labels))
pad_tag_id = tag2idx[""]
batch_targets = torch.tensor(
    [
        [tag2idx[tag] for tag in labels] + [pad_tag_id] * (max_length - len(labels))
        for labels in batch_labels
    ]
)
print(batch_targets)

tensor([[1, 3, 3, 2, 4, 5, 1, 3, 2, 6],
 [1, 2, 4, 4, 5, 1, 2, 0, 0, 0]])


In [None]:
# One logit per entry of idx2tag (the padding tag included);
# the input width is embed_size, the feature size of the transformer output.
num_classes = len(idx2tag)
linear = nn.Linear(in_features=embed_size, out_features=num_classes)

In [None]:
# Per-token tag scores: (batch_size, seq_length, num_classes)
logits = linear(output_masked)
print(logits.size())

torch.Size([2, 10, 7])


In [None]:
# Flatten the batch and sequence dims so that every token is one classification example.
# Every position contributes to this loss, the padded ones included.
loss = F.cross_entropy(
    input=logits.view(-1, num_classes),
    target=batch_targets.view(-1),
)
print(loss)

tensor(1.9610, grad_fn=)


In [None]:
# Targets equal to the padding tag id are ignored by the loss.
# We do not care what is predicted on padded tokens because those predictions are discarded anyway.
flat_logits = logits.view(-1, num_classes)
flat_targets = batch_targets.view(-1)
loss_ignore_padding = F.cross_entropy(flat_logits, flat_targets, ignore_index=tag2idx[""])
print(loss_ignore_padding)

tensor(1.8112, grad_fn=)


In [None]:
# We can achieve the same thing by keeping only the positions that padding_mask does not flag.
active_mask = ~padding_mask
loss_ignore_padding_v2 = F.cross_entropy(logits[active_mask], batch_targets[active_mask])
print(loss_ignore_padding_v2)

tensor(1.8112, grad_fn=)


### Other resources

Generating Names with a Character-Level RNN: https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html

Translation with a Sequence to Sequence Network and Attention: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

LSTM (character + word) POS-tag model PyTorch: https://www.kaggle.com/code/krishanudb/lstm-character-word-pos-tag-model-pytorch

PyTorch POS Tagging: https://github.com/bentrevett/pytorch-pos-tagging