(prototype) FX Graph Mode Post Training Dynamic Quantization
Author: Jerry Zhang
This tutorial introduces the steps to do post training dynamic quantization in graph mode based on torch.fx. We have a separate tutorial for FX Graph Mode Post Training Static Quantization, and a comparison between FX graph mode quantization and eager mode quantization can be found in the quantization docs.
tl;dr: The FX graph mode API for dynamic quantization looks like the following:
import torch
from torch.ao.quantization import default_dynamic_qconfig, QConfigMapping
# Note that this is temporary, we'll expose these functions to torch.ao.quantization after official release
from torch.quantization.quantize_fx import prepare_fx, convert_fx
float_model.eval()
# use the dynamic qconfig imported above: weights are quantized ahead of time,
# activations are quantized dynamically at runtime
qconfig = default_dynamic_qconfig
qconfig_mapping = QConfigMapping().set_global(qconfig)
prepared_model = prepare_fx(float_model, qconfig_mapping, example_inputs) # fuse modules and insert observers
# no calibration is required for dynamic quantization
quantized_model = convert_fx(prepared_model) # convert the model to a dynamically quantized model
In this tutorial, we'll apply dynamic quantization to an LSTM-based next word prediction model, closely following the word language model from the PyTorch examples. We will copy the code from Dynamic Quantization on an LSTM Word Language Model and omit the descriptions.
1. Define the Model, Download Data and Model
Download the data and unzip it into the data folder:
mkdir data
cd data
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip
unzip wikitext-2-v1.zip
Download the model to the data folder:
wget https://s3.amazonaws.com/pytorch-tutorial-assets/word_language_model_quantize.pth
Define the model:
# imports
import os
from io import open
import time
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
# Model Definition
class LSTMModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(LSTMModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)
        self.init_weights()
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output)
        return decoded, hidden
def init_hidden(lstm_model, bsz):
    # get the weight tensor and create hidden layer in the same device
    weight = lstm_model.encoder.weight
    # get weight from quantized model
    if not isinstance(weight, torch.Tensor):
        weight = weight()
    device = weight.device
    nlayers = lstm_model.rnn.num_layers
    nhid = lstm_model.rnn.hidden_size
    return (torch.zeros(nlayers, bsz, nhid, device=device),
            torch.zeros(nlayers, bsz, nhid, device=device))
# Load Text Data
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)
class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'wiki.train.tokens'))
        self.valid = self.tokenize(os.path.join(path, 'wiki.valid.tokens'))
        self.test = self.tokenize(os.path.join(path, 'wiki.test.tokens'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r', encoding="utf8") as f:
            idss = []
            for line in f:
                words = line.split() + ['<eos>']
                ids = []
                for word in words:
                    ids.append(self.dictionary.word2idx[word])
                idss.append(torch.tensor(ids).type(torch.int64))
            ids = torch.cat(idss)

        return ids
model_data_filepath = 'data/'
corpus = Corpus(model_data_filepath + 'wikitext-2')
ntokens = len(corpus.dictionary)
# Load Pretrained Model
model = LSTMModel(
    ntoken = ntokens,
    ninp = 512,
    nhid = 256,
    nlayers = 5,
)

model.load_state_dict(
    torch.load(
        model_data_filepath + 'word_language_model_quantize.pth',
        map_location=torch.device('cpu'),
        weights_only=True
    )
)
model.eval()
print(model)
bptt = 25
criterion = nn.CrossEntropyLoss()
eval_batch_size = 1
# create test data set
def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    return data.view(bsz, -1).t().contiguous()
test_data = batchify(corpus.test, eval_batch_size)
example_inputs = (next(iter(test_data))[0])
# Evaluation functions
def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

def evaluate(model_, data_source):
    # Turn on evaluation mode which disables dropout.
    model_.eval()
    total_loss = 0.
    hidden = init_hidden(model_, eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model_(data, hidden)
            hidden = repackage_hidden(hidden)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)
2. Post Training Dynamic Quantization
Now we can dynamically quantize the model. We can use the same functions as in post training static quantization, but with a dynamic qconfig.
from torch.quantization.quantize_fx import prepare_fx, convert_fx
from torch.ao.quantization import default_dynamic_qconfig, float_qparams_weight_only_qconfig, QConfigMapping
# Full docs for supported qconfig for floating point modules/ops can be found in `quantization docs <https://pytorch.ac.cn/docs/stable/quantization.html#module-torch.quantization>`_
# Full docs for `QConfigMapping <https://pytorch.ac.cn/docs/stable/generated/torch.ao.quantization.qconfig_mapping.QConfigMapping.html#torch.ao.quantization.qconfig_mapping.QConfigMapping>`_
qconfig_mapping = (QConfigMapping()
    .set_object_type(nn.Embedding, float_qparams_weight_only_qconfig)
    .set_object_type(nn.LSTM, default_dynamic_qconfig)
    .set_object_type(nn.Linear, default_dynamic_qconfig)
)
# Load model to create the original model because quantization api changes the model inplace and we want
# to keep the original model for future comparison
model_to_quantize = LSTMModel(
    ntoken = ntokens,
    ninp = 512,
    nhid = 256,
    nlayers = 5,
)

model_to_quantize.load_state_dict(
    torch.load(
        model_data_filepath + 'word_language_model_quantize.pth',
        map_location=torch.device('cpu')
    )
)
model_to_quantize.eval()
prepared_model = prepare_fx(model_to_quantize, qconfig_mapping, example_inputs)
print("prepared model:", prepared_model)
quantized_model = convert_fx(prepared_model)
print("quantized model", quantized_model)
For dynamically quantized objects, prepare_fx does not change the modules themselves, but it does insert observers for dynamically quantizable functionals and torch ops. It also fuses modules such as Conv + Bn and Linear + ReLU.
In convert, float modules are converted to dynamically quantized modules and float ops are converted to dynamically quantized ops. In the example model we can see that nn.Embedding, nn.Linear and nn.LSTM are dynamically quantized.
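As a quick sanity check, the following minimal sketch (assuming the quantized_model produced above; the exact quantized module classes may vary slightly across PyTorch releases) lists the submodules that convert_fx swapped in for dynamically quantized versions.
import torch.ao.nn.quantized as nnq
import torch.ao.nn.quantized.dynamic as nnqd
# walk the converted GraphModule and report the swapped-in quantized modules
for name, module in quantized_model.named_modules():
    if isinstance(module, (nnqd.Linear, nnqd.LSTM, nnq.Embedding)):
        print(name, '->', type(module).__name__)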
Now we can compare the size and runtime of the quantized model.
def print_size_of_model(model):
    torch.save(model.state_dict(), "temp.p")
    print('Size (MB):', os.path.getsize("temp.p")/1e6)
    os.remove('temp.p')
print_size_of_model(model)
print_size_of_model(quantized_model)
There is a roughly 4x size reduction because we quantized all the weights in the model (nn.Embedding, nn.Linear and nn.LSTM) from float (4 bytes) to quantized int (1 byte).
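If you prefer a single number to comparing the two printouts, here is a small sketch that computes the compression ratio; the model_size_bytes helper is just an illustrative wrapper around the same temp-file trick used by print_size_of_model above.
def model_size_bytes(m):
    # serialize the state dict and measure the file, as print_size_of_model does
    torch.save(m.state_dict(), "temp.p")
    size = os.path.getsize("temp.p")
    os.remove("temp.p")
    return size

print('compression ratio: {0:.2f}x'.format(
    model_size_bytes(model) / model_size_bytes(quantized_model)))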
torch.set_num_threads(1)
def time_model_evaluation(model, test_data):
    s = time.time()
    loss = evaluate(model, test_data)
    elapsed = time.time() - s
    print('''loss: {0:.3f}\nelapsed time (seconds): {1:.1f}'''.format(loss, elapsed))
time_model_evaluation(model, test_data)
time_model_evaluation(quantized_model, test_data)
There is a roughly 2x speedup for this model. Also note that the speedup may vary depending on model, device, build, input batch size, threading, etc.
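To see the effect of threading on your own machine, you can re-run the same timing with more intra-op threads; the thread count below is only an example, adjust it to your hardware.
torch.set_num_threads(4)  # example value, pick a count that matches your CPU
time_model_evaluation(model, test_data)
time_model_evaluation(quantized_model, test_data)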