[TOC]
# Pull prebuilt Hugging Face Transformers images (GPU and CPU variants).
$ docker pull huggingface/transformers-pytorch-gpu:4.19.4
$ docker pull huggingface/transformers-pytorch-cpu:4.18.0
# Start a detached container: mount the data volume at /workspace and expose
# port 9000 (apps) plus 9001 -> container port 22 (SSH).
$ sudo docker run -it -d -v /Volumes/Yan_Errol/:/workspace --name transformers -p 9000:9000 -p 9001:22 32e2fefc85c5 bash
$ sudo docker exec -it transformers bash
# Install and configure an SSH server inside the container.
# -y keeps apt non-interactive so the install does not hang on a prompt.
$ apt update && apt install -y openssh-server
$ mkdir /var/run/sshd && echo 'root:passwd' | chpasswd
$ sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
# Relax the PAM loginuid requirement so sshd sessions work in a container.
# Fixed: the original had markdown-mangled '\\\\s' escapes, which would never
# match the actual '/etc/pam.d/sshd' line; '\s*' is the intended GNU sed regex.
$ sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
$ echo "export VISIBLE=now" >> /etc/profile
$ echo "PermitRootLogin yes" >> /etc/ssh/sshd_config
$ service ssh start
# Fetch the model checkpoint with git-lfs. Fixed: the URL was wrapped in
# <...> (markdown autolink syntax), which is a redirection error in a shell.
$ git lfs install
$ git clone https://huggingface.co/bert-base-chinese
# 1. Tokenizing a single sentence
from transformers import BertTokenizer
# Load the tokenizer from the locally cloned bert-base-chinese checkpoint
# (the directory produced by the `git clone` step above).
tokenizer = BertTokenizer.from_pretrained('./bert-base-chinese')
# Tokenize and encode to input ids; [CLS]/[SEP] special tokens are added,
# as the expected output below shows (101 = [CLS], 102 = [SEP]).
token = tokenizer.encode('北京欢迎你')
print(token)
# [101, 1266, 776, 3614, 6816, 872, 102]
# Shorthand form: calling the tokenizer directly instead of .encode()
token = tokenizer('北京欢迎你')
# Decode a list of ids back to text
print(tokenizer.decode([101, 1266, 776, 3614, 6816, 872, 102]))
# Inspect the special-token map ([UNK], [SEP], [PAD], [CLS], [MASK])
print(tokenizer.special_tokens_map)
# Look up the ids of the special tokens themselves; add_special_tokens=False
# prevents wrapping the list in an extra [CLS]/[SEP] pair.
print(tokenizer.encode(['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]'], add_special_tokens=False))
# [100, 102, 0, 101, 103]
# 2. Batch processing
# Pad to equal length within the batch; return_tensors='pt' yields
# PyTorch tensors instead of Python lists.
batch_token1 = tokenizer(['北京欢迎你', '为你开天辟地'], padding=True, return_tensors='pt')
print(batch_token1)
print(batch_token1['input_ids'])
# Truncation: sequences longer than max_length are cut off
# (note: max_length counts the added [CLS]/[SEP] tokens too).
batch_token2 = tokenizer(['北京欢迎你', '为你开天辟地'], max_length=5, truncation=True)
print(batch_token2)
# Pad every sequence up to max_length, truncating anything longer
batch_token3 = tokenizer(['北京欢迎你', '为你开天辟地'], max_length=10, truncation=True, padding='max_length')
print(batch_token3)
# 3. Word-vector (contextual embedding) encoding
from transformers import BertModel
from transformers import logging
# Silence the "weights not used" warnings emitted when loading a bare BertModel
logging.set_verbosity_error()
model = BertModel.from_pretrained('./bert-base-chinese')
# Feed the padded batch from step 2 through the model.
# NOTE(review): only input_ids is passed — the attention_mask from
# batch_token1 is not forwarded, so padding positions are attended to;
# confirm this is intentional for the demo.
encoded = model(batch_token1['input_ids'])
print(encoded)
# First element of the model output is the last hidden state,
# one vector per input token.
encoded_text = encoded[0]
print(encoded_text.shape)