https://github.com/codertimo/BERT-pytorch
A tokenized corpus is required.
Tokenizer choice: MeCab (mecab-ko).
https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md
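For reference, mecab-ko can be driven from Python through konlpy once the engine and the mecab-ko-dic dictionary are installed. A minimal sketch (konlpy's Mecab wrapper and the default dictionary path are assumptions, and the exact morpheme split depends on the dictionary):
from konlpy.tag import Mecab
mecab = Mecab()  # uses the default mecab-ko-dic installation
print(mecab.morphs('이 영화 정말 재미있어요'))  # e.g. ['이', '영화', '정말', '재미있', '어요']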
Nearly all of the NLP models currently setting new SOTA results are built on the transformer network, and Hugging Face's pytorch-transformers is the library that implements them in PyTorch.
huggingface/transformers
Install the pytorch-transformers library:
!pip install pytorch-transformers
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from pytorch_transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.optim import Adam
import torch.nn.functional as F
!git clone https://github.com/e9t/nsmc.git
train_df = pd.read_csv('./nsmc/ratings_train.txt', sep='\t')
test_df = pd.read_csv('./nsmc/ratings_test.txt', sep='\t')
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)
train_df = train_df.sample(frac=0.4, random_state=999)
test_df = test_df.sample(frac=0.4, random_state=999)
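The NSMC TSV files come with three columns, id / document / label, which the Dataset below relies on by position. A quick check (row counts reflect the 40% sample above):
print(train_df.columns.tolist())         # ['id', 'document', 'label']
print(len(train_df), len(test_df))       # roughly 60000 and 20000 rows after sampling
print(train_df['label'].value_counts())  # labels 0 (negative) and 1 (positive), roughly balanced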
class NsmcDataset(Dataset):
    ''' Naver Sentiment Movie Corpus Dataset '''
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx, 1]   # 'document' column: the raw review text
        label = self.df.iloc[idx, 2]  # 'label' column: 0 (negative) or 1 (positive)
        return text, label
nsmc_train_dataset = NsmcDataset(train_df)
train_loader = DataLoader(nsmc_train_dataset, batch_size=2, shuffle=True, num_workers=2)
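Because the Dataset returns raw strings, each batch from the DataLoader is a pair of review texts plus a tensor of labels, so tokenization happens inside the training loop below. A quick sanity check:
text, label = next(iter(train_loader))
print(text)   # batch of 2 raw review strings
print(label)  # tensor with the 2 matching labels, e.g. tensor([0, 1])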
device = torch.device("cuda")
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')
model.to(device)
optimizer = Adam(model.parameters(), lr=1e-6)
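Before the loop, it can help to see how the multilingual WordPiece tokenizer handles a Korean review. A minimal sketch (the sentence is arbitrary, and the exact subword pieces and ids depend on the bert-base-multilingual-cased vocabulary):
sample_sent = '이 영화 정말 재미있어요'
print(tokenizer.tokenize(sample_sent))                         # WordPiece pieces
print(tokenizer.encode(sample_sent, add_special_tokens=True))  # ids with [CLS]/[SEP] added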
itr = 1
p_itr = 500
epochs = 1
total_loss = 0
total_len = 0
total_correct = 0

model.train()
for epoch in range(epochs):
    for text, label in train_loader:
        optimizer.zero_grad()

        # encode each review, truncate to BERT's 512-token limit, and zero-pad to a fixed length
        encoded_list = [tokenizer.encode(t, add_special_tokens=True)[:512] for t in text]
        padded_list = [e + [0] * (512 - len(e)) for e in encoded_list]

        sample = torch.tensor(padded_list)
        sample, labels = sample.to(device), label.to(device)

        # with labels given, the model returns (loss, logits)
        outputs = model(sample, labels=labels)
        loss, logits = outputs

        pred = torch.argmax(F.softmax(logits, dim=1), dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        if itr % p_itr == 0:
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(
                epoch+1, epochs, itr, total_loss/p_itr, total_correct/total_len))
            total_loss = 0
            total_len = 0
            total_correct = 0
        itr += 1
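After fine-tuning it is worth saving the weights so they can be reloaded later; a minimal sketch using the library's save_pretrained/from_pretrained pair (the output directory name is arbitrary):
import os
save_dir = './bert-nsmc'              # arbitrary output directory
os.makedirs(save_dir, exist_ok=True)  # save_pretrained expects an existing directory
model.save_pretrained(save_dir)       # writes pytorch_model.bin and config.json
tokenizer.save_pretrained(save_dir)   # writes the vocabulary files
# later: model = BertForSequenceClassification.from_pretrained(save_dir)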
# evaluation
model.eval()
nsmc_eval_dataset = NsmcDataset(test_df)
eval_loader = DataLoader(nsmc_eval_dataset, batch_size=2, shuffle=False, num_workers=2)

total_loss = 0
total_len = 0
total_correct = 0

with torch.no_grad():
    for text, label in eval_loader:
        encoded_list = [tokenizer.encode(t, add_special_tokens=True)[:512] for t in text]
        padded_list = [e + [0] * (512 - len(e)) for e in encoded_list]
        sample = torch.tensor(padded_list)
        sample, labels = sample.to(device), label.to(device)
        outputs = model(sample, labels=labels)
        _, logits = outputs
        pred = torch.argmax(F.softmax(logits, dim=1), dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)

print('Test accuracy: ', total_correct / total_len)
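The fine-tuned model can also score a single review directly; a minimal sketch (the helper name and example sentence are arbitrary, and label 1 is taken to mean positive as in NSMC):
def predict_sentiment(sentence):
    # encode, run a forward pass without labels, and take the argmax over the two classes
    ids = tokenizer.encode(sentence, add_special_tokens=True)[:512]
    input_tensor = torch.tensor([ids]).to(device)
    with torch.no_grad():
        logits = model(input_tensor)[0]
    return torch.argmax(logits, dim=1).item()  # 0 = negative, 1 = positive
print(predict_sentiment('이 영화 정말 재미있어요'))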
Multi-label Text Classification using BERT