# DuEE-Fin document-level trigger-word model: training and prediction
import ast
import os
import json
import warnings
import random
from functools import partial
import numpy as np
import paddle
import paddle.nn.functional as F
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.transformers import AutoModelForTokenClassification, AutoTokenizer
from paddlenlp.metrics import ChunkEvaluator
def read_by_lines(path):
    """Read a UTF-8 text file and return a list of its lines, each
    stripped of surrounding whitespace (including the newline)."""
    with open(path, "r", encoding="utf8") as infile:
        return [line.strip() for line in infile]
def write_by_lines(path, data):
    """Write each item of *data* to *path* (UTF-8), one item per line.

    Fix: the original used a list comprehension purely for its side
    effects (building and discarding a throwaway list of None); a
    single ``writelines`` call over a generator does the same work
    idiomatically and without the intermediate list.
    """
    with open(path, "w", encoding="utf8") as outfile:
        outfile.writelines(d + "\n" for d in data)
def load_dict(dict_path):
    """Load a tag dictionary file into a ``{label: id}`` mapping.

    Each line of *dict_path* is ``"<id>\\t<label>"``; the returned dict
    maps the label string to its integer id.

    Fix: the original iterated over a bare ``open(...)`` call, leaking
    the file handle; a ``with`` block guarantees it is closed.
    """
    vocab = {}
    with open(dict_path, 'r', encoding='utf-8') as fp:
        for line in fp:
            value, key = line.strip('\n').split('\t')
            vocab[key] = int(value)
    return vocab
# ---- Training / inference configuration ----
num_epoch=3                                            # number of training epochs
learning_rate=5e-5                                     # peak LR for the optimizer
tag_path='./conf/DuEE-Fin/trigger_tag.dict'            # "<id>\t<label>" tag dictionary
train_data='./datasets/DuEE-Fin/trigger/train.tsv'     # training split (TSV with header)
dev_data='./datasets/DuEE-Fin/trigger/dev.tsv'         # dev split
test_data='./datasets/DuEE-Fin/trigger/test.tsv'       # test split
predict_data=None                                      # optional raw file for prediction
warmup_proportion=0.0                                  # fraction of steps used for LR warmup
batch_size=10
checkpoints='./checkpoints/Duee_extract/'              # directory for saved model states
init_ckpt=None                                         # optional checkpoint to resume from
predict_save_path=None                                 # where prediction output is written
seed=1000                                              # RNG seed (see set_seed below)
device='gpu'                                           # paddle device string ('gpu'/'cpu')
weight_decay=0.0                                       # L2 weight-decay coefficient
def set_seed(seed):
    """Seed the Python, NumPy and Paddle RNGs so runs are reproducible."""
    for seeder in (random.seed, np.random.seed, paddle.seed):
        seeder(seed)
from paddlenlp.datasets import MapDataset
def load_dataset(datafiles):
    """Load trigger TSV file(s) into PaddleNLP ``MapDataset`` object(s).

    Each file has a header line followed by rows of the form
    ``"words\\tlabels"``, where the tokens inside each field are joined
    by the '\\x02' separator.

    Args:
        datafiles: a single path (str), or a list/tuple of paths.

    Returns:
        A single ``MapDataset`` for one path, or a list of them for a
        list/tuple of paths.

    Raises:
        TypeError: if *datafiles* is neither a str nor a list/tuple
            (the original silently returned None in that case).

    Fixes: stream the file line-by-line instead of ``fp.readlines()``
    (no full in-memory copy), and skip the header with ``readline()``
    instead of ``next(fp)`` — the latter raises RuntimeError (PEP 479)
    from inside a generator when the file is empty.
    """

    def read(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            fp.readline()  # skip header; harmless no-op on an empty file
            for line in fp:
                words, labels = line.strip('\n').split('\t')
                words = words.split('\002')
                labels = labels.split('\002')
                yield words, labels

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    if isinstance(datafiles, (list, tuple)):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]
    raise TypeError("datafiles must be a str or a list/tuple of paths")
# ---- Runtime setup ----
paddle.set_device(device)
set_seed(seed)

no_entity_label = 'O'  # tag assigned to tokens outside any trigger span
ignore_label = -1      # label id excluded from loss/metric computation

tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-medium-zh")
# BUG FIX: the original wrote `label_map = load_dict`, binding the
# function object itself instead of the loaded dictionary. Call it with
# the configured tag file so label_map is the {label: id} mapping.
label_map = load_dict(tag_path)
