2021-06-13 Notes

#ROUGE A Python wrapper for evaluating summarization quality https://pypi.org/project/rouge/
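
A minimal sketch of scoring with this wrapper (the hypothesis/reference strings below are made-up examples):

# pip install rouge
from rouge import Rouge

rouge = Rouge()
hypothesis = "the cat sat on the mat"        # candidate summary (made-up example)
reference = "a cat was sitting on the mat"   # reference summary (made-up example)

# Returns ROUGE-1, ROUGE-2 and ROUGE-L precision/recall/F1 for the pair
scores = rouge.get_scores(hypothesis, reference)
print(scores)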

https://colab.research.google.com/drive/1-OEwiD9ouGjWrSFEWhgEnWiNvwwxlqd7#scrollTo=no6DwOqaE9Jw

import torch


def shift_tokens_right(input_ids, pad_token_id):
    """Shift input ids one token to the right, and wrap the last non-pad token (usually <eos>).

    This is taken directly from modeling_bart.py
    """
    prev_output_tokens = input_ids.clone()
    index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
    prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze()
    prev_output_tokens[:, 1:] = input_ids[:, :-1]
    return prev_output_tokens

  

def encode_sentences(tokenizer, source_sentences, target_sentences, max_length=32, pad_to_max_length=True, return_tensors="pt"):
    """Tokenize source and target sentences.

    Args:
        tokenizer: the BART tokenizer
        source_sentences, target_sentences: lists of source and target sentences

    Returns:
        Dictionary with keys: input_ids, attention_mask, labels
    """
    input_ids = []
    attention_masks = []
    target_ids = []

    for sentence in source_sentences:
        encoded_dict = tokenizer(
            sentence,
            max_length=max_length,
            padding="max_length" if pad_to_max_length else None,
            truncation=True,
            return_tensors=return_tensors,
            add_prefix_space=True,
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    for sentence in target_sentences:
        encoded_dict = tokenizer(
            sentence,
            max_length=max_length,
            padding="max_length" if pad_to_max_length else None,
            truncation=True,
            return_tensors=return_tensors,
            add_prefix_space=True,
        )
        # Shift the target ids to the right
        # shifted_target_ids = shift_tokens_right(encoded_dict['input_ids'], tokenizer.pad_token_id)
        target_ids.append(encoded_dict['input_ids'])

    target_ids = torch.cat(target_ids, dim=0)

    batch = {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": target_ids,
    }

    return batch

  
  

import math
import random
import re

import numpy as np


def noise_sentence(sentence_, percent_words, replacement_token="<mask>"):
    """Noise a sentence by replacing a fraction of its words with <mask> tokens.

    Args:
        sentence_: the sentence to noise
        percent_words: the fraction of words to replace with <mask> tokens; the count is rounded up with math.ceil

    Returns:
        A noised sentence.
    """
    # Split into words and work on a copy
    sentence_ = sentence_.split(' ')
    sentence = sentence_.copy()
    num_words = math.ceil(len(sentence) * percent_words)
    # Positions to sample from; the last word is excluded because in lyrics
    # it is often a rhyming word and plays an important role in song construction
    sample_tokens = list(np.arange(0, np.maximum(1, len(sentence) - 1)))
    # Guard against num_words exceeding the number of available positions
    words_to_noise = random.sample(sample_tokens, min(num_words, len(sample_tokens)))
    # Swap out words, but not full stops
    for pos in words_to_noise:
        if sentence[pos] != '.':
            sentence[pos] = replacement_token
    # Remove redundant spaces
    sentence = re.sub(r' {2,5}', ' ', ' '.join(sentence))
    # Combine consecutive <mask> tokens into a single token; two passes are done here, more could be added
    sentence = re.sub(r'<mask> <mask>', "<mask>", sentence)
    sentence = re.sub(r'<mask> <mask>', "<mask>", sentence)
    return sentence
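
A hypothetical end-to-end sketch tying the three helpers above together (the checkpoint name, sentences and masking ratio are placeholders, not from the notebook):

from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")  # placeholder checkpoint

targets = ["the cat sat on the mat", "a song line that rhymes at the end"]
# Corrupt the inputs, keep the clean text as the reconstruction target
sources = [noise_sentence(s, percent_words=0.25) for s in targets]

batch = encode_sentences(tokenizer, sources, targets, max_length=32)
# Decoder inputs can be built from the labels if the model does not do it internally
decoder_input_ids = shift_tokens_right(batch["labels"], tokenizer.pad_token_id)
print(batch["input_ids"].shape, batch["labels"].shape, decoder_input_ids.shape)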
 

## Using #optuna on #kaggle

Chinese documentation:

https://optuna.readthedocs.io/zh_CN/latest/index.html

https://www.kaggle.com/corochann/optuna-tutorial-for-hyperparameter-optimization
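
A minimal Optuna sketch showing the suggest/optimize loop (the objective is a toy quadratic, not from the tutorial):

import optuna


def objective(trial):
    # Toy objective: minimize (x - 2)^2 over a suggested float
    x = trial.suggest_float("x", -10.0, 10.0)
    return (x - 2) ** 2


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)
print(study.best_params, study.best_value)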

## Scaling up #Optuna with #Ray #Tune. Optuna and Ray Tune are two of the main hyperparameter tuning tools in Python. Optuna provides an easy-to-use interface to advanced hyperparameter search algorithms such as Tree-Parzen Estimators. This makes it a valuable tool for the modern machine learning engineer or data scientist, and is a key reason for its popularity.

Optuna excels at single-machine workloads, but parallelizing them, possibly across multiple machines, requires manual work and offers no monitoring capabilities. This becomes especially challenging if you want to use GPUs as efficiently as possible. You not only need to choose parameters wisely, you also need a way to orchestrate the execution. This is where Ray Tune shines.

Ray Tune takes your training function and parallelizes it automatically, takes care of resource management, and can even distribute it across a cluster of machines. All you have to do is run a script! In a perfect world, we would be able to combine Optuna's excellent algorithms with Ray Tune's excellent scaling capabilities.

https://medium.com/optuna/scaling-up-optuna-with-ray-tune-88f6ca87b8c7#:~:text=Ray%20Tune%20integrates%20with%20many,is%20one%20of%20these%20frameworks!&text=Even%20better%2C%20if%20you%20use,currently%20not%20available%20in%20Optuna.
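
A rough sketch of plugging Optuna into Ray Tune via OptunaSearch, assuming Ray 1.x import paths (they moved in later releases); the trainable and search space are toy placeholders:

from ray import tune
from ray.tune.suggest.optuna import OptunaSearch


def trainable(config):
    # Toy objective: Tune minimizes the reported metric
    score = (config["x"] - 2) ** 2 + config["y"]
    tune.report(loss=score)


analysis = tune.run(
    trainable,
    config={
        "x": tune.uniform(-10.0, 10.0),
        "y": tune.choice([0.0, 1.0]),
    },
    search_alg=OptunaSearch(),  # metric/mode are propagated from tune.run
    metric="loss",
    mode="min",
    num_samples=20,
)
print(analysis.best_config)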

## Using PyTorch Lightning with #Tune

Documentation is here: https://docs.ray.io/en/master/tune/tutorials/tune-pytorch-lightning.html
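
A rough sketch of the integration, assuming Ray 1.x: TuneReportCallback forwards Lightning-logged metrics back to Tune after each validation run. LitModel and the search space are placeholders here:

from ray import tune
from ray.tune.integration.pytorch_lightning import TuneReportCallback
import pytorch_lightning as pl


def train_fn(config):
    model = LitModel(lr=config["lr"])  # placeholder LightningModule
    trainer = pl.Trainer(
        max_epochs=5,
        callbacks=[TuneReportCallback({"loss": "val_loss"}, on="validation_end")],
    )
    trainer.fit(model)


analysis = tune.run(
    train_fn,
    config={"lr": tune.loguniform(1e-5, 1e-2)},
    metric="loss",
    mode="min",
    num_samples=10,
)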

## PyTorch Lightning ModelCheckpoint https://pytorch-lightning.readthedocs.io/en/stable/common/weights_loading.html

from pytorch_lightning.callbacks import ModelCheckpoint

class LitAutoEncoder(LightningModule):
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.backbone(x)
        loss = F.cross_entropy(y_hat, y)
        self.log('val_loss', loss)

# saves a file like: my/path/sample-mnist-epoch=02-val_loss=0.32.ckpt
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath='my/path/',
    filename='sample-mnist-{epoch:02d}-{val_loss:.2f}',  # important: this filename pattern makes it easy to pick the best checkpoint
    save_top_k=3,
    mode='min',
)

trainer = Trainer(callbacks=[checkpoint_callback])
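
After training, the best checkpoint can be retrieved from the callback:

# Path to and score of the best checkpoint tracked by the callback
print(checkpoint_callback.best_model_path)
print(checkpoint_callback.best_model_score)

# Reload the best weights for inference or further evaluation
model = LitAutoEncoder.load_from_checkpoint(checkpoint_callback.best_model_path)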
 
 

## #pytorch-lightning template

https://github.com/PyTorchLightning/deep-learning-project-template

The pytorch-lightning workflow:

 
class LitModel(pl.LightningModule):

    def __init__(...):

    def forward(...):

    def training_step(...)
    def training_step_end(...)
    def training_epoch_end(...)

    def validation_step(...)
    def validation_step_end(...)
    def validation_epoch_end(...)

    def test_step(...)
    def test_step_end(...)
    def test_epoch_end(...)

    def configure_optimizers(...)

    def any_extra_hook(...)
 

## #pytorch-lightning-bolts Pretrained #SOTA Deep Learning models, callbacks and more for research and production with #PyTorch #Lightning and #PyTorch

https://pypi.org/project/pytorch-lightning-bolts/

## #SLURM managed cluster

Lightning automates the details behind training on a SLURM-powered cluster. In contrast to the general-purpose cluster above, the user does not start the jobs manually on each node; instead, the job is submitted to SLURM, which schedules the resources and time for which the job is allowed to run.

https://pytorch-lightning.readthedocs.io/en/stable/clouds/cluster.html#slurm-managed-cluster
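
A rough Python-side sketch, assuming Lightning 1.x Trainer arguments; the gpus/num_nodes values just have to match what the sbatch script requests from SLURM, and `model` is a placeholder LightningModule:

import pytorch_lightning as pl

# Submitted via sbatch; Lightning picks up the SLURM environment variables itself.
# gpus/num_nodes must match the resources requested in the sbatch script.
trainer = pl.Trainer(
    gpus=8,
    num_nodes=4,
    accelerator="ddp",
    max_epochs=10,
)
trainer.fit(model)  # `model` is a placeholder LightningModule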

## #PyTorch Tabular for tabular data. PyTorch Tabular aims to make Deep Learning with Tabular data easy and accessible to real-world cases and research alike.

Usage is also straightforward.

The task can be either regression or classification; then again, can't almost every task be framed as regression or classification?

 
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig

data_config = DataConfig(
    target=['target'], #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
trainer_config = TrainerConfig(
    auto_lr_find=True, # Runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=100,
    gpus=1, #index of the GPU to use. 0, means CPU
)
optimizer_config = OptimizerConfig()

model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1024-512-512",  # Number of nodes in each layer
    activation="LeakyReLU", # Activation between each layers
    learning_rate = 1e-3
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
tabular_model.fit(train=train, validation=val)
result = tabular_model.evaluate(test)
pred_df = tabular_model.predict(test)
tabular_model.save_model("examples/basic")
loaded_model = TabularModel.load_from_checkpoint("examples/basic")

 

https://github.com/manujosephv/pytorch_tabular

## #Pytorch-Forecasting _Pytorch Forecasting_ is a PyTorch-based package for forecasting time series with state-of-the-art network architectures. It provides a high-level API for training networks on Pandas dataframes and uses PyTorch Lightning for scalable training on (multiple) GPUs or CPUs, with automatic logging.
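
A rough sketch of the dataframe-centric API (column names, window lengths and the toy data are placeholders; exact arguments vary by version):

import pandas as pd
from pytorch_forecasting import TimeSeriesDataSet

# Toy dataframe: a single series of 40 steps (column names are placeholders)
df = pd.DataFrame({
    "time_idx": list(range(40)),
    "value": [float(i % 7) for i in range(40)],
    "series": ["s1"] * 40,
})

# Declare which columns are the time index, the target and the group ids
training = TimeSeriesDataSet(
    df,
    time_idx="time_idx",
    target="value",
    group_ids=["series"],
    max_encoder_length=24,
    max_prediction_length=6,
    time_varying_unknown_reals=["value"],
)
train_dataloader = training.to_dataloader(train=True, batch_size=16)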

## Building a QA model with #transformers and pytorch_lightning; it could hardly be simpler

 
import nlp
import torch
import transformers as tfs
import pytorch_lightning as pl

class QAModel(pl.LightningModule):
    def __init__(self, hparams):
        super().__init__()
        self.hparams = hparams
        self.model = tfs.AutoModelForQuestionAnswering.from_pretrained(hparams.qa_model)

    def forward(self, x):
        return self.model(**x)

    def training_step(self, batch, batch_idx):
        outputs = self(batch)
        loss = outputs[0]
        return {'loss': loss, 'log': {'train_loss': loss}}


    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

    def train_dataloader(self):
        pass  # dataset loading omitted here; see the full file in the repo below
 

https://github.com/tshrjn/Finetune-QA/blob/master/models.py

## #tpot An AutoML library built on sklearn

https://github.com/EpistasisLab/tpot
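
A minimal TPOT sketch on a toy sklearn dataset (generation/population settings are arbitrary placeholders):

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Evolves sklearn pipelines with genetic programming
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_digits_pipeline.py')  # writes the best pipeline out as a Python script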

## #autokeras An automated deep learning library

AutoKeras: An AutoML system based on Keras. It is developed by DATA Lab at Texas A&M University. The goal of AutoKeras is to make machine learning accessible to everyone.
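
A minimal AutoKeras sketch on MNIST (max_trials and epochs kept tiny purely for illustration):

import autokeras as ak
from tensorflow.keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()

# AutoKeras searches Keras architectures and hyperparameters automatically
clf = ak.ImageClassifier(max_trials=1, overwrite=True)
clf.fit(x_train, y_train, epochs=1)
print(clf.evaluate(x_test, y_test))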

## #H2O java H2O is an in-memory platform for distributed, scalable machine learning. H2O uses familiar interfaces like R, Python, Scala, Java, JSON and the Flow notebook/web interface, and works seamlessly with big data technologies like Hadoop and Spark. H2O provides implementations of many popular algorithms such as Generalized Linear Models (GLM), Gradient Boosting Machines (including XGBoost), Random Forests, Deep Neural Networks, Stacked Ensembles, Naive Bayes, Generalized Additive Models (GAM), Cox Proportional Hazards, K-Means, PCA, Word2Vec, as well as a fully automatic machine learning algorithm (H2O AutoML). https://github.com/h2oai/h2o-3
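
A minimal H2O AutoML sketch from Python (the CSV path and 'target' column are placeholders):

import h2o
from h2o.automl import H2OAutoML

h2o.init()

# Placeholder CSV; 'target' is assumed to be the response column
df = h2o.import_file("train.csv")
df["target"] = df["target"].asfactor()  # treat the task as classification

aml = H2OAutoML(max_models=10, seed=1)
aml.train(y="target", training_frame=df)
print(aml.leaderboard.head())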
