GPU单卡多进程

由于时效问题,该文某些代码、技术可能已经过期,请注意!!!本文最后更新于:2 年前

如题

这里利用清华开源的 ChatGLM-6B 模型,对 PubMed 上的文献摘要逐条生成一句话总结。
代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import os
import re
# Set the CUDA_VISIBLE_DEVICES environment variable so this process is
# restricted to GPU index 2.
# NOTE(review): this is placed before `import torch` below, presumably so the
# setting is in effect before CUDA gets initialized -- keep the ordering.
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
from functools import partial
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch
from torch.multiprocessing import Pool
from tqdm import tqdm
import summarize

if __name__ == '__main__':
    # Use the 'spawn' start method for the worker processes; the workers
    # receive a model that lives on the GPU, and CUDA state cannot be used
    # from fork-started children.
    torch.multiprocessing.set_start_method('spawn')

    # Load the PubMed records; the 'abstract' column holds the texts to
    # summarize.
    df = pd.read_excel('/data/database/pubmed/macrophage_disease.xlsx')
    print(df.shape)

    # Load ChatGLM-6B once in the parent process, in half precision on the
    # GPU, and switch it to inference mode.
    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
    model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
    model = model.eval()

    # Bind tokenizer and model so each pool task only needs the abstract.
    partial_process_data = partial(summarize.summarize_abstract, tokenizer=tokenizer, model=model)

    abstracts = df['abstract']
    with Pool(processes=4) as pool:
        # imap yields results in input order, so the collected summaries line
        # up with the DataFrame rows.
        summary_ls = list(tqdm(pool.imap(partial_process_data, abstracts), total=len(abstracts)))

    # Column name fixed: the original wrote 'summmary' (typo).
    df['summary'] = summary_ls

    # Write the table with the new column as a tab-separated text file.
    df.to_csv('pubmed_summary.txt', index=False, sep='\t')

另外,summarize.py 的代码如下(summarize_abstract 函数必须单独写在一个文件中,再由主脚本通过 import summarize 导入):

1
2
3
4
5
6
7
def summarize_abstract(abstract, tokenizer, model):
    """Ask the chat model for a one-sentence summary of a single abstract.

    Args:
        abstract: Text of the abstract. ``None``, a float NaN (what pandas
            produces for an empty spreadsheet cell) or a blank string is
            treated as "no abstract".
        tokenizer: Tokenizer object forwarded unchanged to ``model.chat``.
        model: Object exposing ``chat(tokenizer, prompt, history=...)`` that
            returns a ``(response, history)`` pair.

    Returns:
        The model's response, or an empty string when there is no abstract
        to summarize (the model is not called in that case).
    """
    # NaN is the only float that is not equal to itself.
    is_nan = isinstance(abstract, float) and abstract != abstract
    is_blank = isinstance(abstract, str) and not abstract.strip()
    if abstract is None or is_nan or is_blank:
        # Previously such values were formatted into the prompt as "None" /
        # "nan" and the model was asked to summarize that.
        return ""

    prompt = f"请用一句话总结以下摘要并且总结中必须要包含macrophage和相关的疾病,字数控制在50以内:{abstract}"
    # Each abstract is an independent single-turn chat, hence the empty
    # history; the returned history is not needed.
    response, _ = model.chat(tokenizer, prompt, history=[])
    return response

参考:https://blog.csdn.net/junjunzai123/article/details/126286131