| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184 |
- from plugins.common import settings
- import json
# Flag selecting the ChatGLM3 message format: set when the configured model
# path contains "chatglm3-6b" (case-insensitive). Printed once at import time.
chatglm3_mode = "chatglm3-6b" in settings.llm.path.lower()
print('chatglm3_mode', chatglm3_mode)
def chat_init(history):
    """Convert a chat transcript into the history format the model expects.

    ``history`` is either ``None`` or an iterable of mappings with ``'role'``
    and ``'content'`` keys.

    * When ``chatglm3_mode`` is set, the result is a list of role dicts:
      ``user`` and ``assistant`` (also spelled ``"AI"``) messages are copied
      over, and a ``system`` message becomes a tool-description entry whose
      ``tools`` value is the JSON-decoded message content.
    * Otherwise the result is a list of tuples. A user message is held until
      the next assistant message arrives and the two are emitted together;
      system messages are ignored. A user message that arrives while another
      one is still pending is skipped, and an assistant message with no
      pending user message produces a one-element tuple.

    Messages with any other role are ignored. Returns an empty list when
    ``history`` is ``None``.
    """
    formatted = []
    if history is None:
        return formatted

    pending = []  # contents collected for the tuple currently being built
    for message in history:
        role = message['role']
        if role == "user" and not pending:
            if chatglm3_mode:
                formatted.append({'role': 'user', 'content': message['content']})
            else:
                pending.append(message['content'])
        elif role in ("AI", "assistant"):
            if chatglm3_mode:
                formatted.append({'role': 'assistant', 'metadata': '', 'content': message['content']})
            else:
                pending.append(message['content'])
                formatted.append(tuple(pending))
                pending = []
        elif role == "system" and chatglm3_mode:
            formatted.append({
                'role': 'system',
                'content': "Answer the following questions as best as you can. You have access to the following tools:",
                "tools": json.loads(message['content']),
            })
    return formatted
def chat_one(prompt, history_formatted, max_length, top_p, temperature, data):
    """Generate a reply to ``prompt``, yielding a status line and then the output.

    The first value yielded is always a progress message containing the
    prompt length. What follows depends on the history:

    * If the first history entry is a dict whose role is ``"system"`` (the
      tool-calling format), a single non-streaming ``model.chat`` call is
      made. A prompt starting with ``"observation!"`` is sent with
      ``role="observation"`` after the marker is removed and the response is
      yielded as-is; any other prompt's response is yielded JSON-encoded.
    * Otherwise the reply is streamed through ``model.stream_chat`` and each
      partial response is yielded.

    Args:
        prompt: User input text.
        history_formatted: History as produced by ``chat_init`` (role dicts
            or tuples); may be empty.
        max_length, top_p, temperature: Forwarded unchanged to the model.
        data: Unused here; kept for interface compatibility.

    Relies on the module-level ``model`` and ``tokenizer`` set by
    ``load_model``.
    """
    yield str(len(prompt))+'字正在计算'

    generation_args = dict(max_length=max_length, top_p=top_p, temperature=temperature)

    # Tuple-style history entries have no 'role' key, so check the type
    # before looking it up instead of subscripting unconditionally.
    first_entry = history_formatted[0] if history_formatted else None
    tool_mode = isinstance(first_entry, dict) and first_entry.get('role') == "system"

    if not tool_mode:
        for response, _ in model.stream_chat(tokenizer, prompt, history_formatted,
                                             **generation_args):
            yield response
        return

    marker = "observation!"
    if prompt.startswith(marker):
        # Strip only the leading marker; the observation payload itself may
        # legitimately contain the same text.
        observation = prompt[len(marker):]
        response, _ = model.chat(tokenizer, observation, history_formatted,
                                 role="observation", **generation_args)
        yield response
    else:
        response, _ = model.chat(tokenizer, prompt, history_formatted,
                                 **generation_args)
        yield json.dumps(response)
def sum_values(dict):
    """Return the sum of all values in the mapping ``dict`` (0 when it is empty).

    The parameter keeps its original name, which shadows the builtin, so that
    existing keyword callers keep working.
    """
    return sum(dict.values())
def dict_to_list(d):
    """Expand a ``{key: count}`` mapping into a flat list.

    Each key is repeated ``count`` times, in the mapping's iteration order;
    keys whose count is zero or negative contribute nothing.
    """
    return [key for key, count in d.items() for _ in range(count)]
def _build_device_map(stages, num_trans_layers, is_glm2):
    """Build the module-to-GPU map used when the strategy spans several devices.

    ``stages`` is the parsed strategy: one token list per device, e.g.
    ``['cuda:0', 'fp16', '*14']``. A stage whose third token is ``*N`` gets N
    slots; a stage with no third token gets whatever is left of
    ``num_trans_layers + 2``.
    """
    start_device = int(stages[0][0].split(':')[1])
    if is_glm2:
        # ChatGLM2 uses its own module names; see
        # https://github.com/THUDM/ChatGLM2-6B/blob/main/utils.py line 23.
        # NOTE(review): these entries are pinned to GPU 0 rather than
        # start_device, as in the original code - confirm this is intended.
        device_map = {
            'transformer.embedding.word_embeddings': 0,
            'transformer.encoder.final_layernorm': 0,
            'transformer.output_layer': 0,
            'transformer.rotary_pos_emb': 0,
            'lm_head': 0,
        }
        layer_key = 'transformer.encoder.layers.{}'
    else:
        device_map = {
            'transformer.word_embeddings': start_device,
            'transformer.final_layernorm': start_device,
            'lm_head': start_device,
        }
        layer_key = 'transformer.layers.{}'

    slots_per_device = {}
    for stage in stages:
        if len(stage) > 2:
            if stage[2].startswith('*'):
                slots_per_device[int(stage[0].split(':')[1])] = int(stage[2][1:])
        else:
            slots_per_device[int(stage[0].split(':')[1])] = (
                num_trans_layers + 2 - sum_values(slots_per_device))
    # Two slots on the start device are taken back before layers are assigned;
    # presumably they stand for the non-layer modules placed above - TODO confirm.
    slots_per_device[start_device] -= 2

    placement = dict_to_list(slots_per_device)
    for layer in range(num_trans_layers):
        device_map[layer_key.format(layer)] = placement[layer]
    return device_map


def load_model():
    """Load the tokenizer and model named in ``settings.llm`` into module globals.

    Steps, in order:

    1. Parse ``settings.llm.strategy`` (stages separated by ``->``; each stage
       is ``<device> <precision> [*<layers>]``).
    2. For a multi-stage strategy, build a per-module device map.
    3. Load tokenizer and model from ``settings.llm.path`` (local files only),
       and wrap the model with the LoRA adapter when ``settings.llm.lora`` is set.
    4. Move the model to the first stage's device and convert it to the first
       stage's precision (``fp16``, ``fp32``, ``fp16i<bits>``, ``fp32i<bits>``).
    5. For a multi-stage strategy, dispatch the model across devices.
    6. Put the model in eval mode.

    Sets the globals ``model`` and ``tokenizer``.

    Raises:
        SystemExit: with status 1, after printing an error, when the device or
            precision of the first stage is not supported.
    """
    global model, tokenizer
    from transformers import AutoModel, AutoTokenizer

    num_trans_layers = 28
    model_path = settings.llm.path
    lowered_path = model_path.lower()
    # Model family is decided from the path name, case-insensitively everywhere.
    is_glm2 = "chatglm2" in lowered_path
    lora_path = settings.llm.lora
    use_lora = lora_path is not None and lora_path != ''

    # One token list per stage; split() tolerates repeated whitespace.
    s = [stage.split() for stage in settings.llm.strategy.split('->')]
    print(s)
    multi_device = len(s) > 1
    if multi_device:
        from accelerate import dispatch_model
        device_map = _build_device_map(s, num_trans_layers, is_glm2)
    device, precision = s[0][0], s[0][1]

    tokenizer = AutoTokenizer.from_pretrained(
        model_path, local_files_only=True, trust_remote_code=True, revision="v1.1.0")
    model = AutoModel.from_pretrained(
        model_path, local_files_only=True, trust_remote_code=True, revision="v1.1.0")
    if use_lora:
        print('Lora模型地址', lora_path)
        from peft import PeftModel
        model = PeftModel.from_pretrained(model, lora_path, adapter_name=lora_path)

    # Device placement.
    if device == 'cpu':
        # Nothing to do on CPU.
        pass
    elif device == 'cuda':
        import torch
        int4_checkpoint = is_glm2 and "int4" in lowered_path
        # Move to the GPU now, unless the model is about to be quantized
        # (fp16i*) and the card reports less than 1.4e10 bytes of memory; in
        # that case the move happens after quantization below.
        if int4_checkpoint or not (
                precision.startswith('fp16i')
                and torch.cuda.get_device_properties(0).total_memory < 1.4e+10):
            model = model.cuda()
    elif multi_device and device.startswith('cuda:'):
        # Placement is handled by dispatch_model further down.
        pass
    else:
        print('Error: 不受支持的设备')
        # Raise SystemExit directly: exit() is a site-module convenience and
        # reported success (status 0) on this error path.
        raise SystemExit(1)

    # Precision conversion.
    if precision == 'fp16':
        model = model.half()
    elif precision == 'fp32':
        model = model.float()
    elif precision.startswith(('fp16i', 'fp32i')):
        # The digits after the five-character prefix are the quantization bits.
        bits = int(precision[5:])
        model = model.quantize(bits)
        if device == 'cuda':
            model = model.cuda()
        model = model.half() if precision.startswith('fp16i') else model.float()
    else:
        print('Error: 不受支持的精度')
        raise SystemExit(1)

    if multi_device:
        model = dispatch_model(model, device_map=device_map)
    model = model.eval()
# HTTP endpoints for managing LoRA adapters at runtime. They are registered
# only when a LoRA path is configured in settings.
if settings.llm.lora not in ('', None):
    from bottle import route, response, request

    @route('/lora_load_adapter', method=("POST","OPTIONS"))
    def load_adapter():
        """Load an additional adapter into the model.

        Reads ``lora_path`` and ``adapter_name`` from the JSON request body.
        Returns the success message, or the text of any exception raised.
        """
        try:
            payload = request.json
            adapter_path = payload.get("lora_path")
            name = payload.get("adapter_name")
            model.load_adapter(adapter_path, adapter_name=name)
        except Exception as e:
            return str(e)
        return "保存成功"

    @route('/lora_set_adapter', method=("POST","OPTIONS"))
    def set_adapter():
        """Switch the active adapter.

        Reads ``adapter_name`` from the JSON request body. Returns the
        success message, or the text of any exception raised.
        """
        try:
            payload = request.json
            name = payload.get("adapter_name")
            model.set_adapter(name)
        except Exception as e:
            return str(e)
        return "保存成功"
|