samien
/
LSWenda


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
							from plugins.common import settings

def chat_init(history):
    return []


def chat_one(prompt, history_formatted, max_length, top_p, temperature, data):
    yield str(len(prompt))+'字正在计算'
    
    x = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
    y = model.generate(x, max_length=200, do_sample=True, top_p=0.95, top_k=4, temperature=0.2, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)

# decoding, clean_up_tokenization_spaces=False to ensure syntactical correctness
    response = tokenizer.decode(y[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    yield response

def load_model():
    global model, tokenizer
    from transformers import AutoModelForCausalLM, AutoTokenizer
    import torch
    tokenizer = AutoTokenizer.from_pretrained(
        settings.llm.path, local_files_only=True, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        settings.llm.path, local_files_only=True, trust_remote_code=True)
    device, precision = settings.llm.strategy.split()
# 根据设备执行不同的操作
    if device == 'cpu':
        # 如果是cpu，不做任何操作
        pass
    elif device == 'cuda':
        # 如果是gpu，把模型移动到显卡
        import torch

        # 根据精度执行不同的操作
        if precision == 'fp16':
            # 如果是fp16，把模型转化为半精度
            model = model.half()
        elif precision == 'fp32':
            # 如果是fp32，把模型转化为全精度
            model = model.float()
        elif precision.startswith('fp16i'):
            # 如果是fp16i开头，把模型转化为指定的精度
            # 从字符串中提取精度的数字部分
            bits = int(precision[5:])
            # 调用quantize方法，传入精度参数
            model = model.quantize(bits)
            model = model.half()
        elif precision.startswith('fp32i'):
            # 如果是fp32i开头，把模型转化为指定的精度
            # 从字符串中提取精度的数字部分
            bits = int(precision[5:])
            # 调用quantize方法，传入精度参数
            model = model.quantize(bits)
            model = model.float()
        else:
            # 如果是其他精度，报错并退出程序
            print('Error: 不受支持的精度')
            exit()
        model = model.to(torch.device("cuda"))    
    else:
        # 如果是其他设备，报错并退出程序
        print('Error: 不受支持的设备')
        exit()
    

    model = model.eval()