import os

# Pin GPU enumeration to physical PCI bus order, then restrict this process
# to the second GPU (device index 1). Must run before any CUDA-using library
# (e.g. llama-cpp's CUDA backend) is imported/initialized.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp

# Shared callback manager: streams generated tokens to stdout as they are
# produced. Reused by every model factory in this module.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

def get_mixtral():
    """Build a Mixtral-8x7B-Instruct LLM backed by local GGUF weights.

    Loads the Q6_K quantized weights through LangChain's ``LlamaCpp``
    wrapper and streams generated tokens to stdout via the module-level
    ``callback_manager``.

    Returns:
        LlamaCpp: configured Mixtral model instance.
    """
    mixtral8x7b = LlamaCpp(
        model_path="model/weight/mixtral-8x7b-instruct-v0.1.Q6_K.gguf",
        temperature=0.75,
        n_gpu_layers=33,  # offload 33 layers to the GPU selected above
        n_ctx=20000,
        # FIX: was `n_thread` — the correct LlamaCpp/llama-cpp-python field is
        # `n_threads`; the misspelled kwarg never set the CPU thread count.
        n_threads=30,
        n_batch=32,
        max_tokens=2024,  # NOTE(review): possibly a typo for 2048 — confirm intent
        # FIX: was `top_p=3` — top_p is a nucleus-sampling probability in
        # (0, 1]; 3 is out of range (effectively no filtering). Use 1 to
        # match get_nerualchat7bv3_2.
        top_p=1,
        callback_manager=callback_manager,
        verbose=True,  # Verbose is required to pass to the callback manager
    )
    return mixtral8x7b

def get_nerualchat7bv3_2():
    """Build a Neural-Chat-7B-v3.2 LLM backed by local GGUF weights.

    Loads the Q5_K_M quantized weights through LangChain's ``LlamaCpp``
    wrapper and streams generated tokens to stdout via the module-level
    ``callback_manager``.

    NOTE(review): the function name misspells "neural" as "nerual"; it is
    kept as-is because callers depend on it — rename in a coordinated change.

    Returns:
        LlamaCpp: configured Neural-Chat model instance.
    """
    neuralchat7bv3_2 = LlamaCpp(
        model_path="model/weight/neural-chat-7b-v3-2.Q5_K_M.gguf",
        temperature=0.75,
        n_gpu_layers=33,  # offload 33 layers to the GPU selected above
        n_ctx=20000,
        # FIX: was `n_thread` — the correct LlamaCpp/llama-cpp-python field is
        # `n_threads`; the misspelled kwarg never set the CPU thread count.
        n_threads=30,
        n_batch=32,
        max_tokens=512,
        top_p=1,
        callback_manager=callback_manager,
        verbose=True,  # Verbose is required to pass to the callback manager
    )
    return neuralchat7bv3_2