
Add fastchat api LLM invocation tool

glide-the 2 years ago
parent
commit
e16fcbe641
5 changed files with 741 additions and 0 deletions
  1. fastchat/__init__.py (+0 −0)
  2. fastchat/api/__init__.py (+1 −0)
  3. fastchat/api/conversation.py (+261 −0)
  4. fastchat/api/fastchat_api.py (+478 −0)
  5. requirements.txt (+1 −0)

+ 0 - 0
fastchat/__init__.py


+ 1 - 0
fastchat/api/__init__.py

@@ -0,0 +1 @@
+from .fastchat_api import *

+ 261 - 0
fastchat/api/conversation.py

@@ -0,0 +1,261 @@
+"""
+Conversation prompt template.
+
+Now we support
+- Vicuna
+- Koala
+- OpenAssistant/oasst-sft-1-pythia-12b
+- StabilityAI/stablelm-tuned-alpha-7b
+- databricks/dolly-v2-12b
+- THUDM/chatglm-6b
+- Alpaca/LLaMa
+"""
+
+import dataclasses
+from enum import auto, Enum
+from typing import List, Tuple, Any
+
+
+class SeparatorStyle(Enum):
+    """Different separator style."""
+
+    SINGLE = auto()
+    TWO = auto()
+    DOLLY = auto()
+    OASST_PYTHIA = auto()
+
+
+@dataclasses.dataclass
+class Conversation:
+    """A class that keeps all conversation history."""
+
+    system: str
+    roles: List[str]
+    messages: List[List[str]]
+    offset: int
+    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+    sep: str = "###"
+    sep2: str = None
+
+    # Used for gradio server
+    skip_next: bool = False
+    conv_id: Any = None
+
+    def get_prompt(self):
+        if self.sep_style == SeparatorStyle.SINGLE:
+            ret = self.system
+            for role, message in self.messages:
+                if message:
+                    ret += self.sep + " " + role + ": " + message
+                else:
+                    ret += self.sep + " " + role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.TWO:
+            seps = [self.sep, self.sep2]
+            ret = self.system + seps[0]
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.DOLLY:
+            seps = [self.sep, self.sep2]
+            ret = self.system
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ":\n" + message + seps[i % 2]
+                    if i % 2 == 1:
+                        ret += "\n\n"
+                else:
+                    ret += role + ":\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.OASST_PYTHIA:
+            ret = self.system
+            for role, message in self.messages:
+                if message:
+                    ret += role + message + self.sep
+                else:
+                    ret += role
+            return ret
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+
+    def append_message(self, role, message):
+        self.messages.append([role, message])
+
+    def to_gradio_chatbot(self):
+        ret = []
+        for i, (role, msg) in enumerate(self.messages[self.offset :]):
+            if i % 2 == 0:
+                ret.append([msg, None])
+            else:
+                ret[-1][-1] = msg
+        return ret
+
+    def copy(self):
+        return Conversation(
+            system=self.system,
+            roles=self.roles,
+            messages=[[x, y] for x, y in self.messages],
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            conv_id=self.conv_id,
+        )
+
+    def dict(self):
+        return {
+            "system": self.system,
+            "roles": self.roles,
+            "messages": self.messages,
+            "offset": self.offset,
+            "sep": self.sep,
+            "sep2": self.sep2,
+            "conv_id": self.conv_id,
+        }
+
+
+conv_one_shot = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(
+        (
+            "Human",
+            "What are the key differences between renewable and non-renewable energy sources?",
+        ),
+        (
+            "Assistant",
+            "Renewable energy sources are those that can be replenished naturally in a relatively "
+            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
+            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
+            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
+            "renewable and non-renewable energy sources:\n"
+            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
+            "energy sources are finite and will eventually run out.\n"
+            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
+            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
+            "and other negative effects.\n"
+            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
+            "have lower operational costs than non-renewable sources.\n"
+            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
+            "locations than non-renewable sources.\n"
+            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
+            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
+            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
+            "non-renewable sources are not, and their depletion can lead to economic and social instability.",
+        ),
+    ),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+
+
+conv_vicuna_v1_1 = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+    roles=("USER", "ASSISTANT"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+
+
+conv_koala_v1 = Conversation(
+    system="BEGINNING OF CONVERSATION:",
+    roles=("USER", "GPT"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+
+conv_dolly = Conversation(
+    system="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n",
+    roles=("### Instruction", "### Response"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.DOLLY,
+    sep="\n\n",
+    sep2="### End",
+)
+
+conv_oasst = Conversation(
+    system="",
+    roles=("<|prompter|>", "<|assistant|>"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.OASST_PYTHIA,
+    sep="<|endoftext|>",
+)
+
+conv_stablelm = Conversation(
+    system="""<|SYSTEM|># StableLM Tuned (Alpha version)
+- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
+- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
+- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
+- StableLM will refuse to participate in anything that could harm a human.
+""",
+    roles=("<|USER|>", "<|ASSISTANT|>"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.OASST_PYTHIA,
+    sep="",
+)
+
+conv_templates = {
+    "conv_one_shot": conv_one_shot,
+    "vicuna_v1.1": conv_vicuna_v1_1,
+    "koala_v1": conv_koala_v1,
+    "dolly": conv_dolly,
+    "oasst": conv_oasst,
+}
+
+
+def get_default_conv_template(model_name):
+    model_name = model_name.lower()
+    if "vicuna" in model_name or "output" in model_name:
+        return conv_vicuna_v1_1
+    elif "koala" in model_name:
+        return conv_koala_v1
+    elif "dolly-v2" in model_name:
+        return conv_dolly
+    elif "oasst" in model_name and "pythia" in model_name:
+        return conv_oasst
+    elif "stablelm" in model_name:
+        return conv_stablelm
+    return conv_one_shot
+
+
+def compute_skip_echo_len(model_name, conv, prompt):
+    model_name = model_name.lower()
+    if "chatglm" in model_name:
+        skip_echo_len = len(conv.messages[-2][1]) + 1
+    elif "dolly-v2" in model_name:
+        special_toks = ["### Instruction:", "### Response:", "### End"]
+        skip_echo_len = len(prompt)
+        for tok in special_toks:
+            skip_echo_len -= prompt.count(tok) * len(tok)
+    elif "oasst" in model_name and "pythia" in model_name:
+        special_toks = ["<|prompter|>", "<|assistant|>", "<|endoftext|>"]
+        skip_echo_len = len(prompt)
+        for tok in special_toks:
+            skip_echo_len -= prompt.count(tok) * len(tok)
+    elif "stablelm" in model_name:
+        special_toks = ["<|SYSTEM|>", "<|USER|>", "<|ASSISTANT|>"]
+        skip_echo_len = len(prompt)
+        for tok in special_toks:
+            skip_echo_len -= prompt.count(tok) * len(tok)
+    else:
+        skip_echo_len = len(prompt) + 1 - prompt.count("</s>") * 3
+    return skip_echo_len
+
+
+if __name__ == "__main__":
+    print(conv_one_shot.get_prompt())
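
For reference, a minimal sketch of how these templates are driven (this mirrors what fastchat/api/fastchat_api.py below does; the model name and question are illustrative, and it assumes the repository root is on PYTHONPATH):

    from fastchat.api.conversation import get_default_conv_template, compute_skip_echo_len

    conv = get_default_conv_template("vicuna-13b").copy()
    conv.append_message(conv.roles[0], "What is the capital of France?")
    conv.append_message(conv.roles[1], None)   # placeholder for the assistant's reply
    prompt = conv.get_prompt()                 # full prompt string sent to the worker
    # a worker response echoes the prompt; compute_skip_echo_len tells the caller
    # how many leading characters of the returned text to discard
    skip_echo_len = compute_skip_echo_len("vicuna-13b", conv, prompt)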

+ 478 - 0
fastchat/api/fastchat_api.py

@@ -0,0 +1,478 @@
+"""Wrapper around FastChat APIs."""
+from __future__ import annotations
+
+import logging
+import sys
+import warnings
+from typing import (
+    AbstractSet,
+    Any,
+    Callable,
+    Collection,
+    Dict,
+    Generator,
+    List,
+    Literal,
+    Mapping,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)
+
+from pydantic import Extra, Field, root_validator
+from tenacity import (
+    before_sleep_log,
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
+
+from langchain.llms.base import BaseLLM
+from langchain.schema import Generation, LLMResult
+from langchain.utils import get_from_dict_or_env
+from .conversation import (
+    get_default_conv_template,
+    compute_skip_echo_len,
+    SeparatorStyle,
+)
+import requests
+import json
+
+
+logger = logging.getLogger(__name__)
+FAST_CHAT_API = "http://localhost:21001/worker_generate_stream"
+
+
+def _streaming_response_template() -> Dict[str, Any]:
+    """
+    :return: 响应结构
+    """
+    return {
+        "text": "",
+        "error_code": 0,
+    }
+
+
+def _update_response(response: Dict[str, Any], stream_response: Dict[str, Any]) -> None:
+    """Update response from the stream response."""
+    response["text"] += stream_response["text"]
+    response["error_code"] += stream_response["error_code"]
+
+
+class BaseFastChat(BaseLLM):
+    """Wrapper around FastChat large language models."""
+
+    model_name: str = "text-davinci-003"
+    """Model name to use."""
+    temperature: float = 0.7
+    """What sampling temperature to use."""
+    max_new_tokens: int = 256
+    """Maximum number of new tokens to generate."""
+    stop: int = 20
+    batch_size: int = 20
+    """Batch size to use when passing multiple prompts."""
+    streaming: bool = False
+    """Whether to stream the results or not."""
+    n: int = 1
+    """How many completions to generate for each prompt."""
+    allowed_special: Union[Literal["all"], AbstractSet[str]] = set()
+    """Set of special tokens that are allowed."""
+    disallowed_special: Union[Literal["all"], Collection[str]] = "all"
+    """Set of special tokens that are not allowed."""
+
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.ignore
+
+    @root_validator(pre=True)
+    def build_extra(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        """Build extra kwargs from additional params that were passed in."""
+        all_required_field_names = {field.alias for field in cls.__fields__.values()}
+
+        extra = values.get("model_kwargs", {})
+        for field_name in list(values):
+            if field_name not in all_required_field_names:
+                if field_name in extra:
+                    raise ValueError(f"Found {field_name} supplied twice.")
+                logger.warning(
+                    f"""WARNING! {field_name} is not default parameter.
+                    {field_name} was transfered to model_kwargs.
+                    Please confirm that {field_name} is what you intended."""
+                )
+                extra[field_name] = values.pop(field_name)
+        values["model_kwargs"] = extra
+        return values
+
+    @property
+    def _default_params(self) -> Dict[str, Any]:
+        """Get the default parameters for calling FastChat API."""
+        normal_params = {
+            "model": self.model_name,
+            "prompt": '',
+            "max_new_tokens": self.max_new_tokens,
+            "temperature": self.temperature,
+        }
+
+        return {**normal_params}
+
+    def _generate(
+            self, prompts: List[str], stop: Optional[List[str]] = None
+    ) -> LLMResult:
+        """Call out to FastChat's endpoint with k unique prompts.
+
+        Args:
+            prompts: The prompts to pass into the model.
+            stop: Optional list of stop words to use when generating.
+
+        Returns:
+            The full LLM output.
+
+        Example:
+            .. code-block:: python
+
+                response = fastchat.generate(["Tell me a joke."])
+        """
+        # TODO: write a unit test for this
+        params = self._invocation_params
+        sub_prompts = self.get_sub_prompts(params, prompts)
+        choices = []
+        token_usage: Dict[str, int] = {}
+        headers = {"User-Agent": "fastchat Client"}
+        for _prompts in sub_prompts:
+
+            conv = get_default_conv_template(self.model_name).copy()
+            conv.append_message(conv.roles[0], _prompts[0])
+            conv.append_message(conv.roles[1], None)
+            prompt = conv.get_prompt()
+            params["prompt"] = prompt
+
+            # pass the caller-supplied stop word through to the worker request
+            if stop is not None:
+                if "stop" in params:
+                    raise ValueError("`stop` found in both the input and default params.")
+                params["stop"] = stop[0]
+
+            if self.streaming:
+                if len(_prompts) > 1:
+                    raise ValueError("Cannot stream results with multiple prompts.")
+
+                response_template = _streaming_response_template()
+                response = requests.post(
+                    FAST_CHAT_API,
+                    headers=headers,
+                    json=params,
+                    stream=True,
+                    )
+                for stream_resp in response.iter_lines(
+                        chunk_size=8192, decode_unicode=False, delimiter=b"\0"
+                ):
+                    if stream_resp:
+                        data = json.loads(stream_resp.decode("utf-8"))
+                        skip_echo_len = compute_skip_echo_len(self.model_name, conv, prompt)
+                        output = data["text"][skip_echo_len:].strip()
+                        data["text"] = output
+                        self.callback_manager.on_llm_new_token(
+                            output,
+                            verbose=self.verbose,
+                            logprobs=data["error_code"],
+                        )
+                        _update_response(response_template, data)
+                choices.append(response_template)
+            else:
+                response_template = _streaming_response_template()
+                response = requests.post(
+                    FAST_CHAT_API,
+                    headers=headers,
+                    json=params,
+                    stream=True,
+                    )
+                for stream_resp in response.iter_lines(
+                        chunk_size=8192, decode_unicode=False, delimiter=b"\0"
+                ):
+                    if stream_resp:
+                        data = json.loads(stream_resp.decode("utf-8"))
+                        skip_echo_len = compute_skip_echo_len(self.model_name, conv, prompt)
+                        output = data["text"][skip_echo_len:].strip()
+                        data["text"] = output
+                        _update_response(response_template, data)
+
+                choices.append(response_template)
+
+        return self.create_llm_result(choices, prompts, token_usage)
+
+    async def _agenerate(
+            self, prompts: List[str], stop: Optional[List[str]] = None
+    ) -> LLMResult:
+        """Call out to FastChat's endpoint async with k unique prompts."""
+        params = self._invocation_params
+        sub_prompts = self.get_sub_prompts(params, prompts)
+        choices = []
+        token_usage: Dict[str, int] = {}
+
+        headers = {"User-Agent": "fastchat Client"}
+        for _prompts in sub_prompts:
+
+            conv = get_default_conv_template(self.model_name).copy()
+            conv.append_message(conv.roles[0], _prompts[0])
+            conv.append_message(conv.roles[1], None)
+            prompt = conv.get_prompt()
+            params["prompt"] = prompt
+            # pass the caller-supplied stop word through to the worker request
+            if stop is not None:
+                if "stop" in params:
+                    raise ValueError("`stop` found in both the input and default params.")
+                params["stop"] = stop[0]
+
+            if self.streaming:
+                if len(_prompts) > 1:
+                    raise ValueError("Cannot stream results with multiple prompts.")
+
+                response_template = _streaming_response_template()
+                response = requests.post(
+                    FAST_CHAT_API,
+                    headers=headers,
+                    json=params,
+                    stream=True,
+                    )
+                for stream_resp in response.iter_lines(
+                        chunk_size=8192, decode_unicode=False, delimiter=b"\0"
+                ):
+                    if stream_resp:
+                        data = json.loads(stream_resp.decode("utf-8"))
+                        skip_echo_len = compute_skip_echo_len(self.model_name, conv, prompt)
+                        output = data["text"][skip_echo_len:].strip()
+                        data["text"] = output
+                        self.callback_manager.on_llm_new_token(
+                            output,
+                            verbose=self.verbose,
+                            logprobs=data["error_code"],
+                        )
+                        _update_response(response_template, data)
+                choices.append(response_template)
+            else:
+                response_template = _streaming_response_template()
+                response = requests.post(
+                    FAST_CHAT_API,
+                    headers=headers,
+                    json=params,
+                    stream=True,
+                    )
+                for stream_resp in response.iter_lines(
+                        chunk_size=8192, decode_unicode=False, delimiter=b"\0"
+                ):
+                    if stream_resp:
+                        data = json.loads(stream_resp.decode("utf-8"))
+                        skip_echo_len = compute_skip_echo_len(self.model_name, conv, prompt)
+                        output = data["text"][skip_echo_len:].strip()
+                        data["text"] = output
+                        _update_response(response_template, data)
+
+                choices.append(response_template)
+
+        return self.create_llm_result(choices, prompts, token_usage)
+
+    def get_sub_prompts(
+            self,
+            params: Dict[str, Any],
+            prompts: List[str],
+    ) -> List[List[str]]:
+        """Get the sub prompts for llm call."""
+        if params["max_new_tokens"] == -1:
+            if len(prompts) != 1:
+                raise ValueError(
+                    "max_new_tokens set to -1 not supported for multiple inputs."
+                )
+            params["max_new_tokens"] = self.max_new_tokens_for_prompt(prompts[0])
+        # split the prompts into batches of size batch_size
+        sub_prompts = [
+            prompts[i: i + self.batch_size]
+            for i in range(0, len(prompts), self.batch_size)
+        ]
+
+        return sub_prompts
+
+    def create_llm_result(
+            self, choices: Any, prompts: List[str], token_usage: Dict[str, int]
+    ) -> LLMResult:
+        """Create the LLMResult from the choices and prompts."""
+        generations = []
+        for i, _ in enumerate(prompts):
+            sub_choices = choices[i * self.n: (i + 1) * self.n]
+            generations.append(
+                [
+                    Generation(
+                        text=choice["text"],
+                        generation_info=dict(
+                            finish_reason='over',
+                            logprobs=choice["text"],
+                        ),
+                    )
+                    for choice in sub_choices
+                ]
+            )
+        llm_output = {"token_usage": token_usage, "model_name": self.model_name}
+        return LLMResult(generations=generations, llm_output=llm_output)
+
+    def stream(self, prompt: str, stop: Optional[List[str]] = None) -> Generator:
+        """Call FastChat with streaming flag and return the resulting generator.
+
+        BETA: this is a beta feature while we figure out the right abstraction.
+        Once that happens, this interface could change.
+
+        Args:
+            prompt: The prompts to pass into the model.
+            stop: Optional list of stop words to use when generating.
+
+        Returns:
+            A generator representing the stream of tokens from FastChat.
+
+        Example:
+            .. code-block:: python
+
+                generator = fastChat.stream("Tell me a joke.")
+                for token in generator:
+                    yield token
+        """
+        params = self._invocation_params
+        conv = get_default_conv_template(self.model_name).copy()
+        conv.append_message(conv.roles[0], prompt)
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+        params["prompt"] = prompt
+        # pass the caller-supplied stop word through to the worker request
+        if stop is not None:
+            if "stop" in params:
+                raise ValueError("`stop` found in both the input and default params.")
+            params["stop"] = stop[0]
+
+        headers = {"User-Agent": "fastchat Client"}
+        response = requests.post(
+            FAST_CHAT_API,
+            headers=headers,
+            json=params,
+            stream=True,
+        )
+        for stream_resp in response.iter_lines(
+                chunk_size=8192, decode_unicode=False, delimiter=b"\0"
+        ):
+            if stream_resp:
+                data = json.loads(stream_resp.decode("utf-8"))
+                skip_echo_len = compute_skip_echo_len(self.model_name, conv, prompt)
+                output = data["text"][skip_echo_len:].strip()
+                data["text"] = output
+                yield data
+
+    @property
+    def _invocation_params(self) -> Dict[str, Any]:
+        """Get the parameters used to invoke the model."""
+        return self._default_params
+
+    @property
+    def _identifying_params(self) -> Mapping[str, Any]:
+        """Get the identifying parameters."""
+        return {**{"model_name": self.model_name}, **self._default_params}
+
+    @property
+    def _llm_type(self) -> str:
+        """Return type of llm."""
+        return "fastChat"
+
+    def get_num_tokens(self, text: str) -> int:
+        """Calculate num tokens with tiktoken package."""
+        # tiktoken NOT supported for Python < 3.8
+        if sys.version_info[1] < 8:
+            return super().get_num_tokens(text)
+        try:
+            import tiktoken
+        except ImportError:
+            raise ValueError(
+                "Could not import tiktoken python package. "
+                "This is needed in order to calculate get_num_tokens. "
+                "Please install it with `pip install tiktoken`."
+            )
+
+        enc = tiktoken.encoding_for_model(self.model_name)
+
+        tokenized_text = enc.encode(
+            text,
+            allowed_special=self.allowed_special,
+            disallowed_special=self.disallowed_special,
+        )
+
+        # calculate the number of tokens in the encoded text
+        return len(tokenized_text)
+
+    def modelname_to_contextsize(self, modelname: str) -> int:
+        """Calculate the maximum number of tokens possible to generate for a model.
+
+        Args:
+            modelname: The modelname we want to know the context size for.
+
+        Returns:
+            The maximum context size
+
+        Example:
+            .. code-block:: python
+
+                max_new_tokens = fastchat.modelname_to_contextsize("vicuna-13b")
+        """
+        model_token_mapping = {
+            "vicuna-13b": 2049,
+            "koala": 2049,
+            "dolly-v2": 2049,
+            "oasst": 2049,
+            "stablelm": 2049,
+        }
+
+        context_size = model_token_mapping.get(modelname, None)
+
+        if context_size is None:
+            raise ValueError(
+                f"Unknown model: {modelname}. Please provide a valid FastChat model name. "
+                "Known models are: " + ", ".join(model_token_mapping.keys())
+            )
+
+        return context_size
+
+    def max_new_tokens_for_prompt(self, prompt: str) -> int:
+        """Calculate the maximum number of tokens possible to generate for a prompt.
+
+        Args:
+            prompt: The prompt to pass into the model.
+
+        Returns:
+            The maximum number of tokens to generate for a prompt.
+
+        Example:
+            .. code-block:: python
+
+                max_new_tokens = fastchat.max_new_tokens_for_prompt("Tell me a joke.")
+        """
+        num_tokens = self.get_num_tokens(prompt)
+
+        # get max context size for model by name
+        max_size = self.modelname_to_contextsize(self.model_name)
+        return max_size - num_tokens
+
+
+class FastChat(BaseFastChat):
+    """Wrapper around OpenAI large language models.
+
+    To use, you should have the ``openai`` python package installed, and the
+    environment variable ``OPENAI_API_KEY`` set with your API key.
+
+    Any parameters that are valid to be passed to the openai.create call can be passed
+    in, even if not explicitly saved on this class.
+
+    Example:
+        .. code-block:: python
+
+            from langchain.llms import OpenAI
+            openai = FastChat(model_name="vicuna")
+    """
+
+    @property
+    def _invocation_params(self) -> Dict[str, Any]:
+        return {**{"model": self.model_name}, **super()._invocation_params}

+ 1 - 0
requirements.txt

@@ -12,3 +12,4 @@ gradio>=3.25.0
 accelerate==0.18.0
 llama-cpp-python==0.1.34; platform_system != "Windows"
 https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.34/llama_cpp_python-0.1.34-cp310-cp310-win_amd64.whl; platform_system == "Windows"
+peft
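
As a closing note on fastchat/api/fastchat_api.py: the wrapper also exposes a streaming generator via ``stream()``. A sketch under the same worker assumption as above; with a standard FastChat worker each yielded dict carries the output accumulated so far, with the prompt echo already stripped, rather than a per-token delta:

    from fastchat.api import FastChat

    llm = FastChat(model_name="vicuna-13b")
    answer = ""
    for chunk in llm.stream("Tell me a joke."):
        answer = chunk["text"]   # cumulative text, prompt echo removed
    print(answer)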