diff --git a/docs/api.md b/docs/api.md
index 9fec79a28..f361823bc 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -38,6 +38,7 @@ Advanced parameters:
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `system`: system prompt (overrides what is defined in the `Modelfile`)
 - `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
+- `context`: the context parameter returned from a previous request to `/generate`; this can be used to keep a short conversational memory
 
 ### Request
 
@@ -71,6 +72,7 @@ The final response in the stream also includes additional data about the generat
 - `prompt_eval_duration`: time spent in nanoseconds evaluating the prompt
 - `eval_count`: number of tokens in the response
 - `eval_duration`: time in nanoseconds spent generating the response
+- `context`: an encoding of the conversation used in this response; this can be sent in the next request to keep a conversational memory
 
 To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration`.
 
@@ -78,6 +80,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 {
   "model": "llama2:7b",
   "created_at": "2023-08-04T19:22:45.499127Z",
+  "context": [1, 2, 3],
   "done": true,
   "total_duration": 5589157167,
   "load_duration": 3013701500,
diff --git a/examples/python/client.py b/examples/python/client.py
new file mode 100644
index 000000000..599ebec7b
--- /dev/null
+++ b/examples/python/client.py
@@ -0,0 +1,38 @@
+import json
+import requests
+
+# NOTE: ollama must be running for this to work; start the ollama app or run `ollama serve`
+model = 'llama2'  # TODO: update this to whatever model you wish to use
+
+def generate(prompt, context):
+    r = requests.post('http://localhost:11434/api/generate',
+                      json={
+                          'model': model,
+                          'prompt': prompt,
+                          'context': context,
+                      },
+                      stream=True)
+    r.raise_for_status()
+
+    for line in r.iter_lines():
+        body = json.loads(line)
+        response_part = body.get('response', '')
+        # the response streams one token at a time; print it as we receive it
+        print(response_part, end='', flush=True)
+
+        if 'error' in body:
+            raise Exception(body['error'])
+
+        if body.get('done', False):
+            return body['context']
+
+def main():
+    context = []  # the context stores the conversation history; passing it back makes the model aware of previous exchanges
+    while True:
+        user_input = input("Enter a prompt: ")
+        print()
+        context = generate(user_input, context)
+        print()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
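
For reference, here is a minimal scripted sketch of the `context` round trip documented above. Unlike the interactive loop in `client.py`, it sends two fixed prompts so the effect of passing `context` back is easy to see. It assumes `ollama serve` is running locally on the default port and that the `llama2` model has already been pulled; the prompts and the `ask` helper are illustrative only, not part of this change.

```python
import json
import requests

def ask(prompt, context):
    # stream tokens from /api/generate, passing the previous context back in
    r = requests.post('http://localhost:11434/api/generate',
                      json={'model': 'llama2', 'prompt': prompt, 'context': context},
                      stream=True)
    r.raise_for_status()
    for line in r.iter_lines():
        if not line:
            continue
        body = json.loads(line)
        print(body.get('response', ''), end='', flush=True)
        if body.get('done', False):
            print()
            return body['context']  # encoding of the conversation so far

context = []  # an empty context starts a fresh conversation
context = ask('My favourite colour is teal. Reply with just OK.', context)
# because the returned context is sent with the second request,
# the model can recall what was said in the first one
context = ask('What is my favourite colour?', context)
```

Each call returns the `context` array from the final streamed object, which is then forwarded with the next request to continue the same conversation.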