Commit

add metadata filtering
omar-sol committed Feb 20, 2024
1 parent beeea5a commit e0aadb4
Showing 2 changed files with 78 additions and 74 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -161,3 +161,4 @@ cython_debug/
#.idea/

notebooks/mini-llama-articles/
.vscode/
151 changes: 77 additions & 74 deletions scripts/ai-tutor.ipynb
@@ -182,7 +182,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -196,7 +196,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -207,105 +207,67 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.embeddings.openai import OpenAIEmbedding\n",
"from llama_index.llms.openai import OpenAI\n",
"from llama_index.core.vector_stores import (\n",
" ExactMatchFilter,\n",
" MetadataFilters,\n",
" MetadataFilter,\n",
" FilterOperator,\n",
" FilterCondition,\n",
")\n",
"\n",
"\n",
"filters = MetadataFilters(\n",
" filters=[\n",
" MetadataFilter(key=\"source\", value=\"lanchain_course\"),\n",
" MetadataFilter(key=\"source\", value=\"langchain_docs\"),\n",
" ],\n",
" condition=FilterCondition.OR,\n",
")\n",
"\n",
"llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-0125\", max_tokens=None)\n",
"embeds = OpenAIEmbedding(model=\"text-embedding-3-large\", mode=\"text_search\")\n",
"query_engine = index.as_query_engine(llm=llm, similarity_top_k=5, embed_model=embeds)"
"# query_engine = index.as_query_engine(\n",
"# llm=llm, similarity_top_k=5, embed_model=embeds, verbose=True, streaming=True, filters=filters\n",
"# )\n",
"query_engine = index.as_query_engine(\n",
" llm=llm, similarity_top_k=5, embed_model=embeds, verbose=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"res = query_engine.query(\"What is the LLama model?\")"
"res = query_engine.query(\"What is the LLama model?\")\n",
"\n",
"# history = \"\" \n",
"# for token in res.response_gen:\n",
"# history += token\n",
"# print(history)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The Llama model is a new family of pre-trained and finetuned models released by Meta in mid-July. It includes different sizes such as 7B, 13B, and 70B, with corresponding papers describing their characteristics and learning process. The models are based on the standard transformer architecture and utilize techniques like RMSNorm normalization, SwiGLU activation, and rotatory positional embedding. The 70B model specifically applies the grouped-query attention (GQA) technique to speed up inference.'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"res.response"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Node ID\t 7307e8a4-c4bd-4992-a68c-5230340f01c7\n",
"Source\t hf_transformers\n",
"Title\t Train\n",
"Text\t ged” arrays, so every tokenized sample would have to be padded to the length of the longest sample in the whole\n",
"dataset. That’s going to make your array even bigger, and all those padding tokens will slow down training too! Loading data as a tf.data.Dataset If you want to avoid slowing down training, you can load your data as a tf.data.Dataset instead. Although you can write your own\n",
"tf.data pipeline if you want, we have two convenience methods for doing this: prepare_tf_dataset(): This is the method we recommend in most cases. Because it is a method\n",
"on your model, it can inspect the model to automatically figure out which columns are usable as model inputs, and\n",
"discard the others to make a simpler, more performant dataset. to_tf_dataset: This method is more low-level, and is useful when you want to exactly control how\n",
"your dataset is created, by specifying exactly which columns and label_cols to include. Before you can use prepare_tf_dataset(), you will need to add the\n",
"Score\t 0.5175680124550022\n",
"-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
"Node ID\t 346a1018-8b33-4d83-b78f-2ba1b94f5e3b\n",
"Source\t openai\n",
"Title\t Researcher Access Program\n",
"Text\t There are a number of research directions we are excited to explore with the OpenAI API. If you are interested in the opportunity for subsidized access, please provide us with details about your research use case on the Researcher Access Program application.In particular, we consider the following to be especially important directions, though you are free to craft your own direction:Alignment: How can we understand what objective, if any, a model is best understood as pursuing? How do we increase the extent to which that objective is aligned with human preferences, such as via prompt design or fine-tuning?Fairness and representation: How should performance criteria be established for fairness and representation in language models? How can language models be improved in order to effectively support the goals of fairness and representation in specific, deployed contexts?Interdisciplinary research: How can AI development draw on insights from other disciplines such as philosophy, cognitive science, and sociolinguistics?Interpretability and transparency: How do these models work, mechanistically? Can we identify what concepts they're using, or extract latent knowledge from the model, make inferences about the training procedure, or predict surprising future behavior?Misuse potential: How can systems like the API be misused? What sorts of 'red teaming' approaches can we develop to help us and other AI developers think about responsibly deploying technologies like this?Model exploration: Models like those served by the API have a variety of capabilities which we have yet to explore. We're excited by investigations in many areas including model limitations, linguistic properties, commonsense reasoning, and potential uses for many other problems.Robustness: Generative models have uneven capability surfaces, with the potential for surprisingly strong and surprisingly weak areas of capability. How robust are large generative models to 'natural' perturbations in the prompt, such as phrasing the same idea in different ways or with or without typos? Can we predict the kinds of domains and tasks for which large generative models are more likely to be robust (or not robust), and how does this relate to the training data? Are there techniques we can use to predict and mitigate worst-case behavior? How can robustness be measured in the context of few-shot learning (e.g., across variations in prompts)? Can we train models so that they satisfy safety properties with a very high level of reliability, even under adversarial inputs?Please note that due to a high volume of requests, it takes time for us to review these applications and not all research will be prioritized for subsidy. We will only be in touch if your application is selected for subsidy.\n",
"Score\t 0.5129222370072439\n",
"-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
"Node ID\t ff0f2362-ddf7-4116-ac38-465dae37886a\n",
"Source\t towards_ai\n",
"Title\t Fine-Tuning a Llama-2 7B Model for Python Code Generation\n",
"Text\t New Llama-2 model In mid-July, Meta released its new family of pre-trained and finetuned models called Llama-2, with an open source and commercial character to facilitate its use and expansion. The base model was released with a chat version and sizes 7B, 13B, and 70B. Together with the models, the corresponding papers were published describing their characteristics and relevant points of the learning process, which provide very interesting information on the subject. For pre-training, 40% more tokens were used, reaching 2T, the context length was doubled and the grouped-query attention (GQA) technique was applied to speed up inference on the heavier 70B model. On the standard transformer architecture, RMSNorm normalization, SwiGLU activation, and rotatory positional embedding are used, the context length reaches 4096 tokens, and an Adam optimizer is applied with a cosine learning rate schedule, a weight decay of 0.1 and gradient clipping. \n",
"Score\t 0.49847282129286796\n",
"-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
"Node ID\t b0449da9-480c-48ea-80a3-35cfd84dbb48\n",
"Source\t hf_transformers\n",
"Title\t LayoutLMv2Tokenizer\n",
"Text\t class transformers.LayoutLMv2Tokenizer < source > ( vocab_file do_lower_case = True do_basic_tokenize = True never_split = None unk_token = '[UNK]' sep_token = '[SEP]' pad_token = '[PAD]' cls_token = '[CLS]' mask_token = '[MASK]' cls_token_box = [0, 0, 0, 0] sep_token_box = [1000, 1000, 1000, 1000] pad_token_box = [0, 0, 0, 0] pad_token_label = -100 only_label_first_subword = True tokenize_chinese_chars = True strip_accents = None model_max_length: int = 512 additional_special_tokens: typing.Optional[typing.List[str]] = None **kwargs ) Construct a LayoutLMv2 tokenizer. Based on WordPiece. LayoutLMv2Tokenizer can be used to turn words, word-level\n",
"bounding boxes and optional word labels to token-level input_ids, attention_mask, token_type_ids, bbox, and\n",
"optional labels (for token classification). This tokenizer inherits from PreTrainedTokenizer which contains most of the main methods. Users should refer to\n",
"this superclass for more information regarding those methods. LayoutLM\n",
"Score\t 0.488783381968426\n",
"-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n",
"Node ID\t bdb45412-1d60-4c22-9de7-a3a469ac675a\n",
"Source\t hf_transformers\n",
"Title\t Train\n",
"Text\t tokenizer outputs to your dataset as columns, as shown in\n",
"the following code sample: Copied def tokenize_dataset(data):\n",
" # Keys of the returned dictionary will be added to the dataset as columns\n",
" return tokenizer(data[\"text\"])\n",
"dataset = dataset.map(tokenize_dataset) Remember that Hugging Face datasets are stored on disk by default, so this will not inflate your memory usage! Once the\n",
"columns have been added, you can stream batches from the dataset and add padding to each batch, which greatly\n",
"reduces the number of padding tokens compared to padding the entire dataset. Copied >>> tf_dataset = model.prepare_tf_dataset(dataset[\"train\"], batch_size=16, shuffle=True, tokenizer=tokenizer) Note that in the code sample above, you need to pass the tokenizer to prepare_tf_dataset so it can correctly pad batches as they’re loaded.\n",
"If all the samples in your dataset are the same length and no padding is necessary, you can skip this argument.\n",
"If you need to do something mor\n",
"Score\t 0.4819307254673903\n",
"-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n"
]
}
],
"outputs": [],
"source": [
"for src in res.source_nodes:\n",
" print(\"Node ID\\t\", src.node_id)\n",
@@ -316,6 +278,47 @@
" print(\"-_\"*20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Markdown, display\n",
"# define prompt viewing function\n",
"def display_prompt_dict(prompts_dict):\n",
" for k, p in prompts_dict.items():\n",
" text_md = f\"**Prompt Key**: {k}<br>\" f\"**Text:** <br>\"\n",
" display(Markdown(text_md))\n",
" print(p.get_template())\n",
" display(Markdown(\"<br><br>\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompts_dict = query_engine.get_prompts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"display_prompt_dict(prompts_dict)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,

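Note: the diff above builds a MetadataFilters object (two filters on the "source" metadata key, combined with FilterCondition.OR) but leaves the filtered query engine commented out, so the filters are defined without yet being applied. Below is a minimal sketch of how they could be wired in. It assumes index, llm, and embeds are already constructed as in the earlier notebook cells, and that "langchain_course" and "langchain_docs" are placeholders for whatever source values actually exist in the node metadata.

from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterCondition,
)

# Keep only nodes whose "source" metadata matches either value.
filters = MetadataFilters(
    filters=[
        MetadataFilter(key="source", value="langchain_course"),
        MetadataFilter(key="source", value="langchain_docs"),
    ],
    condition=FilterCondition.OR,
)

# Same engine construction as in the notebook, with the filters applied
# so retrieval is restricted to the selected sources before the LLM answers.
query_engine = index.as_query_engine(
    llm=llm,
    similarity_top_k=5,
    embed_model=embeds,
    filters=filters,
)

res = query_engine.query("What is the Llama model?")
print(res.response)

With FilterCondition.OR, a node passes if its source matches either value; switching to FilterCondition.AND would require both, which can never hold for two different values of the same key.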