
Text Vector Search

You'll find this example as well as the saved template in the main repository of superduper. See here.

If you'd like to modify the template, or practice building it yourself, you can rerun the build.ipynb notebook in the template directory.

Connect to superduper​

from superduper import superduper

db = superduper('mongomock://test_db')
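
The mongomock URI gives you an in-memory database for testing. As a sketch, assuming you have a real MongoDB deployment available, you can pass its URI in the same way (the host, port and database name below are placeholders):

# Sketch: connect to a running MongoDB deployment instead of mongomock.
# 'localhost:27017' and 'test_db' are placeholders for your own instance.
db = superduper('mongodb://localhost:27017/test_db')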

Get useful sample data​

!curl -O https://superduperdb-public-demo.s3.amazonaws.com/text.json
import json

with open('text.json', 'r') as f:
    data = json.load(f)

datas = [{'x': d} for d in data]
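
Each entry in datas wraps one raw text string under the key 'x'. A quick look at what was loaded (plain Python, just for orientation):

print(len(datas))           # number of documents that will be inserted
print(datas[0]['x'][:200])  # first 200 characters of the first text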

Create datatype​

SuperduperDB supports automatic data conversion, so users don’t need to worry about the compatibility of different data formats (PIL.Image, numpy.array, pandas.DataFrame, etc.) with the database.

It also supports custom data conversion methods for transforming data, such as the Datatype defined below.

datatype = 'str'        
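
For plain text, the built-in 'str' datatype is all that's needed. As a sketch only (not required for this example), a non-text field such as an image would instead use a vendor datatype, e.g. pil_image from the superduper_pillow plugin; this assumes that plugin is installed, and the schema identifier and field name below are hypothetical:

from superduper import Schema
from superduper_pillow import pil_image  # assumption: the superduper_pillow plugin is installed

# Hypothetical image schema; this text example sticks with datatype = 'str'.
image_schema = Schema(identifier="image-schema", fields={"img": pil_image})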

Setup tables or collections​

from superduper.components.table import Table
from superduper import Schema

schema = Schema(identifier="schema", fields={"x": datatype})
table = Table("docs", schema=schema)
select = db['docs'].select()
Apply a chunker for search

note

Note that applying a chunker is not mandatory for search. If your data is already chunked (e.g. short text snippets or audio) or if you are searching through something like images, which can't be chunked, then this won't be necessary.

from superduper import model

CHUNK_SIZE = 200

@model(flatten=True, model_update_kwargs={'document_embedded': False})
def chunker(text):
    text = text.split()
    chunks = [' '.join(text[i:i + CHUNK_SIZE]) for i in range(0, len(text), CHUNK_SIZE)]
    return chunks
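
As a quick sanity check of the chunking logic itself (plain Python, independent of the model wrapper; a chunk size of 2 is used here just so the output is easy to read):

# Split a toy string into chunks of at most 2 words, mirroring the logic above.
words = "one two three four five".split()
print([' '.join(words[i:i + 2]) for i in range(0, len(words), 2)])
# -> ['one two', 'three four', 'five']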

Now we wrap this chunker as a Listener, so that it processes incoming data:

from superduper import Listener

upstream_listener = Listener(
    model=chunker,
    select=db['docs'].select(),
    key='x',
    uuid="chunk",
    identifier='chunker',
)

Select outputs of upstream listener​

note

This is useful if you have performed a first step, such as pre-computing features, or chunking your data. You can use this query to operate on those outputs.

indexing_key = upstream_listener.outputs
indexing_key

Build text embedding model​

from superduper_openai import OpenAIEmbedding
import os

os.environ['OPENAI_API_KEY'] = 'sk-<secret>'

embedding_model = OpenAIEmbedding(identifier='text-embedding-ada-002')
print(len(embedding_model.predict("What is superduper")))
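
The example above uses the OpenAI model name as the identifier. Assuming the plugin resolves the model from the identifier in the same way, a newer embedding model can be swapped in with a one-line change (a sketch, not required for this template):

# Hypothetical swap to a newer OpenAI embedding model; assumes the identifier
# is passed through as the model name, exactly as with 'text-embedding-ada-002' above.
small_embedding_model = OpenAIEmbedding(identifier='text-embedding-3-small')
print(len(small_embedding_model.predict("What is superduper")))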

Create vector-index​

vector_index_name = 'my-vector-index'
from superduper import VectorIndex, Listener

vector_index = VectorIndex(
    vector_index_name,
    indexing_listener=Listener(
        key=indexing_key,  # the `Document` key the model should ingest to create embeddings
        select=db[indexing_key].select(),  # a `Select` query specifying which data to search over
        model=embedding_model,  # a `_Predictor` defining how to convert data to embeddings
        identifier=f'{embedding_model.identifier}-listener',
        upstream=[table, upstream_listener],  # ensures the table is already set up when the other components are triggered
    )
)
from superduper import Application

application = Application(
    'text-vector-search',
    components=[
        table,
        upstream_listener,
        vector_index,
    ]
)
db.apply(application)
application.info(verbosity=2)
db['docs'].insert(datas).execute()
select = db['docs'].select()
db.databackend.db.list_collection_names()
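
To confirm that the insert ran and the chunker produced outputs, you can count the documents in the source collection and in the outputs collection named by indexing_key:

print(len(list(db['docs'].select().execute())))        # inserted source documents
print(len(list(db[indexing_key].select().execute())))  # chunks produced by the upstream listener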
from superduper import Document
# Perform the vector search based on the query
item = Document({indexing_key: "Tell me about vector-search"})
results = db[indexing_key].like(item, vector_index=vector_index_name, n=10).select().execute()
for result in results:
    print("\n", '-' * 20, '\n')
    print(Document(result.unpack())[indexing_key])
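
The same vector index can be queried again with a different prompt or a smaller n, for example:

# A second query against the same index, keeping only the top 3 hits.
item = Document({indexing_key: "What is superduper?"})
for result in db[indexing_key].like(item, vector_index=vector_index_name, n=3).select().execute():
    print(Document(result.unpack())[indexing_key][:100])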
from superduper import Template

t = Template(
    'vector-search',
    template=application,
    substitutions={'docs': 'table_name'},
)
t.export('.')
!cat component.json | jq .
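
If jq isn't installed, Python's built-in json.tool pretty-prints the exported component just as well:

!python -m json.tool component.json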