3333Coherence Vectors
3434=================
3535
36- Coherence Python client can handle few different types of vector,
37- this example will use the FloatVector type
36+ Coherence Python client can handle a few different types of vector,
37+ this example will use the FloatVector type.
3838
3939Just like any other data type in Coherence, vectors are stored in normal
4040Coherence caches. The vector may be stored as the actual cache value,
5757the cache. The dataset is a json file and the example will use Coherence json
5858support to read and store the data.
5959
60- The schema of the JSON movie data looks like this
60+ The schema of the JSON movie data looks like this:
6161
6262+--------------------+-------------------------------------------------------+
6363| Field Name | Description |
100100
101101The SimilaritySearch aggregator is used to perform a Knn vector search on a
102102cache in the same way that normal Coherence aggregators are used.
103-
104103"""
105104
106105
@@ -115,7 +114,7 @@ class MovieRepository:
115114 """
116115
117116 EMBEDDING_DIMENSIONS : Final [int ] = 384
118- """Embedding dimension for all-MiniLM-L6-v2"""
117+ """Embedding dimension for all-MiniLM-L6-v2. """
119118
120119 VECTOR_FIELD : Final [str ] = "embeddings"
121120 """The name of the field in the json containing the embeddings."""
@@ -125,7 +124,7 @@ class MovieRepository:
125124
126125 def __init__ (self , movies : NamedMap ) -> None :
127126 """
128- Creates an instance of the MovieRepository
127+ Creates an instance of the MovieRepository.
129128
130129 :param movies: The Coherence NamedMap is the cache used to store the
131130 movie data.
@@ -137,10 +136,10 @@ def __init__(self, movies: NamedMap) -> None:
137136
138137 async def load (self , filename : str ) -> None :
139138 """
140- Loads the movie data into the NamedMao using the specified zip file
139+ Loads the movie data into the NamedMap using the specified gzip file.
141140
142- :param filename: Name of the movies json zip file
143- :return: None
141+ :param filename: Name of the movies json zip file.
142+ :return: None.
144143 """
145144 try :
146145 with gzip .open (filename , "rt" , encoding = "utf-8" ) as f :
@@ -155,34 +154,35 @@ async def load(self, filename: str) -> None:
155154 try :
156155 f .close ()
157156 except NameError :
158- pass # File was never opened, so nothing to close
157+ pass # File was never opened, so nothing to close.
159158 except Exception as e :
160159 print (f"An error occurred while closing the file: { e } " )
161160
162161 # iterate over list of movie objects (dictionary) to load them into
163- # Coherence cache
162+ # Coherence cache.
164163 for movie in data :
165164 # get the title of the movie
166165 title : str = movie .get ("title" )
167166 # get the full plot of the movie
168167 full_plot : str = movie .get ("fullplot" )
169168 key : str = title
170- # text of the full_plot converted to a vector
169+ # text of the full_plot converted to a vector.
171170 vector : FloatVector = self .vectorize (full_plot )
172- # vector is added to the movie object
171+ # vector is added to the movie object.
173172 movie [self .VECTOR_FIELD ] = vector
174- # The movie object is added to the cache using the "title" field as the cache key
173+ # The movie object is added to the cache using the "title" field
174+ # as the cache key.
175175 await self .movies .put (key , movie )
176176
177177 def vectorize (self , input_string : str ) -> FloatVector :
178- """vectorize method takes a String value and returns a FloatVector"""
178+ """vectorize method takes a String value and returns a FloatVector. """
179179
180180 # model used to create embeddings for the input_string
181- # in this example model used is onnx-models/all-MiniLM-L6-v2-onnx
181+ # in this example model used is onnx-models/all-MiniLM-L6-v2-onnx.
182182 embeddings : List [float ] = self .model .encode (input_string ).tolist ()
183183
184184 # The vector returned is normalized, which makes future operations on
185- # the vector more efficient
185+ # the vector more efficient.
186186 return FloatVector (Vectors .normalize (embeddings ))
187187
188188 async def search (self , search_text : str , count : int , filter : Filter = Filters .always ()) -> List [QueryResult ]:
@@ -194,23 +194,23 @@ async def search(self, search_text: str, count: int, filter: Filter = Filters.al
194194 parameter can be supplied. The filter is used to reduce the cache entries used
195195 to perform the k-nn search.
196196
197- :param search_text: the text to nearest match on the movie full plot
198- :param count: the count of the nearest matches to return :param
199- filter: an optional Filter to use to further reduce the movies to be
200- queried
201- :return: a List of QueryResult objects
197+ :param search_text: the text to nearest match on the movie full plot.
198+ :param count: the count of the nearest matches to return.
199+ :param filter: an optional Filter to use to further reduce the movies
200+ to be queried.
201+ :return: a List of QueryResult objects.
202202 """
203203
204204 # create a FloatVector of the search_text
205205 vector : FloatVector = self .vectorize (search_text )
206- # create the SimilaritySearch aggregator using the above vector and count
206+ # create the SimilaritySearch aggregator using the above vector and count.
207207 search : SimilaritySearch = SimilaritySearch (self .VALUE_EXTRACTOR , vector , count )
208208 # perform the k-nn search using the above aggregator and optional filter and
209- # returns a list of QueryResults
209+ # return a list of QueryResults.
210210 return await self .movies .aggregate (search , filter = filter )
211211
212212
213- # Name of the compressed gzip json file that has data for the movies
213+ # Name of the compressed gzip json file that has data for the movies.
214214MOVIE_JSON_FILENAME : Final [str ] = "movies.json.gzip"
215215
216216
@@ -219,7 +219,7 @@ async def do_run() -> None:
219219 # Create a new session to the Coherence server using the default host and
220220 # port i.e. localhost:1408
221221 session : Session = await Session .create ()
222- # Create a NamedMao called movies with key of str and value of dict
222+ # Create a NamedMap called movies with key of str and value of dict
223223 movie_db : NamedMap [str , dict ] = await session .get_map ("movies" )
224224 try :
225225 # an instance of class MovieRepository is created, passing the above
@@ -236,7 +236,7 @@ async def do_run() -> None:
236236 # the nearest matches. The second parameter is a count of the number
237237 # of nearest neighbours to search for.
238238 #
239- # Below a search for five movies roughly based on "star travel and space ships"
239+ # Below, a search for five movies roughly based on "star travel and space ships"
240240 # is being done
241241 results = await movies_repo .search ("star travel and space ships" , 5 )
242242 print ("Search results:" )
@@ -248,8 +248,8 @@ async def do_run() -> None:
248248 # to reduce the cache entries used to perform the nearest neighbours
249249 # (k-nn) search.
250250 #
251- # Below any movie with a plot similar to "star travel and space
252- # ships" was searched for. In addition a Filter is used to narrow down
251+ # Below, any movie with a plot similar to "star travel and space
252+ # ships" is searched for. In addition, a Filter is used to narrow down
253253 # the search i.e. movies that starred "Harrison Ford". The filter
254254 # will be applied to the cast field of the json object.
255255 cast_extractor = Extractors .extract ("cast" )
0 commit comments