10000 using native vector functions · doraig/azure-sql-db-openai@a13814e · GitHub
[go: up one dir, main page]

Skip to content

Commit a13814e

Browse files
committed
using native vector functions
1 parent 7d138ae commit a13814e

8 files changed

+110
-276
lines changed

python/00-setup-database.sql

Lines changed: 7 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,21 @@
1-
drop table if exists dbo.document_embeddings
2-
drop table if exists dbo.documents
1+
drop table if exists dbo.sample_documents
32
go
43

5-
create table dbo.documents
4+
create table dbo.sample_documents
65
(
76
id int constraint pk__documents primary key,
87
content nvarchar(max),
9-
embedding nvarchar(max)
8+
embedding varbinary(8000)
109
)
11-
create table dbo.document_embeddings
12-
(
13-
id int references dbo.documents(id),
14-
vector_value_id int,
15-
vector_value float
16-
)
17-
go
18-
19-
create clustered columnstore index csi__document_embeddings
20-
on dbo.document_embeddings order (id)
21-
go
2210

23-
if not exists(select * from sys.fulltext_catalogs where [name] = 'FullTextCatalog')
11+
if not exists(select * from sys.fulltext_catalogs where [name] = 'main_ft_catalog')
2412
begin
25-
create fulltext catalog [FullTextCatalog] as default;
13+
create fulltext catalog [main_ft_catalog] as default;
2614
end
2715
go
2816

29-
create fulltext index on dbo.documents (content) key index pk__documents;
17+
create fulltext index on dbo.sample_documents (content) key index pk__documents;
3018
go
3119

32-
alter fulltext index on dbo.documents enable;
33-
go
34-
35-
create or alter function dbo.similar_documents(@vector nvarchar(max))
36-
returns table
37-
as
38-
return
39-
with cteVector as
40-
(
41-
select
42-
cast([key] as int) as [vector_value_id],
43-
cast([value] as float) as [vector_value]
44-
from
45-
openjson(@vector)
46-
),
47-
cteSimilar as
48-
(
49-
select top (50)
50-
v2.id,
51-
1-sum(v1.[vector_value] * v2.[vector_value]) /
52-
(
53-
sqrt(sum(v1.[vector_value] * v1.[vector_value]))
54-
*
55-
sqrt(sum(v2.[vector_value] * v2.[vector_value]))
56-
) as cosine_distance
57-
from
58-
cteVector v1
59-
inner join
60-
dbo.document_embeddings v2 on v1.vector_value_id = v2.vector_value_id
61-
group by
62-
v2.id
63-
order by
64-
cosine_distance
65-
)
66-
select
67-
rank() over (order by r.cosine_distance) as rank,
< 67E6 /td>
68-
r.id,
69-
r.cosine_distance
70-
from
71-
cteSimilar r
20+
alter fulltext index on dbo.sample_documents enable;
7221
go

python/hybrid_search.py

Lines changed: 45 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
sentences = [
1515
'The dog is barking',
1616
'The cat is purring',
17-
'The bear is growling'
17+
'The bear is growling',
18+
'A bear growling to a cat'
1819
]
1920
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
2021
embeddings = model.encode(sentences)
@@ -24,8 +25,7 @@
2425
print('Cleaning up the database...')
2526
try:
2627
cursor = conn.cursor()
27-
cursor.execute("DELETE FROM dbo.document_embeddings;")
28-
cursor.execute("DELETE FROM dbo.documents;")
28+
cursor.execute("DELETE FROM dbo.sample_documents;")
2929
cursor.commit();
3030
finally:
3131
cursor.close()
@@ -39,8 +39,7 @@
3939
DECLARE @id INT = ?;
4040
DECLARE @content NVARCHAR(MAX) = ?;
4141
DECLARE @embedding NVARCHAR(MAX) = ?;
42-
INSERT INTO dbo.documents (id, content, embedding) VALUES (@id, @content, @embedding);
43-
INSERT INTO dbo.document_embeddings SELECT @id, CAST([key] AS INT), CAST([value] AS FLOAT) FROM OPENJSON(@embedding);
42+
INSERT INTO dbo.sample_documents (id, content, embedding) VALUES (@id, @content, JSON_ARRAY_TO_VECTOR(@embedding));
4443
""",
4544
id,
4645
content,
@@ -53,37 +52,64 @@
5352

5453
print('Searching for similar documents...')
5554
print('Getting embeddings...')
56-
query = 'growling bear'
55+
query = 'a growling bear'
5756
embedding = model.encode(query)
5857

59-
print('Querying database...')
58+
print(f'Querying database for "{query}"...')
6059
k = 5
6160
try:
6261
cursor = conn.cursor()
6362

6463
results = cursor.execute(f"""
6564
DECLARE @k INT = ?;
65+
DECLARE @q NVARCHAR(4000) = ?;
66+
DECLARE @e VARBINARY(8000) = JSON_ARRAY_TO_VECTOR(CAST(? AS NVARCHAR(MAX)));
6667
WITH keyword_search AS (
6768
SELECT TOP(@k)
6869
id,
69-
ftt.[RANK] AS rank
70-
FROM
71-
dbo.documents
72-
INNER JOIN
73-
FREETEXTTABLE(dbo.documents, *, ?) AS ftt ON dbo.documents.id = ftt.[KEY]
70+
RANK() OVER (ORDER BY rank) AS rank,
71+
content
72+
FROM
73+
(
74+
SELECT TOP(@k)
75+
sd.id,
76+
ftt.[RANK] AS rank,
77+
sd.content
78+
FROM
79+
dbo.sample_documents AS sd
80+
INNER JOIN
81+
FREETEXTTABLE(dbo.sample_documents, *, @q) AS ftt ON sd.id = ftt.[KEY]
82+
) AS t
83+
ORDER BY
84+
rank
7485
),
7586
semantic_search AS
7687
(
77-
SELECT
78-
id,
79-
rank
80-
FROM
81-
dbo.similar_documents(?)
88+
SELECT TOP(@k)
89+
id,
90+
RANK() OVER (ORDER BY distance) AS rank,
91+
content
92+
FROM
93+
(
94+
SELECT TOP(@k)
95+
id,
96+
VECTOR_DISTANCE('cosine', embedding, @e) AS distance,
97+
content
98+
FROM
99+
dbo.sample_documents
100+
ORDER BY
101+
distance
102+
) AS t
103+
ORDER BY
104+
rank
82105
)
83106
SELECT TOP(@k)
84107
COALESCE(ss.id, ks.id) AS id,
85108
COALESCE(1.0 / (@k + ss.rank), 0.0) +
86-
COALESCE(1.0 / (@k + ks.rank), 0.0) AS score -- Reciprocal Rank Fusion (RRF)
109+
COALESCE(1.0 / (@k + ks.rank), 0.0) AS score, -- Reciprocal Rank Fusion (RRF)
110+
COALESCE(ss.content, ks.content) AS content,
111+
ss.rank AS semantic_rank,
112+
ks.rank AS keyword_rank
87113
FROM
88114
semantic_search ss
89115
FULL OUTER JOIN
@@ -97,7 +123,7 @@
97123
)
98124

99125
for row in results:
100-
print(f'Document: {row[0]} -> RRF score: {row[1]:0.4}')
126+
print(f'Document: "{row[2]}", Id: {row[0]} -> RRF score: {row[1]:0.4} (Semantic Rank: {row[3]}, Keyword Rank: {row[4]})')
101127

102128
finally:
103129
cursor.close()

vector-embeddings/02-create-vectors-table.sql

Lines changed: 0 additions & 62 deletions
This file was deleted.
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
/*
2+
Add columns to store the native vectors
3+
*/
4+
alter table wikipedia_articles_embeddings
5+
add title_vector_native varbinary(8000);
6+
7+
alter table wikipedia_articles_embeddings
8+
add content_vector_native varbinary(8000);
9+
10+
/*
11+
Update the native vectors
12+
*/
13+
update
14+
wikipedia_articles_embeddings
15+
set
16+
title_vector_native = json_array_to_vector(title_vector),
17+
content_vector_native = json_array_to_vector(content_vector);
18+
go
19+
20+
/*
21+
Remove old columns
22+
*/
23+
alter table wikipedia_articles_embeddings
24+
drop column title_vector;
25+
go
26+
27+
alter table wikipedia_articles_embeddings
28+
drop column content_vector;
29+
go
30+
31+
/*
32+
Rename the columns
33+
*/
34+
EXEC sp_rename 'dbo.wikipedia_articles_embeddings.title_vector_native', 'title_vector_ada2', 'COLUMN';
35+
EXEC sp_rename 'dbo.wikipedia_articles_embeddings.content_vector_native', 'content_vector_ada2', 'COLUMN';
36+
37+
/*
38+
Verify data
39+
*/
40+
select top (100) * from [dbo].[wikipedia_articles_embeddings]
41+
go
42+
43+
select * from [dbo].[wikipedia_articles_embeddings] where title = 'Alan Turing'
44+
go

vector-embeddings/04-create-get-embeddings-procedure.sql

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
create or alter procedure dbo.get_embedding
55
@deployedModelName nvarchar(1000),
66
@inputText nvarchar(max),
7-
@embedding nvarchar(max) output
7+
@embedding varbinary(8000) output
88
as
99
declare @retval int, @response nvarchar(max);
1010
declare @payload nvarchar(max) = json_object('input': @inputText);
@@ -16,12 +16,14 @@ exec @retval = sp_invoke_external_rest_endpoint
1616
@payload = @payload,
1717
@response = @response output;
1818

19-
declare @re nvarchar(max) = '[]';
19+
declare @re nvarchar(max) = null;
2020
if (@retval = 0) begin
2121
set @re = json_query(@response, '$.result.data[0].embedding')
22+
end else begin
23+
select @response as 'Error message from OpenAI API';
2224
end
2325

24-
set @embedding = @re;
26+
set @embedding = json_array_to_vector(@re);
2527

2628
return @retval
2729
go

0 commit comments

Comments
 (0)
0