10000 added Reciprocal Rank Fusion (RRF) sample · yang-jiayi/azure-sql-db-openai@ea3cb2c · GitHub
[go: up one dir, main page]

Skip to content

Commit ea3cb2c

Browse files
committed
added Reciprocal Rank Fusion (RRF) sample
1 parent c6c4a30 commit ea3cb2c

File tree

5 files changed

+149
-2
lines changed

5 files changed

+149
-2
lines changed

.gitignore

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,7 @@ FodyWeavers.xsd
397397
# JetBrains Rider
398398
*.sln.iml
399399

400-
# Customer
400+
# Custom
401401
*.local.txt
402-
*.local.sql
402+
*.local.sql
403+
.venv/

python/00-setup-database.sql

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
DROP TABLE IF EXISTS dbo.document_embeddings
2+
DROP TABLE IF EXISTS dbo.documents
3+
go
4+
5+
CREATE TABLE dbo.documents (id INT CONSTRAINT pk__documents PRIMARY KEY IDENTITY, content NVARCHAR(MAX), embedding NVARCHAR(MAX))
6+
CREATE TABLE dbo.document_embeddings (id INT REFERENCES dbo.documents(id), vector_value_id INT, vector_value FLOAT)
7+
go
8+
9+
CREATE CLUSTERED COLUMNSTORE INDEX csi__document_embeddings ON dbo.document_embeddings ORDER (id)
10+
go
11+
12+
IF NOT EXISTS(SELECT * FROM sys.fulltext_catalogs WHERE [name] = 'FullTextCatalog')
13+
BEGIN
14+
CREATE FULLTEXT CATALOG [FullTextCatalog] AS DEFAULT;
15+
END
16+
go
17+
18+
CREATE FULLTEXT INDEX ON dbo.documents (content) KEY INDEX pk__documents;
19+
go
20+
21+
ALTER FULLTEXT INDEX ON dbo.documents ENABLE;
22+
go

python/hybrid_search.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
import os
2+
import pyodbc
3+
import logging
4+
import json
5+
from sentence_transformers import SentenceTransformer
6+
from dotenv import load_dotenv
7+
from utilities import get_mssql_connection
8+
9+
load_dotenv()
10+
11+
if __name__ == '__main__':
12+
print('Initializing sample...')
13+
print('Getting embeddings...')
14+
sentences = [
15+
'The dog is barking',
16+
'The cat is purring',
17+
'The bear is growling'
18+
]
19+
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
20+
embeddings = model.encode(sentences)
21+
22+
print('Cleaning up the database...')
23+
try:
24+
conn = get_mssql_connection()
25+
conn.execute("DELETE FROM dbo.document_embeddings;")
26+
conn.execute("DELETE FROM dbo.documents;")
27+
conn.commit();
28+
finally:
29+
conn.close()
30+
31+
print('Saving documents and embeddings in the database...')
32+
try:
33+
conn = get_mssql_connection()
34+
cursor = conn.cursor()
35+
36+
for content, embedding in zip(sentences, embeddings):
37+
cursor.execute(f"""
38+
INSERT INTO dbo.documents (content, embedding) VALUES (?, ?);
39+
INSERT INTO dbo.document_embeddings SELECT SCOPE_IDENTITY(), CAST([key] AS INT), CAST([value] AS FLOAT) FROM OPENJSON(?);
40+
""",
41+
content,
42+
json.dumps(embedding.tolist()),
43+
json.dumps(embedding.tolist())
44+
)
45+
46+
cursor.close()
47+
conn.commit()
48+
finally:
49+
conn.close()
50+
51+
print('Searching for similar documents...')
52+
print('Getting embeddings...')
53+
query = 'growling bear'
54+
embedding = model.encode(query)
55+
56+
print('Querying database...')
57+
k = 5
58+
try:
59+
conn = get_mssql_connection()
60+
cursor = conn.cursor()
61+
62+
results = cursor.execute(f"""
63+
DECLARE @k INT = ?;
64+
WITH keyword_search AS (
65+
SELECT TOP(@k)
66+
id,
67+
ftt.[RANK] AS rank
68+
FROM
69+
dbo.documents
70+
INNER JOIN
71+
FREETEXTTABLE(dbo.documents, *, ?) AS ftt ON dbo.documents.id = ftt.[KEY]
72+
),
73+
semantic_search AS
74+
(
75+
SELECT
76+
id,
77+
rank
78+
FROM
79+
dbo.similar_documents(?)
80+
)
81+
SELECT TOP(@k)
82+
COALESCE(ss.id, ks.id) AS id,
83+
COALESCE(1.0 / (@k + ss.rank), 0.0) +
84+
COALESCE(1.0 / (@k + ks.rank), 0.0) AS score -- Reciprocal Rank Fusion (RRF)
85+
FROM
86+
semantic_search ss
87+
FULL OUTER JOIN
88+
keyword_search ks ON ss.id = ks.id
89+
ORDER BY
90+
score DESC
91+
""",
92+
k,
93+
query,
94+
json.dumps(embedding.tolist()),
95+
)
96+
97+
for row in results:
98+
print('document:', row[0], 'RRF score:', row[1])
99+
100+
cursor.close()
101+
conn.commit()
102+
finally:
103+
conn.close()

python/requirements.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
python-dotenv
2+
pyodbc
3+
azure-identity
4+
sentence-transformers

python/utilities.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import os
2+
import pyodbc
3+
import struct
4+
import logging
5+
from azure import identity
6+
7+
def get_mssql_connection():
8+
logging.info('Getting MSSQL connection')
9+
logging.info(' - Getting EntraID credentials...')
10+
mssql_connection_string = os.environ["MSSQL"]
11+
credential = identity.DefaultAzureCredential(exclude_interactive_browser_credential=False)
12+
token_bytes = credential.get_token("https://database.windows.net/.default").token.encode("UTF-16-LE")
13+
token_struct = struct.pack(f'<I{len(token_bytes)}s', len(token_bytes), token_bytes)
14+
SQL_COPT_SS_ACCESS_TOKEN = 1256 # This connection option is defined by microsoft in msodbcsql.h
15+
logging.info(' - Connecting to MSSQL...')
16+
conn = pyodbc.connect(mssql_connection_string, attrs_before={SQL_COPT_SS_ACCESS_TOKEN: token_struct})
17+
return conn

0 commit comments

Comments
 (0)
0