8000 first commit · 7effrey89/azure-sql-db-openai@34e4abb · GitHub
[go: up one dir, main page]

Skip to content

Commit 34e4abb

Browse files
committed
first commit
1 parent 0418356 commit 34e4abb

File tree

5 files changed

+224
-53
lines changed

5 files changed

+224
-53
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,3 +396,6 @@ FodyWeavers.xsd
396396

397397
# JetBrains Rider
398398
*.sln.iml
399+
400+
# Customer
401+
readme.local.txt

README.md

Lines changed: 4 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,8 @@
1-
# Project Name
1+
# Vector Similarity Search with Azure SQL & Azure OpenAI
22

3-
(short, 1-3 sentenced, description of the project)
3+
This example shows how to use Azure OpenAI from Azure SQL database to get the vector embeddings of any choose text, and then calculate the cosine distance against the Wikipedia articles (for which vector embeddings have been already calculated) to find the articles that covers topics that are close - or similar - to the searched text.
44

5-
## Features
5+
Azure SQL database can be used to significatly speed up vectors operations using column store indexes, so that search can have sub-seconds performances even on large datasets.
66

7-
This project framework provides the following features:
7+
Download the [wikipedia embeedings from here](https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip), unzip it and upload it to an Azure Blob Storage container.
88

9-
* Feature 1
10-
* Feature 2
11-
* ...
12-
13-
## Getting Started
14-
15-
### Prerequisites
16-
17-
(ideally very short, if any)
18-
19-
- OS
20-
- Library version
21-
- ...
22-
23-
### Installation
24-
25-
(ideally very short)
26-
27-
- npm install [package name]
28-
- mvn install
29-
- ...
30-
31-
### Quickstart
32-
(Add steps to get up and running quickly)
33-
34-
1. git clone [repository clone url]
35-
2. cd [repository name]
36-
3. ...
37-
38-
39-
## Demo
40-
41-
A demo app is included to show how to use the project.
42-
43-
To run the demo, follow these steps:
44-
45-
(Add steps to start up the demo)
46-
47-
1.
48-
2.
49-
3.
50-
51-
## Resources
52-
53-
(Any additional resources or related projects)
54-
55-
- Link to supporting information
56-
- Link to similar sample
57-
- ...
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/*
2+
SETUP
3+
*/
4+
5+
/*
6+
Cleanup if needed
7+
*/
8+
if not exists(select * from sys.symmetric_keys where [name] = '##MS_DatabaseMasterKey##')
9+
begin
10+
create master key encryption by password = 'Pa$$w0rd!'
11+
end
12+
go
13+
if exists(select * from sys.[external_data_sources] where name = 'openai_playground')
14+
begin
15+
drop external data source [openai_playground];
16+
end
17+
go
18+
if exists(select * from sys.[database_scoped_credentials] where name = 'openai_playground')
19+
begin
20+
drop database scoped credential [openai_playground];
21+
end
22+
go
23+
24+
/*
25+
Create database scoped credential and external data source
26+
*/
27+
create database scoped credential [openai_playground]
28+
with identity = 'SHARED ACCESS SIGNATURE',
29+
secret = '<sas-token';
30+
go
31+
create external data source [openai_playground]
32+
with
33+
(
34+
type = blob_storage,
35+
location = 'https://dmstore3.blob.core.windows.net/playground',
36+
credential = [openai_playground]
37+
);
38+
go
39+
40+
/*
41+
Create table
42+
*/
43+
drop table if exists [dbo].[wikipedia_articles_embeddings];
44+
create table [dbo].[wikipedia_articles_embeddings]
45+
(
46+
[id] [int] not null,
47+
[url] [varchar](1000) not null,
48+
[title] [varchar](1000) not null,
49+
[text] [varchar](max) not null,
50+
[title_vector] [varchar](max) not null,
51+
[content_vector] [varchar](max) not null,
52+
[vector_id] [int] not null
53+
)
54+
go
55+
56+
/*
57+
Import data
58+
*/
59+
bulk insert dbo.[wikipedia_articles_embeddings]
60+
from 'wikipedia/vector_database_wikipedia_articles_embedded.csv'
61+
with (
62+
data_source = 'openai_playground',
63+
format = 'csv',
64+
firstrow = 2,
65+
codepage = '65001',
66+
fieldterminator = ',',
67+
rowterminator = '0x0a',
68+
fieldquote = '"',
69+
batchsize = 1000,
70+
tablock
71+
)
72+
go
73+
74+
/*
75+
Add primary key
76+
*/
77+
alter table [dbo].[wikipedia_articles_embeddings]
78+
add constraint pk__wikipedia_articles_embeddings primary key nonclustered (id)
79+
go
80+
81+
/*
82+
Verify data
83+
*/
84+
select top (100) * from [dbo].[wikipedia_articles_embeddings]
85+
go
86+
87+
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/*
2+
Extract the title vectors from the JSON and store them in a table
3+
*/
4+
drop table if exists dbo.wikipedia_articles_embeddings_titles_vector;
5+
with cte as
6+
(
7+
select
8+
v.id as article_id,
9+
cast(tv.[key] as int) as vector_value_id,
10+
cast(tv.[value] as float) as vector_value
11+
from
12+
[dbo].[wikipedia_articles_embeddings] as v
13+
cross apply
14+
openjson(title_vector) tv
15+
)
16+
select
17+
article_id,
18+
vector_value_id,
19+
vector_value
20+
into
21+
dbo.wikipedia_articles_embeddings_titles_vector
22+
from
23+
cte;
24+
go
25+
26+
/*
27+
Extract the content vectors from the JSON and store them in a table
28+
*/
29+
drop table if exists dbo.wikipedia_articles_embeddings_contents_vector;
30+
with cte as
31+
(
32+
select
33+
v.id as article_id,
34+
cast(tv.[key] as int) as vector_value_id,
35+
cast(tv.[value] as float) as vector_value
36+
from
37+
[dbo].[wikipedia_articles_embeddings] as v
38+
cross apply
39+
openjson(content_vector) tv
40+
)
41+
select
42+
article_id,
43+
vector_value_id,
44+
vector_value
45+
into
46+
dbo.wikipedia_articles_embeddings_contents_vector
47+
from
48+
cte;
49+
go
50+
51+
/*
52+
Create columnstores to support vector operations
53+
*/
54+
create clustered columnstore index ixc
55+
on dbo.wikipedia_articles_embeddings_titles_vector
56+
order (article_id);
57+
go
58+
59+
create clustered columnstore index ixc
60+
on dbo.wikipedia_articles_embeddings_contents_vector
61+
order (article_id);
62+
go
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
/*
2+
Get the embeddings for the input text by calling the OpenAI API
3+
*/
4+
declare @inputText nvarchar(max) = 'the foundation series by isaac asimov';
5+
declare @retval int, @response nvarchar(max);
6+
declare @payload nvarchar(max) = json_object('input': @inputText);
7+
exec @retval = sp_invoke_external_rest_endpoint
8+
@url = 'https://dm-open-ai.openai.azure.com/openai/deployments/embeddings/embeddings?api-version=2023-03-15-preview',
9+
@method = 'POST',
10+
@headers = '{"api-key":"7194a8c183be4cd08c514834f4e985ea"}',
11+
@payload = @payload,
12+
@response = @response output;
13+
drop table if exists #response;
14+
select @response as [response] into #response;
15+
select * from #response;
16+
go
17+
18+
/*
19+
Extract the title vectors from the JSON and store them in a table
20+
*/
21+
declare @response nvarchar(max) = (select response from #response);
22+
drop table if exists #t;
23+
select
24+
cast([key] as int) as [vector_value_id],
25+
cast([value] as float) as [vector_value]
26+
into
27+
#t
28+
from
29+
openjson(@response, '$.result.data[0].embedding')
30+
go
31+
select * from #t;
32+
go
33+
34+
/*
35+
Calculate cosine distance between the input text and all the articles
36+
*/
37+
drop table if exists #results;
38+
select top(50)
39+
v2.article_id,
40+
SUM(v1.[vector_value] * v2.[vector_value]) /
41+
(
42+
SQRT(SUM(v1.[vector_value] * v1.[vector_value]))
43+
*
44+
SQRT(SUM(v2.[vector_value] * v2.[vector_value]))
45+
) as cosine_distance
46+
into
47+
#results
48+
from
49+
#t v1
50+
inner join
51+
dbo.wikipedia_articles_embeddings_contents_vector v2 on v1.vector_value_id = v2.vector_value_id
52+
group by
53+
v2.article_id
54+
order by
55+
cosine_distance desc;
56+
57+
select
58+
a.id,
59+
a.title,
60+
a.url,
61+
r.cosine_distance
62+
from
63+
#results r
64+
inner join
65+
dbo.wikipedia_articles_embeddings a on r.article_id = a.id
66+
order by
67+
cosine_distance desc;
68+
go

0 commit comments

Comments
 (0)
0