Merge branch 'main' of https://github.com/sqlparser/python_data_lineage · sqlparser/python_data_lineage@bf774cc · GitHub

Commit bf774cc

2 parents 8d60129 + 0dbf453 commit bf774cc

File tree: 1 file changed (+109 -57 lines)

dlineage.py

Lines changed: 109 additions & 57 deletions
@@ -1,7 +1,34 @@
 # python3
+import os
 import webbrowser
 import jpype
 import sys
+
+def get_file_character_count(file_path):
+    character_count = 0
+    with open(file_path, "r") as file:
+        try:
+            content = file.read()
+            character_count = len(content)
+        except:
+            print(file_path + " is not a text file.")
+    return character_count
+
+def get_all_files(folder_path):
+    all_files = []
+    for root, dirs, files in os.walk(folder_path):
+        for file in files:
+            file_path = os.path.join(root, file)
+            all_files.append(file_path)
+    return all_files
+def get_text_files_character_count(folder_path):
+    text_files = get_all_files(folder_path)
+    total_character_count = 0
+    for file_path in text_files:
+        character_count = get_file_character_count(file_path)
+        total_character_count += character_count
+    return total_character_count
+
 def indexOf(args, arg):
     try:
         return args.index(arg)
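Note: the three new helpers compose a simple directory character count: get_all_files walks the tree with os.walk, and get_text_files_character_count sums get_file_character_count over every file, with the bare except silently skipping anything that fails to read or decode. A minimal usage sketch (the ./sql_scripts path is hypothetical):

    # Total characters across all readable files under a folder;
    # open(file_path, "r") decodes text, so this counts characters, not bytes.
    total = get_text_files_character_count("./sql_scripts")
    print(str(total) + " characters found")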
@@ -12,6 +39,7 @@ def save_to_file(file_name, contents):
     fh = open(file_name, 'w')
     fh.write(contents)
     fh.close()
+
 def call_dataFlowAnalyzer(args):
     # Start the Java Virtual Machine (JVM)
     widget_server_url = "http://localhost:8000"
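Note: the JVM startup the comment refers to sits in the unchanged part of call_dataFlowAnalyzer, so it does not appear in this diff. A minimal JPype startup sketch under assumed names (the jar path is hypothetical; the DataFlowAnalyzer class name is inferred from the JClass strings elsewhere in this file):

    import jpype
    # Start the JVM once per process, pointing the classpath at the parser jar.
    if not jpype.isJVMStarted():
        jpype.startJVM(classpath=["gudusoft.gsqlparser.jar"])  # hypothetical jar name
    DataFlowAnalyzer = jpype.JClass("gudusoft.gsqlparser.dlineage.DataFlowAnalyzer")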
@@ -27,30 +55,43 @@ def call_dataFlowAnalyzer(args):
     XML2Model = jpype.JClass("gudusoft.gsqlparser.dlineage.util.XML2Model")
     RemoveDataflowFunction = jpype.JClass("gudusoft.gsqlparser.dlineage.util.RemoveDataflowFunction")
     File = jpype.JClass("java.io.File")
-    sqlFiles = None
     EDbVendor = jpype.JClass("gudusoft.gsqlparser.EDbVendor")
     vendor = EDbVendor.dbvoracle
     index = indexOf(args, "/t")
     if index != -1 and len(args) > index + 1:
-        vendor = TGSqlParser.getDBVendorByName(args[index + 1])
+        vendor = TGSqlParser.getDBVendorByName(args[index + 1])
     if indexOf(args, "/version") != -1:
-        print("Version: " + DataFlowAnalyzer.getVersion())
-        print("Release Date: " + DataFlowAnalyzer.getReleaseDate())
-        return
+        print("Version: " + DataFlowAnalyzer.getVersion())
+        print("Release Date: " + DataFlowAnalyzer.getReleaseDate())
+        return
 
     if indexOf(args, "/f") != -1 and len(args) > indexOf(args, "/f") + 1:
-        sqlFiles = File(args[indexOf(args, "/f") + 1])
-        if not sqlFiles.exists() or not sqlFiles.isFile():
-            print(args[indexOf(args, "/f") + 1] + " is not a valid file.")
-            return
+        sqlFiles = File(args[indexOf(args, "/f") + 1])
+        if not sqlFiles.exists() or not sqlFiles.isFile():
+            print(args[indexOf(args, "/f") + 1] + " is not a valid file.")
+            return
+
+        character_count = get_file_character_count(args[indexOf(args, "/f") + 1])
+        if character_count > 10000:
+            print("SQLFlow lite version only supports processing SQL statements with a maximum of 10,"
+                  "000 characters. If you need to process SQL statements without length restrictions, "
+                  "please contact support@gudusoft.com for more information.")
+            return
+
     elif indexOf(args, "/d") != -1 and len(args) > indexOf(args, "/d") + 1:
-        sqlFiles = File(args[indexOf(args, "/d") + 1])
-        if not sqlFiles.exists() or not sqlFiles.isDirectory():
-            print(args[indexOf(args, "/d") + 1] + " is not a valid directory.")
-            return
+        sqlFiles = File(args[indexOf(args, "/d") + 1])
+        if not sqlFiles.exists() or not sqlFiles.isDirectory():
+            print(args[indexOf(args, "/d") + 1] + " is not a valid directory.")
+            return
+        character_count = get_text_files_character_count(args[indexOf(args, "/d") + 1])
+        if character_count > 10000:
+            print("SQLFlow lite version only supports processing SQL statements with a maximum of 10,"
+                  "000 characters. If you need to process SQL statements without length restrictions, "
+                  "please contact support@gudusoft.com for more information.")
+            return
     else:
-        print("Please specify a sql file path or directory path to analyze dlineage.")
-        return
+        print("Please specify a sql file path or directory path to analyze dlineage.")
+        return
     simple = indexOf(args, "/s") != -1
     ignoreTemporaryTable = indexOf(args, "/withTemporaryTable") == -1
     ignoreResultSets = indexOf(args, "/i") != -1
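Note: the lite-version gate is applied per run: the /f branch counts a single file, while the /d branch sums characters over every file under the directory, so many small scripts can exceed the 10,000-character limit together. A standalone sketch of the same check (within_lite_limit and LITE_MAX_CHARS are assumed names, not in the diff):

    LITE_MAX_CHARS = 10000

    def within_lite_limit(path, is_directory):
        # Mirrors the gate above: one file for /f, a recursive sum for /d.
        count = get_text_files_character_count(path) if is_directory \
            else get_file_character_count(path)
        return count <= LITE_MAX_CHARS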
@@ -77,7 +118,8 @@ def call_dataFlowAnalyzer(args):
     topselectlist = True
     tableLineage = indexOf(args, "/tableLineage") != -1
     csv = indexOf(args, "/csv") != -1
-    delimiter = args.get(indexOf(args, "/delimiter") + 1) if indexOf(args, "/delimiter") != -1 and len(args) > indexOf(args, "/delimiter") + 1 else ","
+    delimiter = args.get(indexOf(args, "/delimiter") + 1) if indexOf(args, "/delimiter") != -1 and len(
+        args) > indexOf(args, "/delimiter") + 1 else ","
     if tableLineage:
         simple = False
         ignoreResultSets = False
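Note: args here is sys.argv, a plain Python list, and list has no .get() method, so actually passing /delimiter would raise AttributeError; the reflow in this hunk only re-wraps the line and keeps that pre-existing bug. An indexing-based equivalent (a sketch, not the repo's code):

    # Safe replacement for the args.get(...) call on a list:
    idx = indexOf(args, "/delimiter")
    delimiter = args[idx + 1] if idx != -1 and len(args) > idx + 1 else ","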
@@ -91,7 +133,7 @@ def call_dataFlowAnalyzer(args):
         SQLUtil = jpype.JClass("gudusoft.gsqlparser.util.SQLUtil")
         envs = jsonSQLEnvParser.parseSQLEnv(vendor, SQLUtil.getFileContent(metadataFile))
         if envs != None and envs.length > 0:
-            sqlenv = envs[0]
+            sqlenv = envs[0]
     dlineage = DataFlowAnalyzer(sqlFiles, vendor, simple)
     if sqlenv != None:
         dlineage.setSqlEnv(sqlenv)
@@ -100,7 +142,7 @@ def call_dataFlowAnalyzer(args):
     dlineage.setShowJoin(showJoin)
     dlineage.setIgnoreRecordSet(ignoreResultSets)
     if ignoreResultSets and not ignoreFunction:
-        dlineage.setSimpleShowFunction(True)
+        dlineage.setSimpleShowFunction(True)
     dlineage.setLinkOrphanColumnToFirstTable(linkOrphanColumnToFirstTable)
     dlineage.setIgnoreCoordinate(ignoreCoordinate)
     dlineage.setSimpleShowTopSelectResultSet(topselectlist)
@@ -117,16 +159,14 @@ def call_dataFlowAnalyzer(args):
         dlineage.getOption().setDefaultSchema(args[indexOf(args, "/defaultSchema") + 1])
     if indexOf(args, "/showResultSetTypes") != -1:
         resultSetTypes = args[indexOf(args, "/showResultSetTypes") + 1]
-        if resultSetTypes != None:
+        if resultSetTypes is not None:
             dlineage.getOption().showResultSetTypes(resultSetTypes.split(","))
 
     if indexOf(args, "/filterRelationTypes") != -1:
         dlineage.getOption().filterRelationTypes(args[indexOf(args, "/filterRelationTypes") + 1])
     if simple and not jsonFormat:
         dlineage.setTextFormat(textFormat)
 
-    result = None
-    dataflow = None
     if indexOf(args, "/er") != -1:
         dlineage.getOption().setShowERDiagram(True)
         dlineage.generateDataFlow()
@@ -135,7 +175,7 @@ def call_dataFlowAnalyzer(args):
         generator = DataFlowGraphGenerator()
         result = generator.genERGraph(vendor, dataflow)
         save_to_file("widget/json/erGraph.json", str(result))
-        webbrowser.open_new(widget_server_url+"/er.html")
+        webbrowser.open_new(widget_server_url + "/er.html")
         return
     elif tableLineage:
         dlineage.generateDataFlow()
@@ -192,40 +232,52 @@ def call_dataFlowAnalyzer(args):
 if __name__ == "__main__":
     args = sys.argv
     if len(args) < 2:
-        print("Usage: java DataFlowAnalyzer [/f <path_to_sql_file>] [/d <path_to_directory_includes_sql_files>] [/stat] [/s [/topselectlist] [/text] [/withTemporaryTable]] [/i] [/showResultSetTypes <resultset_types>] [/ic] [/lof] [/j] [/json] [/traceView] [/t <database type>] [/o <output file path>] [/version] [/env <path_to_metadata.json>] [/tableLineage [/csv [/delimeter <delimeter>]]] [/transform [/coor]] [/showConstant] [/treatArgumentsInCountFunctionAsDirectDataflow] [/filterRelationTypes <relationTypes>]")
-        print("/f: Optional, the full path to SQL file.")
-        print("/d: Optional, the full path to the directory includes the SQL files.")
-        print("/j: Optional, return the result including the join relation.")
-        print("/s: Optional, simple output, ignore the intermediate results.")
-        print("/topselectlist: Optional, simple output with top select results.")
-        print("/withTemporaryTable: Optional, simple output with the temporary tables.")
-        print("/i: Optional, the same as /s option, but will keep the resultset generated by the SQL function, this parameter will have the same effect as /s /topselectlist + keep resultset generated by the sql function.")
-        print("/showResultSetTypes: Optional, simple output with specify resultset types, separate with commas, resultset types contains array, struct, result_of, cte, insert_select, update_select, merge_update, merge_insert, output, update_set,\r\n"
-              + " pivot_table, unpivot_table, alias, rs, function, case_when")
-        print("/if: Optional, keep all the intermediate resultset, but remove the resultset generated by the SQL function")
-        print("/ic: Optional, ignore the coordinates in the output.")
-        print("/lof: Option, link orphan column to the first table.")
-        print("/traceView: Optional, only output the name of source tables and views, ignore all intermedidate data.")
-        print("/text: Optional, this option is valid only /s is used, output the column dependency in text mode.")
-        print("/json: Optional, print the json format output.")
-        print("/tableLineage [/csv /delimiter]: Optional, output tabel level lineage.")
-        print("/csv: Optional, output column level lineage in csv format.")
-        print("/delimiter: Optional, the delimiter of output column level lineage in csv format.")
-        print("/t: Option, set the database type. "
-              + "Support access,bigquery,couchbase,dax,db2,greenplum,hana,hive,impala,informix,mdx,mssql,\n"
-              + "sqlserver,mysql,netezza,odbc,openedge,oracle,postgresql,postgres,redshift,snowflake,\n"
-              + "sybase,teradata,soql,vertica\n, " + "the default value is oracle")
-        print("/env: Optional, specify a metadata.json to get the database metadata information.")
-        print("/transform: Optional, output the relation transform code.")
-        print("/coor: Optional, output the relation transform coordinate, but not the code.")
-        print("/defaultDatabase: Optional, specify the default schema.")
-        print("/defaultSchema: Optional, specify the default schema.")
-        print("/showImplicitSchema: Optional, show implicit schema.")
-        print("/showConstant: Optional, show constant table.")
-        print("/treatArgumentsInCountFunctionAsDirectDataflow: Optional, treat arguments in count function as direct dataflow.")
-        print("/filterRelationTypes: Optional, support fdd, fdr, join, call, er, multiple relatoin types separated by commas")
-        print("/graph: Optional, Open a browser page to graphically display the results")
-        print("/er: Optional, Open a browser page and display the ER diagram graphically")
-        sys.exit(0)
+        print("Usage: java DataFlowAnalyzer [/f <path_to_sql_file>] [/d <path_to_directory_includes_sql_files>] ["
+              "/stat] [/s [/topselectlist] [/text] [/withTemporaryTable]] [/i] [/showResultSetTypes "
+              "<resultset_types>] [/ic] [/lof] [/j] [/json] [/traceView] [/t <database type>] [/o <output file path>] "
+              "[/version] [/env <path_to_metadata.json>] [/tableLineage [/csv [/delimeter <delimeter>]]] [/transform "
+              "[/coor]] [/showConstant] [/treatArgumentsInCountFunctionAsDirectDataflow] [/filterRelationTypes "
+              "<relationTypes>]")
+        print("/f: Optional, the full path to SQL file.")
+        print("/d: Optional, the full path to the directory includes the SQL files.")
+        print("/j: Optional, return the result including the join relation.")
+        print("/s: Optional, simple output, ignore the intermediate results.")
+        print("/topselectlist: Optional, simple output with top select results.")
+        print("/withTemporaryTable: Optional, simple output with the temporary tables.")
+        print("/i: Optional, the same as /s option, but will keep the resultset generated by the SQL function, "
+              "this parameter will have the same effect as /s /topselectlist + keep resultset generated by the sql "
+              "function.")
+        print("/showResultSetTypes: Optional, simple output with specify resultset types, separate with commas, "
+              "resultset types contains array, struct, result_of, cte, insert_select, update_select, merge_update, "
+              "merge_insert, output, update_set,\r\n"
+              + " pivot_table, unpivot_table, alias, rs, function, case_when")
+        print("/if: Optional, keep all the intermediate resultset, but remove the resultset generated by the SQL "
+              "function")
+        print("/ic: Optional, ignore the coordinates in the output.")
+        print("/lof: Option, link orphan column to the first table.")
+        print("/traceView: Optional, only output the name of source tables and views, ignore all intermedidate data.")
+        print("/text: Optional, this option is valid only /s is used, output the column dependency in text mode.")
+        print("/json: Optional, print the json format output.")
+        print("/tableLineage [/csv /delimiter]: Optional, output tabel level lineage.")
+        print("/csv: Optional, output column level lineage in csv format.")
+        print("/delimiter: Optional, the delimiter of output column level lineage in csv format.")
+        print("/t: Option, set the database type. "
+              + "Support access,bigquery,couchbase,dax,db2,greenplum,hana,hive,impala,informix,mdx,mssql,\n"
+              + "sqlserver,mysql,netezza,odbc,openedge,oracle,postgresql,postgres,redshift,snowflake,\n"
+              + "sybase,teradata,soql,vertica\n, " + "the default value is oracle")
+        print("/env: Optional, specify a metadata.json to get the database metadata information.")
+        print("/transform: Optional, output the relation transform code.")
+        print("/coor: Optional, output the relation transform coordinate, but not the code.")
+        print("/defaultDatabase: Optional, specify the default schema.")
+        print("/defaultSchema: Optional, specify the default schema.")
+        print("/showImplicitSchema: Optional, show implicit schema.")
+        print("/showConstant: Optional, show constant table.")
+        print("/treatArgumentsInCountFunctionAsDirectDataflow: Optional, treat arguments in count function as direct "
+              "dataflow.")
+        print("/filterRelationTypes: Optional, support fdd, fdr, join, call, er, multiple relatoin types separated by "
+              "commas")
+        print("/graph: Optional, Open a browser page to graphically display the results")
+        print("/er: Optional, Open a browser page and display the ER diagram graphically")
+        sys.exit(0)
 
     call_dataFlowAnalyzer(args)
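Note: the usage banner still begins with "java DataFlowAnalyzer", inherited from the Java original, although this script is run through Python. An example invocation based on the flags documented above (demo.sql is a hypothetical path):

    python dlineage.py /f demo.sql /t mysql /s /json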
