Fix large downloads (#105) (#111) · v0ku/server-client-python@9d0c8ca · GitHub
[go: up one dir, main page]

Skip to content

Commit 9d0c8ca

Browse files
t8y8 authored and Russell Hay committed
Fix large downloads (tableau#105) (tableau#111)
Large responses were being read into memory. For most calls that's fine, but download could cause the Python process to run out of memory because it held large workbooks or datasources entirely in memory before writing them to disk. Requests has a feature called `iter_content` which, when used in combination with `stream=True` on a request, will download only the headers, allow us to determine the filename, and then read through the response body in chunks. I picked a size of 1024 bytes, since that's what most of the internet appears to use and I noticed little perf difference between a 1024 byte chunk size and a 1MB chunk size. This is all enabled by exposing the `parameters` argument to `requests.get` by plumbing it through our wrapper functions. All tests pass, and manual testing showed the memory problem went away.
1 parent 65ce464 commit 9d0c8ca

File tree

3 files changed

+31
-24
lines changed

3 files changed

+31
-24
lines changed

tableauserverclient/server/endpoint/datasources_endpoint.py

Lines changed: 13 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
import logging
77
import copy
88
import cgi
9+
from contextlib import closing
910

1011
# The maximum size of a file that can be published in a single request is 64MB
1112
FILESIZE_LIMIT = 1024 * 1024 * 64 # 64MB
@@ -64,16 +65,18 @@ def download(self, datasource_id, filepath=None):
6465
error = "Datasource ID undefined."
6566
raise ValueError(error)
6667
url = "{0}/{1}/content".format(self.baseurl, datasource_id)
67-
server_response = self.get_request(url)
68-
_, params = cgi.parse_header(server_response.headers['Content-Disposition'])
69-
filename = os.path.basename(params['filename'])
70-
if filepath is None:
71-
filepath = filename
72-
elif os.path.isdir(filepath):
73-
filepath = os.path.join(filepath, filename)
74-
75-
with open(filepath, 'wb') as f:
76-
f.write(server_response.content)
68+
with closing(self.get_request(url, parameters={'stream': True})) as server_response:
69+
_, params = cgi.parse_header(server_response.headers['Content-Disposition'])
70+
filename = os.path.basename(params['filename'])
71+
if filepath is None:
72+
filepath = filename
73+
elif os.path.isdir(filepath):
74+
filepath = os.path.join(filepath, filename)
75+
76+
with open(filepath, 'wb') as f:
77+
for chunk in server_response.iter_content(1024): # 1KB
78+
f.write(chunk)
79+
7780
logger.info('Downloaded datasource to {0} (ID: {1})'.format(filepath, datasource_id))
7881
return os.path.abspath(filepath)
7982

tableauserverclient/server/endpoint/endpoint.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -21,10 +21,11 @@ def _make_common_headers(auth_token, content_type):
2121

2222
return headers
2323

24-
def _make_request(self, method, url, content=None, request_object=None, auth_token=None, content_type=None):
24+
def _make_request(self, method, url, content=None, request_object=None,
25+
auth_token=None, content_type=None, parameters=None):
2526
if request_object is not None:
2627
url = request_object.apply_query_params(url)
27-
parameters = {}
28+
parameters = parameters or {}
2829
parameters.update(self.parent_srv.http_options)
2930
parameters['headers'] = Endpoint._make_common_headers(auth_token, content_type)
3031

@@ -49,9 +50,9 @@ def _check_status(server_response):
4950
def get_unauthenticated_request(self, url, request_object=None):
5051
return self._make_request(self.parent_srv.session.get, url, request_object=request_object)
5152

52-
def get_request(self, url, request_object=None):
53+
def get_request(self, url, request_object=None, parameters=None):
5354
return self._make_request(self.parent_srv.session.get, url, auth_token=self.parent_srv.auth_token,
54-
request_object=request_object)
55+
request_object=request_object, parameters=parameters)
5556

5657
def delete_request(self, url):
5758
# We don't return anything for a delete

tableauserverclient/server/endpoint/workbooks_endpoint.py

Lines changed: 13 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
import logging
88
import copy
99
import cgi
10+
from contextlib import closing
1011

1112
# The maximum size of a file that can be published in a single request is 64MB
1213
FILESIZE_LIMIT = 1024 * 1024 * 64 # 64MB
@@ -92,16 +93,18 @@ def download(self, workbook_id, filepath=None):
9293
error = "Workbook ID undefined."
9394
raise ValueError(error)
9495
url = "{0}/{1}/content".format(self.baseurl, workbook_id)
95-
server_response = self.get_request(url)
96-
_, params = cgi.parse_header(server_response.headers['Content-Disposition'])
97-
filename = os.path.basename(params['filename'])
98-
if filepath is None:
99-
filepath = filename
100-
elif os.path.isdir(filepath):
101-
filepath = os.path.join(filepath, filename)
102-
103-
with open(filepath, 'wb') as f:
104-
f.write(server_response.content)
96+
97+
with closing(self.get_request(url, parameters={"stream": True})) as server_response:
98+
_, params = cgi.parse_header(server_response.headers['Content-Disposition'])
99+
filename = os.path.basename(params['filename'])
100+
if filepath is None:
101+
filepath = filename
102+
elif os.path.isdir(filepath):
103+
filepath = os.path.join(filepath, filename)
104+
105+
with open(filepath, 'wb') as f:
106+
for chunk in server_response.iter_content(1024): # 1KB
107+
f.write(chunk)
105108
logger.info('Downloaded workbook to {0} (ID: {1})'.format(filepath, workbook_id))
106109
return os.path.abspath(filepath)
107110

0 commit comments

Comments
 (0)
0