8000 rework asset archiver to generic object inheritance format · homeylab/bookstack-file-exporter@1e4f870 · GitHub
[go: up one dir, main page]

Skip to content

Commit 1e4f870

Browse files
committed
rework asset archiver to generic object inheritance format
1 parent 4110e15 commit 1e4f870

File tree

9 files changed

+282
-163
lines changed

9 files changed

+282
-163
lines changed

.dockerignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,8 @@ cython_debug/
162162

163163
## Local
164164
local/
165+
.vscode/
166+
.github/
165167

166168
## test outputs
167169
bkps/

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,9 @@ assets:
8585
### Run via Pip
8686
The exporter can be installed via pip and run directly.
8787
88+
#### Python Version
89+
_Note: This application is developed and tested on Python version `3.12.X`. The minimum required version is `3.8`, but installing (or setting up a venv with) a `3.12.X` version is recommended._
90+
8891
#### Examples
8992
```bash
9093
python -m pip install bookstack-file-exporter
@@ -114,10 +117,7 @@ export LOG_LEVEL=debug
114117
python -m bookstack_file_exporter -c <path_to_config_file>
115118
```
116119

117-
#### Python Version
118-
_Note: This application is tested and developed on Python version `3.12.X`. The min required version is >= `3.8` but is recommended to install (or set up a venv) a `3.12.X` version._
119-
120-
### Run Via Docker
120+
### Run via Docker
121121
Docker images are currently provided for the `linux/amd64` and `linux/arm64` variants only. If another variant is required, please request it via a GitHub issue.
122122

123123
#### Examples
@@ -239,7 +239,7 @@ More descriptions can be found for each section below:
239239

240240
#### Valid Environment Variables
241241
General
242-
- `LOG_LEVEL`: default: `info``. Provide a valid log level: info, debug, warning, error.
242+
- `LOG_LEVEL`: default: `info`. Provide a valid log level: info, debug, warning, error.
243243

244244
[Bookstack Credentials](#authentication)
245245
- `BOOKSTACK_TOKEN_ID`

bookstack_file_exporter/archiver/archiver.py

Lines changed: 4 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from bookstack_file_exporter.exporter.node import Node
77
from bookstack_file_exporter.archiver import util
8-
from bookstack_file_exporter.archiver.page_archiver import PageArchiver, ImageNode
8+
from bookstack_file_exporter.archiver.page_archiver import PageArchiver
99
from bookstack_file_exporter.archiver.minio_archiver import MinioArchiver
1010
from bookstack_file_exporter.config_helper.remote import StorageProviderConfig
1111
from bookstack_file_exporter.config_helper.config_helper import ConfigNode
@@ -17,7 +17,7 @@
1717
# pylint: disable=too-many-instance-attributes
1818
class Archiver:
1919
"""
20-
Archiver pulls all the necessary files from upstream
20+
Archiver helps handle archive duties: pulls all the necessary files from upstream
2121
and then pushes them to the specified backup location(s)
2222
2323
Args:
@@ -32,41 +32,15 @@ def __init__(self, config: ConfigNode):
3232
# for convenience
3333
self.base_dir = config.base_dir_name
3434
self.archive_dir = self._generate_root_folder(self.base_dir)
35-
self._page_archiver = self._generate_page_archiver()
35+
self._page_archiver = PageArchiver(self.archive_dir, self.config)
3636
self._remote_exports = {'minio': self._archive_minio, 's3': self._archive_s3}
3737

38-
3938
def get_bookstack_exports(self, page_nodes: Dict[int, Node]):
4039
"""export all page content"""
4140
log.info("Exporting all bookstack page contents")
4241
# get images first if requested
4342
# this is because we may want to manipulate page data with modify_markdown flag
44-
all_image_meta = self._get_page_image_map()
45-
for _, page in page_nodes.items():
46-
page_image_meta = []
47-
if page.id_ in all_image_meta:
48-
page_image_meta = all_image_meta[page.id_]
49-
self._get_page_files(page, page_image_meta)
50-
self._get_page_images(page, page_image_meta)
51-
52-
def _get_page_files(self, page_node: Node, image_meta: List[ImageNode]):
53-
"""pull all bookstack pages into local files/tar"""
54-
log.debug("Exporting bookstack page data")
55-
self._page_archiver.archive_page(page_node, image_meta)
56-
57-
def _get_page_image_map(self) -> Dict[int, ImageNode]:
58-
if not self._page_archiver.export_images:
59-
log.debug("skipping image export based on user input")
60-
return {}
61-
return self._page_archiver.get_image_meta()
62-
63-
def _get_page_images(self, page_node: Node, img_nodes: List[ImageNode]):
64-
if not img_nodes:
65-
log.debug("page has no images to pull")
66-
return
67-
log.debug("Exporting bookstack page images")
68-
self._page_archiver.archive_page_images(page_node.parent.file_path,
69-
page_node.name, img_nodes)
43+
self._page_archiver.archive_pages(page_nodes)
7044

7145
def create_archive(self):
7246
"""create tgz archive"""
@@ -145,10 +119,6 @@ def _delete_files(self, file_list: List[str]):
145119
for file in file_list:
146120
util.remove_file(file)
147121

148-
def _generate_page_archiver(self)-> PageArchiver:
149-
return PageArchiver(self.archive_dir, self.config)
150-
151-
152122
@staticmethod
153123
def _generate_root_folder(base_folder_name: str) -> str:
154124
"""return base archive name"""
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
from typing import Union, List, Dict
2+
# pylint: disable=import-error
3+
from requests import Response
4+
from re import sub as re_sub
5+
import logging
6+
import base64
7+
8+
from bookstack_file_exporter.common import util as common_util
9+
10+
log = logging.getLogger(__name__)
11+
12+
_IMAGE_DIR_NAME = "images"
13+
_ATTACHMENT_DIR_NAME = "attachments"
14+
15+
16+
class AssetNode:
    """
    AssetNode holds the metadata needed to download one page asset
    and to rewrite the markdown link that points at it.

    Args:
        :meta_data: <Dict> = asset metadata; the keys 'id', 'uploaded_to'
            and 'url' are read.

    Returns:
        AssetNode instance exposing id, page_id, url and name
        (name is the last path segment of url).
    """
    def __init__(self, meta_data: Dict[str, Union[int, str, bool]]):
        self.id: int = meta_data['id']
        self.page_id: int = meta_data['uploaded_to']
        self.url: str = meta_data['url']
        # file name is whatever follows the final '/' of the url
        self.name: str = self.url.split('/')[-1]
        self._markdown_str = ""
        # sub classes set this to the directory their assets are stored in
        self._relative_path_prefix: str = ""

    def get_relative_path(self, page_name: str) -> str:
        """asset path local to page directory"""
        return f"{self._relative_path_prefix}/{page_name}/{self.name}"

    @property
    def markdown_str(self) -> str:
        """return markdown url str to replace"""
        return self._markdown_str

    def set_markdown_content(self, asset_data: Dict[str, Union[int, str, bool, dict]]) -> None:
        """extract and store the markdown url found in the asset detail data"""
        self._markdown_str = self._get_md_url_str(asset_data)

    @staticmethod
    def _get_md_url_str(asset_data: Dict[str, Union[int, str, dict]]) -> str:
        """
        return the url part of asset_data['content']['markdown'],
        or an empty string if no well formed link is present
        """
        url_str = ""
        if 'content' in asset_data:
            if 'markdown' in asset_data['content']:
                url_str = asset_data['content']['markdown']
        # check to see if empty before doing find
        if not url_str:
            return ""
        # markdown format is `![label](url)`: anchor on "](" so that
        # parentheses inside the label are not mistaken for the link
        marker = url_str.find("](")
        if marker != -1:
            start = marker + 2
        else:
            paren = url_str.find("(")
            if paren == -1:
                return ""
            start = paren + 1
        # only accept a closing parenthesis located after the opening one
        end = url_str.find(")", start)
        if end == -1:
            return ""
        return url_str[start:end]
50+
51+
class ImageNode(AssetNode):
52+
def __init__(self, meta_data: Dict[str, Union[int, str]]):
53+
super().__init__(meta_data)
54+
log.debug(self.url)
55+
self._relative_path_prefix = f"{_IMAGE_DIR_NAME}"
56+
57+
class AttachmentNode(AssetNode):
58+
def __init__(self, meta_data: Dict[str, Union[int, str, bool]],
59+
base_url: str):
60+
self.id: int = meta_data['id']
61+
self.page_id: int = meta_data['uploaded_to']
62+
self.url: str = f"{base_url}/{self.id}"
63+
log.debug(self.url)
64+
self.name = meta_data['name']
65+
self._markdown_str = ""
66+
self._relative_path_prefix = f"{_ATTACHMENT_DIR_NAME}"
67+
68+
@staticmethod
69+
def _get_md_url_str(asset_data: Dict[str, int | str | dict]) -> str:
70+
url_str = ""
71+
if 'links' in asset_data:
72+
if 'markdown' in asset_data['links']:
73+
url_str = asset_data['links']['markdown']
74+
# check to see if empty before doing find
75+
if not url_str:
76+
return ""
77+
# find the link between two parenthesis
78+
# - markdown format
79+
return url_str[url_str.find("(")+1:url_str.find(")")]
80+
81+
class AssetArchiver:
82+
def __init__(self, urls: Dict[str, str], headers: Dict[str, str],
83+
verify_ssl: bool):
84+
self.api_urls = urls
85+
self.verify_ssl = verify_ssl
86+
self._headers = headers
87+
self._asset_map = {
88+
'images': self._create_image_map,
89+
'attachments': self._create_attachment_map
90+
}
91+
92+
def get_asset_nodes(self, asset_type: str) -> Dict[str, ImageNode | AttachmentNode]:
93+
"""Get image or attachment helpers for a page"""
94+
asset_response: Response = common_util.http_get_request(
95+
self.api_urls[asset_type],
96+
self._headers,
97+
self.verify_ssl)
98+
asset_json = asset_response.json()['data']
99+
return self._asset_map[asset_type](asset_json)
100+
101+
def get_asset_data(self, asset_type: str,
102+
meta_data: Union[AttachmentNode, ImageNode]) -> Dict[str, str | bool | int | dict]:
103+
"""Get asset data based on type"""
104+
data_url = f"{self.api_urls[asset_type]}/{meta_data.id}"
105+
asset_data_response: Response = common_util.http_get_request(
106+
data_url,
107+
self._headers,
108+
self.verify_ssl)
109+
return asset_data_response.json()
110+
111+
def get_asset_bytes(self, asset_type: str, url: str) -> bytes:
112+
"""Get raw asset data"""
113+
asset_response: Response = common_util.http_get_request(
114+
url,
115+
self._headers,
116+
self.verify_ssl)
117+
match asset_type:
118+
case "images":
119+
asset_data = asset_response.content
120+
case "attachments":
121+
asset_data = self.decode_attachment_data(asset_response.json()['content'])
122+
return asset_data
123+
124+
def update_asset_links(self, asset_type, page_name: str, page_data: bytes,
125+
asset_nodes: List[ImageNode | AttachmentNode]) -> bytes:
126+
"""update markdown links in page data"""
127+
for asset_node in asset_nodes:
128+
asset_data = self.get_asset_data(asset_type, asset_node)
129+
asset_node.set_markdown_content(asset_data)
130+
if not asset_node.markdown_str:
131+
continue
132+
page_data = re_sub(asset_node.markdown_str.encode(),
133+
asset_node.get_relative_path(page_name).encode(), page_data)
134+
return page_data
135+
136+
@staticmethod
137+
def _create_image_map(json_data: Dict[str,
138+
List[Dict[str, str | int | bool | dict]]]) -> Dict[int, List[ImageNode]]:
139+
image_page_map = {}
140+
for img_meta in json_data:
141+ img_node = ImageNode(img_meta)
142+
if img_node.page_id in image_page_map:
143+
image_page_map[img_node.page_id].append(img_node)
144+
else:
145+
image_page_map[img_node.page_id] = [img_node]
146+
return image_page_map
147+
148+
def _create_attachment_map(self,
149+
json_data: Dict[str, List[Dict[str, str | int | bool | dict]]]) -> List[AssetNode]:
150+
asset_nodes = {}
151+
for asset_meta in json_data:
152+
asset_node = None
153+
if asset_meta['external']:
154+
continue # skip external link, only get attachments
155+
asset_node = AttachmentNode(asset_meta, self.api_urls['attachments'])
156+
if asset_node.page_id in asset_nodes:
157+
asset_nodes[asset_node.page_id].append(asset_node)
158+
else:
159+
asset_nodes[asset_node.page_id] = [asset_node]
160+
return asset_nodes
161+
162+
@staticmethod
163+
def decode_attachment_data(b64encoded_data: str) -> bytes:
164+
"""decode base64 encoded data"""
165+
asset_data = b64encoded_data.encode()
166+
return base64.b64decode(asset_data)

0 commit comments

Comments
 (0)
0