Coverage for mindsdb / api / http / namespaces / file.py: 63%

160 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-21 00:36 +0000

1import os 

2import shutil 

3import tarfile 

4import tempfile 

5import zipfile 

6from pathlib import Path 

7from urllib.parse import urlparse 

8 

9import multipart 

10import requests 

11from flask import current_app as ca 

12from flask import request 

13from flask_restx import Resource 

14 

15from mindsdb.api.http.namespaces.configs.files import ns_conf 

16from mindsdb.api.http.utils import http_error 

17from mindsdb.metrics.metrics import api_endpoint_metrics 

18from mindsdb.utilities.config import config 

19from mindsdb.utilities.context import context as ctx 

20from mindsdb.utilities import log 

21from mindsdb.utilities.security import is_private_url, clear_filename, validate_urls 

22from mindsdb.utilities.fs import safe_extract 

23from mindsdb.integrations.utilities.files.file_reader import FileProcessingError 

24 

# Module-level logger, named after this module.
logger = log.getLogger(__name__)

# Upper bound compared against the remote ``Content-Length`` when a file is
# uploaded from a URL: 100 MiB, expressed in bytes.
MAX_FILE_SIZE = 100 * 1024 * 1024

27 

28 

@ns_conf.route("/")
class FilesList(Resource):
    # Collection endpoint: exposes the listing of files known to the
    # application's file controller.

    @ns_conf.doc("get_files_list")
    @api_endpoint_metrics("GET", "/files")
    def get(self):
        """List all files

        Returns whatever the current application's ``file_controller``
        reports from ``get_files()``, unmodified.
        """
        files = ca.file_controller.get_files()
        return files

36 

37 

@ns_conf.route("/<name>")
@ns_conf.param("name", "MindsDB's name for file")
class File(Resource):
    # Item endpoint: PUT stores a new file under <name>, DELETE removes it.

    @ns_conf.doc("put_file")
    @api_endpoint_metrics("PUT", "/files/file")
    def put(self, name: str):
        """add new file as table

        The table name is <name> path parameter

        Data is provided as json or form data. File can be provided with FormData or via URL.

        If file is in FormData, then the form contain:
            - source_type (str) - 'file'
            - file (binary) - the file itself
            - original_file_name (str, optional) - the name with which the file will be saved

        If file should be uploaded from URL:
            - source_type (str) - 'url'
            - source (str) - the URL
            - original_file_name (str, optional) - the name with which the file will be saved
        """
        # The scratch directory is owned here so that it is removed on every
        # exit path: success, any early 4xx response, or an exception.
        temp_dir_path = tempfile.mkdtemp(prefix="mindsdb_file_")
        try:
            return self._store_upload(name, temp_dir_path)
        finally:
            shutil.rmtree(temp_dir_path, ignore_errors=True)

    def _store_upload(self, name: str, temp_dir_path: str):
        """Read the request, materialise the file in ``temp_dir_path`` and save it.

        Args:
            name: table name taken from the URL path.
            temp_dir_path: existing scratch directory; the caller removes it.

        Returns:
            ``("", 200)`` on success, otherwise the response built by ``http_error``.
        """
        data = {}
        mindsdb_file_name = name.lower()

        def on_field(field):
            # Plain (non-file) form fields are collected as decoded strings.
            field_name = field.field_name.decode()
            data[field_name] = field.value.decode()

        file_object = None

        def on_file(file):
            nonlocal file_object
            file_name = file.file_name.decode()
            data["file"] = file_name
            # Reject names that carry any directory component.
            if Path(file_name).name != file_name:
                raise ValueError(f"Wrong file name: {file_name}")
            file_object = file.file_object

        if request.headers["Content-Type"].startswith("multipart/form-data"):
            parser = multipart.create_form_parser(
                headers=request.headers,
                on_field=on_field,
                on_file=on_file,
                config={
                    "UPLOAD_DIR": temp_dir_path.encode(),  # bytes required
                    "UPLOAD_KEEP_FILENAME": False,
                    "UPLOAD_KEEP_EXTENSIONS": True,
                    "UPLOAD_DELETE_TMP": False,
                    "MAX_MEMORY_FILE_SIZE": 0,
                },
            )

            # Feed the request body to the parser in fixed-size chunks.
            while True:
                chunk = request.stream.read(8192)
                if not chunk:
                    break
                parser.write(chunk)
            parser.finalize()
            parser.close()

            if file_object is not None:
                if not file_object.closed:
                    try:
                        file_object.flush()
                    except (AttributeError, ValueError, OSError):
                        logger.debug("Failed to flush file_object before closing.", exc_info=True)
                    file_object.close()
                # The parser wrote the upload under a generated name; give it
                # the client-supplied name inside the same directory.
                uploaded_path = Path(file_object.name)
                uploaded_path.rename(uploaded_path.parent / data["file"])
                file_object = None
        else:
            data = request.json

        existing_file_names = ca.file_controller.get_files_names(lower=True)
        if mindsdb_file_name in existing_file_names:
            return http_error(
                400,
                "File already exists",
                f"File with name '{mindsdb_file_name}' already exists",
            )

        source_type = data.get("source_type", "file")
        if source_type not in ("file", "url"):
            return http_error(
                400,
                "Wrong file source type",
                f'Only "file" and "url" supported as file source, got "{source_type}"',
            )

        if source_type == "url":
            failure = self._download_url_source(data, mindsdb_file_name, temp_dir_path)
            if failure is not None:
                return failure

        if "file" not in data:
            return http_error(
                400,
                "File field is missed",
                'The "file" field is missed in the form',
            )

        original_file_name = clear_filename(data.get("original_file_name"))

        file_path = os.path.join(temp_dir_path, data["file"])
        lowered_path = file_path.lower()
        if lowered_path.endswith((".zip", ".tar.gz")):
            if lowered_path.endswith(".zip"):
                with zipfile.ZipFile(file_path) as archive:
                    archive.extractall(temp_dir_path)
            else:
                with tarfile.open(file_path) as archive:
                    safe_extract(archive, temp_dir_path)
            os.remove(file_path)

            # No explicit directory removal on the error paths below: the
            # directory still holds extracted entries (os.rmdir would raise
            # OSError), and put() removes the whole tree anyway.
            files = os.listdir(temp_dir_path)
            if len(files) != 1:
                return http_error(400, "Wrong content.", "Archive must contain only one data file.")
            file_path = os.path.join(temp_dir_path, files[0])
            mindsdb_file_name = files[0]
            if not os.path.isfile(file_path):
                return http_error(400, "Wrong content.", "Archive must contain data file in root.")

        if Path(mindsdb_file_name).suffix != "":
            return http_error(400, "Error", "File name cannot contain extension.")

        try:
            ca.file_controller.save_file(mindsdb_file_name, file_path, file_name=original_file_name)
        except FileProcessingError as e:
            return http_error(400, "Error", str(e))
        except Exception as e:
            return http_error(500, "Error", str(e))

        return "", 200

    def _download_url_source(self, data: dict, mindsdb_file_name: str, temp_dir_path: str):
        """Validate ``data["source"]`` and download it into ``temp_dir_path``.

        On success ``data["file"]`` is set to the name of the downloaded file.

        Args:
            data: request payload; read for ``source``/``file`` and updated with ``file``.
            mindsdb_file_name: lower-cased table name, used to derive the local file name.
            temp_dir_path: directory the download is written to.

        Returns:
            ``None`` on success, otherwise the response built by ``http_error``.
        """
        if config["url_file_upload"]["enabled"] is False:
            return http_error(400, "URL file upload is disabled.", "URL file upload is disabled.")

        if "file" in data:
            return http_error(
                400,
                "Fields conflict",
                'URL source type can not be used together with "file" field.',
            )
        if "source" not in data:
            return http_error(
                400,
                "Missed file source",
                'If the file\'s source type is URL, the "source" field should be specified.',
            )

        try:
            parsed = urlparse(data["source"])
            if not (parsed.scheme and parsed.netloc):
                raise ValueError()
            url = parsed.geturl()
        except Exception:
            return http_error(
                400,
                "Invalid URL",
                f"The URL is not valid: {data['source']}",
            )

        allowed_origins = config["url_file_upload"]["allowed_origins"]
        disallowed_origins = config["url_file_upload"]["disallowed_origins"]

        if validate_urls(url, allowed_origins, disallowed_origins) is False:
            return http_error(
                400,
                "Invalid URL",
                "URL is not allowed for security reasons. Allowed hosts are: "
                f"{', '.join(allowed_origins) if allowed_origins else 'not specified'}.",
            )

        data["file"] = clear_filename(mindsdb_file_name)
        if config.is_cloud:
            if is_private_url(url):
                return http_error(400, f"URL is private: {url}")

            # NOTE(review): the size limit is applied only in cloud mode and
            # only when ctx.user_class != 1 - confirm this nesting is intended.
            if ctx.user_class != 1:
                info = requests.head(url, timeout=30)
                try:
                    file_size = int(info.headers.get("Content-Length"))
                except (TypeError, ValueError):
                    # Missing or non-numeric header: the size is unknown.
                    file_size = None

                if file_size is None:
                    return http_error(
                        400,
                        "Error getting file info",
                        "Can't determine remote file size",
                    )
                if file_size > MAX_FILE_SIZE:
                    return http_error(
                        400,
                        "File is too big",
                        f"Upload limit for file is {MAX_FILE_SIZE >> 20} MB",
                    )

        # Same timeout as the HEAD request above, so a stalled server cannot
        # block the request handler indefinitely.
        with requests.get(url, stream=True, timeout=30) as r:
            if r.status_code != 200:
                return http_error(400, "Error getting file", f"Got status code: {r.status_code}")
            file_path = os.path.join(temp_dir_path, data["file"])
            with open(file_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        return None

    @ns_conf.doc("delete_file")
    @api_endpoint_metrics("DELETE", "/files/file")
    def delete(self, name: str):
        """delete file

        Returns ``("", 200)`` on success, a 400 error when the file controller
        raises ``FileNotFoundError`` and a 500 error for any other exception.
        """
        try:
            ca.file_controller.delete_file(name)
        except FileNotFoundError:
            logger.exception(f"Error when deleting file '{name}'")
            return http_error(
                400,
                "Error deleting file",
                f"There was an error while trying to delete file with name '{name}'",
            )
        except Exception as e:
            logger.error(e)
            return http_error(
                500,
                "Error occured while deleting file",
                f"There was an error while trying to delete file with name '{name}'",
            )
        return "", 200

268 return "", 200