Skip to content

Commit 49f1b25

Browse files
committed
[importer] Implement file upload API for CSV and Excel formats with validation
1 parent a3ba8dc commit 49f1b25

File tree

6 files changed

+487
-203
lines changed

6 files changed

+487
-203
lines changed

desktop/core/base_requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ Mako==1.2.3
4242
Markdown==3.7
4343
openpyxl==3.0.9
4444
phoenixdb==1.2.1
45+
polars[calamine]==1.8.2 # Python >= 3.8
4546
prompt-toolkit==3.0.39
4647
protobuf==3.20.3
4748
pyarrow==17.0.0

desktop/core/src/desktop/api_public.py

-5
Original file line numberDiff line numberDiff line change
@@ -432,11 +432,6 @@ def taskserver_get_available_space_for_upload(request):
432432

433433
# Importer
434434

435-
@api_view(["GET"])
436-
def render_new_importer(request):
437-
django_request = get_django_request(request)
438-
return importer_api.render_new_importer(django_request)
439-
440435

441436
@api_view(["POST"])
442437
def guess_format(request):

desktop/core/src/desktop/api_public_urls_v1.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
from desktop import api_public
2121
from desktop.lib.botserver import api as botserver_api
22+
from desktop.lib.importer import api as importer_api
2223

2324
# "New" query API (i.e. connector based, lean arguments).
2425
# e.g. https://demo.gethue.com/api/query/execute/hive
@@ -157,7 +158,7 @@
157158
]
158159

159160
urlpatterns += [
    # Importer: local file upload.
    # The pattern is anchored at the end (with an optional trailing slash) so that only the
    # exact upload path resolves to this view, not every path that shares the prefix.
    re_path(r'^importer/upload/file/?$', importer_api.upload_local_file, name='importer_upload_local_file'),
]
162163

163164
urlpatterns += [

desktop/core/src/desktop/lib/importer/api.py

+247-3
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,256 @@
1414
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1515
# See the License for the specific language governing permissions and
1616
# limitations under the License.
17+
import os
18+
import re
19+
import csv
20+
import uuid
1721
import logging
22+
import tempfile
23+
from dataclasses import dataclass
24+
from pathlib import Path
25+
from typing import Any, Dict, List, Optional, Tuple, Union
1826

19-
from desktop.lib.django_util import render
27+
import polars as pl
28+
from rest_framework import status
29+
from rest_framework.decorators import api_view, parser_classes
30+
from rest_framework.parsers import JSONParser, MultiPartParser
31+
from rest_framework.request import Request
32+
from rest_framework.response import Response
33+
34+
from desktop.lib.importer.serializers import LocalFileUploadSerializer
2035

2136
# Logger used by this module; getLogger() called without a name returns the root logger.
LOG = logging.getLogger()
2237

2338

24-
def render_new_importer(request):
25-
return render('new_importer.mako', request, None)
39+
# @dataclass
40+
# class FileFormat:
41+
# """Data class representing file format configuration"""
42+
43+
# type: str
44+
# has_header: bool
45+
# quote_char: str = '"'
46+
# record_separator: str = '\\n'
47+
# field_separator: str = ','
48+
49+
50+
# class FormatDetector:
51+
# """Service class for detecting file formats"""
52+
53+
# SAMPLE_SIZE = 16384 # 16KB for sampling
54+
# MIN_LINES = 5
55+
56+
# def __init__(self, content: bytes, filename: str):
57+
# self.content = content
58+
# self.filename = filename
59+
# self._sample = self._get_sample()
60+
61+
# def _get_sample(self) -> str:
62+
# """Get a sample of file content for format detection"""
63+
# try:
64+
# return self.content[: self.SAMPLE_SIZE].decode('utf-8')
65+
# except UnicodeDecodeError:
66+
# return self.content[: self.SAMPLE_SIZE].decode('latin-1')
67+
68+
# def detect(self) -> FileFormat:
69+
# """Detect file format based on content and filename"""
70+
# if self._is_excel_file():
71+
# return self._get_excel_format()
72+
73+
# return self._detect_delimited_format()
74+
75+
# def _is_excel_file(self) -> bool:
76+
# """Check if file is Excel based on extension"""
77+
# return self.filename.lower().endswith(('.xlsx', '.xls'))
78+
79+
# def _get_excel_format(self) -> FileFormat:
80+
# """Return Excel file format configuration"""
81+
# return FileFormat(type='excel', has_header=True)
82+
83+
# def _detect_delimited_format(self) -> FileFormat:
84+
# """Detect format for delimited files like CSV"""
85+
# dialect = self._sniff_csv_dialect()
86+
87+
# return FileFormat(
88+
# type='csv',
89+
# has_header=self._detect_header(),
90+
# quote_char=dialect.quotechar,
91+
# record_separator='\\n', # Using standard newline
92+
# field_separator=dialect.delimiter,
93+
# )
94+
95+
# def _sniff_csv_dialect(self) -> csv.Dialect:
96+
# """Detect CSV dialect using csv.Sniffer"""
97+
# try:
98+
# return csv.Sniffer().sniff(self._sample)
99+
# except csv.Error:
100+
# # Fallback to standard CSV format
101+
# return csv.excel
102+
103+
# def _detect_header(self) -> bool:
104+
# """Detect if file has headers"""
105+
# try:
106+
# return csv.Sniffer().has_header(self._sample)
107+
# except csv.Error:
108+
# # Default to True if detection fails
109+
# return True
110+
111+
112+
# @api_view(['POST'])
113+
# @parser_classes([JSONParser, MultiPartParser])
114+
# def detect_format(request: Request) -> Response:
115+
# """
116+
# Detects and returns the format configuration for input files/data sources.
117+
118+
# Args:
119+
# request: REST framework Request object containing either:
120+
# - fileFormat: Dict with file details for HDFS files
121+
# - file: Uploaded file for local files
122+
123+
# Returns:
124+
# Response with format configuration:
125+
# - type: Detected format type (csv, excel)
126+
# - hasHeader: Boolean indicating header presence
127+
# - fieldSeparator: Field delimiter for CSV
128+
# - recordSeparator: Record separator
129+
# - quoteChar: Quote character
130+
# - status: Operation status code
131+
132+
# Raises:
133+
# 400: Bad Request if file format/content cannot be processed
134+
# """
135+
# try:
136+
# if 'fileFormat' in request.data:
137+
# return _handle_hdfs_file(request)
138+
# elif 'file' in request.FILES:
139+
# return _handle_uploaded_file(request.FILES['file'])
140+
# else:
141+
# return Response({'error': 'No file or file format provided'}, status=status.HTTP_400_BAD_REQUEST)
142+
# except Exception as e:
143+
# return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST)
144+
145+
146+
# def _handle_uploaded_file(file) -> Response:
147+
# """Handle format detection for uploaded files"""
148+
# detector = FormatDetector(content=file.read(), filename=file.name)
149+
# file_format = detector.detect()
150+
151+
# return Response(
152+
# {
153+
# 'type': file_format.type,
154+
# 'hasHeader': file_format.has_header,
155+
# 'quoteChar': file_format.quote_char,
156+
# 'recordSeparator': file_format.record_separator,
157+
# 'fieldSeparator': file_format.field_separator,
158+
# 'status': 0,
159+
# }
160+
# )
161+
162+
163+
# def _handle_hdfs_file(request: Request) -> Response:
164+
# """Handle format detection for HDFS files"""
165+
# file_format = request.data.get('fileFormat', {})
166+
# path = file_format.get('path')
167+
168+
# if not path:
169+
# return Response({'error': 'No path provided'}, status=status.HTTP_400_BAD_REQUEST)
170+
171+
# if not request.fs.isfile(path):
172+
# return Response({'error': f'Path {path} is not a file'}, status=status.HTTP_400_BAD_REQUEST)
173+
174+
# with request.fs.open(path) as stream:
175+
# detector = FormatDetector(content=stream.read(FormatDetector.SAMPLE_SIZE), filename=path)
176+
# file_format = detector.detect()
177+
178+
# return Response(
179+
# {
180+
# 'type': file_format.type,
181+
# 'hasHeader': file_format.has_header,
182+
# 'quoteChar': file_format.quote_char,
183+
# 'recordSeparator': file_format.record_separator,
184+
# 'fieldSeparator': file_format.field_separator,
185+
# 'status': 0,
186+
# }
187+
# )
188+
189+
190+
@api_view(['POST'])
@parser_classes([MultiPartParser])
def upload_local_file(request: Request) -> Response:
    """
    Upload and process a CSV or Excel file, converting it to CSV format if needed.

    The multipart payload is validated with LocalFileUploadSerializer and the accepted
    file is handed to process_uploaded_file under a generated, filesystem-safe name.

    Args:
      request: REST framework Request whose multipart body carries the upload under ``file``.

    Returns:
      Response with:
        - 201 and the metadata dict returned by process_uploaded_file on success
        - 400 and the serializer errors when validation fails
        - 500 and an ``error`` message when processing raises
    """
    # Validate the request data using the serializer
    serializer = LocalFileUploadSerializer(data=request.data)

    if not serializer.is_valid():
        return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)

    try:
        upload_file = serializer.validated_data['file']

        # Use the text after the last dot, the same rule the serializer validates against,
        # so the two can never disagree on names such as ".csv".
        file_extension = upload_file.name.rsplit('.', 1)[-1].lower()

        # Generate a unique filename. Both the username and the client-supplied name end up
        # in a filesystem path, so both are reduced to [0-9a-zA-Z_] first.
        safe_username = re.sub(r'[^0-9a-zA-Z]+', '_', request.user.username)
        safe_original_name = re.sub(r'[^0-9a-zA-Z]+', '_', upload_file.name)
        unique_id = uuid.uuid4().hex[:8]

        filename = f"{safe_username}_{unique_id}_{safe_original_name}"

        # Process the file based on its type
        result = process_uploaded_file(upload_file, filename, file_extension)
    except Exception as e:
        # Top-level boundary of the endpoint: record the traceback before answering with a 500,
        # otherwise the failure leaves no trace on the server side.
        LOG.exception("Error processing uploaded file")
        return Response({"error": f"Error processing file: {str(e)}"}, status=status.HTTP_500_INTERNAL_SERVER_ERROR)

    return Response(result, status=status.HTTP_201_CREATED)
222+
223+
224+
def process_uploaded_file(upload_file, filename: str, file_extension: str) -> Dict[str, Any]:
    """
    Process the uploaded file and convert to CSV if needed.

    The upload is read fully into memory, parsed with Polars and written back out as
    ``<filename>.csv`` in the system temporary directory.

    Args:
      upload_file: The uploaded file object (must provide ``read()``)
      filename: The base filename to use, without extension
      file_extension: The file extension (csv, xlsx, xls); anything other than
        ``csv`` is read as Excel

    Returns:
      Dict containing file metadata:
        - filename: Name of the written CSV file
        - file_path: Full path of the written CSV file
        - row_count: Number of rows in the parsed data
        - column_count: Number of columns in the parsed data
        - file_size_bytes: Size of the written CSV file

    Raises:
      Exception: Whatever reading, parsing or writing raised; any partially written
        output file is removed before the exception propagates.
    """
    # Create the output file in the temp directory with our generated filename
    output_path = os.path.join(tempfile.gettempdir(), f"{filename}.csv")

    try:
        if file_extension == 'csv':
            df = pl.read_csv(upload_file.read())
        else:
            # For Excel files, use Polars and its default Calamine engine to read and convert to CSV.
            # TODO: Currently reads the first sheet. Check if we need to support multiple sheets or specific sheets as input.
            df = pl.read_excel(upload_file.read())

        df.write_csv(output_path)
        file_stats = os.stat(output_path)
    except Exception:
        # Clean up the file if there was an error. The removal is best-effort: a failure here
        # must not replace the exception that explains why processing failed.
        try:
            os.remove(output_path)
        except FileNotFoundError:
            pass  # Nothing was written yet
        except OSError:
            LOG.warning("Could not remove partially written file %s", output_path)
        raise

    # Return metadata about the processed file
    # TODO: Verify response fields (e.g. whether the source file type, csv or excel, should be included)
    return {
        'filename': os.path.basename(output_path),
        'file_path': output_path,
        'row_count': len(df),
        'column_count': len(df.columns),
        'file_size_bytes': file_stats.st_size,
    }
desktop/core/src/desktop/lib/importer/serializers.py

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#!/usr/bin/env python
2+
# Licensed to Cloudera, Inc. under one
3+
# or more contributor license agreements. See the NOTICE file
4+
# distributed with this work for additional information
5+
# regarding copyright ownership. Cloudera, Inc. licenses this file
6+
# to you under the Apache License, Version 2.0 (the
7+
# "License"); you may not use this file except in compliance
8+
# with the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
from rest_framework import serializers
18+
19+
20+
class LocalFileUploadSerializer(serializers.Serializer):
    """Serializer for file upload validation.

    This serializer validates that the uploaded file is present and has an
    acceptable file format and size.

    Attributes:
      ALLOWED_EXTENSIONS: Lower-case file extensions that are accepted
      MAX_FILE_SIZE_BYTES: Largest accepted upload size, in bytes
      file: File field that must be included in the request
    """

    ALLOWED_EXTENSIONS = ('csv', 'xlsx', 'xls')

    # TODO: Check upper limit for file size
    MAX_FILE_SIZE_BYTES = 150 * 1024 * 1024  # 150 MiB in bytes

    file = serializers.FileField(required=True, help_text="CSV or Excel file to upload and process")

    def validate_file(self, value):
        """Check the extension and size of the uploaded file.

        Args:
          value: The uploaded file object; its ``name`` and ``size`` are inspected

        Returns:
          The unchanged file object when it passes both checks.

        Raises:
          serializers.ValidationError: If the name has no supported extension or the
            file is larger than MAX_FILE_SIZE_BYTES.
        """
        # Require "<stem>.<extension>": a bare name such as "csv" or ".xlsx" has no real
        # extension and must not pass just because its text matches an allowed one.
        stem, dot, extension = value.name.rpartition('.')
        if not (stem and dot) or extension.lower() not in self.ALLOWED_EXTENSIONS:
            raise serializers.ValidationError("Unsupported file format. Please upload a CSV or Excel file.")

        if value.size > self.MAX_FILE_SIZE_BYTES:
            raise serializers.ValidationError("File too large. Maximum file size is 150 MiB.")

        return value

0 commit comments

Comments
 (0)