|
14 | 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15 | 15 | # See the License for the specific language governing permissions and
|
16 | 16 | # limitations under the License.
|
| 17 | +import os |
| 18 | +import re |
| 19 | +import csv |
| 20 | +import uuid |
17 | 21 | import logging
|
| 22 | +import tempfile |
| 23 | +from dataclasses import dataclass |
| 24 | +from pathlib import Path |
| 25 | +from typing import Any, Dict, List, Optional, Tuple, Union |
18 | 26 |
|
19 |
| -from desktop.lib.django_util import render |
| 27 | +import polars as pl |
| 28 | +from rest_framework import status |
| 29 | +from rest_framework.decorators import api_view, parser_classes |
| 30 | +from rest_framework.parsers import JSONParser, MultiPartParser |
| 31 | +from rest_framework.request import Request |
| 32 | +from rest_framework.response import Response |
| 33 | + |
| 34 | +from desktop.lib.importer.serializers import LocalFileUploadSerializer |
20 | 35 |
|
21 | 36 | LOG = logging.getLogger()
|
22 | 37 |
|
23 | 38 |
|
24 |
| -def render_new_importer(request): |
25 |
| - return render('new_importer.mako', request, None) |
| 39 | +# @dataclass |
| 40 | +# class FileFormat: |
| 41 | +# """Data class representing file format configuration""" |
| 42 | + |
| 43 | +# type: str |
| 44 | +# has_header: bool |
| 45 | +# quote_char: str = '"' |
| 46 | +# record_separator: str = '\\n' |
| 47 | +# field_separator: str = ',' |
| 48 | + |
| 49 | + |
| 50 | +# class FormatDetector: |
| 51 | +# """Service class for detecting file formats""" |
| 52 | + |
| 53 | +# SAMPLE_SIZE = 16384 # 16KB for sampling |
| 54 | +# MIN_LINES = 5 |
| 55 | + |
| 56 | +# def __init__(self, content: bytes, filename: str): |
| 57 | +# self.content = content |
| 58 | +# self.filename = filename |
| 59 | +# self._sample = self._get_sample() |
| 60 | + |
| 61 | +# def _get_sample(self) -> str: |
| 62 | +# """Get a sample of file content for format detection""" |
| 63 | +# try: |
| 64 | +# return self.content[: self.SAMPLE_SIZE].decode('utf-8') |
| 65 | +# except UnicodeDecodeError: |
| 66 | +# return self.content[: self.SAMPLE_SIZE].decode('latin-1') |
| 67 | + |
| 68 | +# def detect(self) -> FileFormat: |
| 69 | +# """Detect file format based on content and filename""" |
| 70 | +# if self._is_excel_file(): |
| 71 | +# return self._get_excel_format() |
| 72 | + |
| 73 | +# return self._detect_delimited_format() |
| 74 | + |
| 75 | +# def _is_excel_file(self) -> bool: |
| 76 | +# """Check if file is Excel based on extension""" |
| 77 | +# return self.filename.lower().endswith(('.xlsx', '.xls')) |
| 78 | + |
| 79 | +# def _get_excel_format(self) -> FileFormat: |
| 80 | +# """Return Excel file format configuration""" |
| 81 | +# return FileFormat(type='excel', has_header=True) |
| 82 | + |
| 83 | +# def _detect_delimited_format(self) -> FileFormat: |
| 84 | +# """Detect format for delimited files like CSV""" |
| 85 | +# dialect = self._sniff_csv_dialect() |
| 86 | + |
| 87 | +# return FileFormat( |
| 88 | +# type='csv', |
| 89 | +# has_header=self._detect_header(), |
| 90 | +# quote_char=dialect.quotechar, |
| 91 | +# record_separator='\\n', # Using standard newline |
| 92 | +# field_separator=dialect.delimiter, |
| 93 | +# ) |
| 94 | + |
| 95 | +# def _sniff_csv_dialect(self) -> csv.Dialect: |
| 96 | +# """Detect CSV dialect using csv.Sniffer""" |
| 97 | +# try: |
| 98 | +# return csv.Sniffer().sniff(self._sample) |
| 99 | +# except csv.Error: |
| 100 | +# # Fallback to standard CSV format |
| 101 | +# return csv.excel |
| 102 | + |
| 103 | +# def _detect_header(self) -> bool: |
| 104 | +# """Detect if file has headers""" |
| 105 | +# try: |
| 106 | +# return csv.Sniffer().has_header(self._sample) |
| 107 | +# except csv.Error: |
| 108 | +# # Default to True if detection fails |
| 109 | +# return True |
| 110 | + |
| 111 | + |
| 112 | +# @api_view(['POST']) |
| 113 | +# @parser_classes([JSONParser, MultiPartParser]) |
| 114 | +# def detect_format(request: Request) -> Response: |
| 115 | +# """ |
| 116 | +# Detects and returns the format configuration for input files/data sources. |
| 117 | + |
| 118 | +# Args: |
| 119 | +# request: REST framework Request object containing either: |
| 120 | +# - fileFormat: Dict with file details for HDFS files |
| 121 | +# - file: Uploaded file for local files |
| 122 | + |
| 123 | +# Returns: |
| 124 | +# Response with format configuration: |
| 125 | +# - type: Detected format type (csv, excel) |
| 126 | +# - hasHeader: Boolean indicating header presence |
| 127 | +# - fieldSeparator: Field delimiter for CSV |
| 128 | +# - recordSeparator: Record separator |
| 129 | +# - quoteChar: Quote character |
| 130 | +# - status: Operation status code |
| 131 | + |
| 132 | +# Raises: |
| 133 | +# 400: Bad Request if file format/content cannot be processed |
| 134 | +# """ |
| 135 | +# try: |
| 136 | +# if 'fileFormat' in request.data: |
| 137 | +# return _handle_hdfs_file(request) |
| 138 | +# elif 'file' in request.FILES: |
| 139 | +# return _handle_uploaded_file(request.FILES['file']) |
| 140 | +# else: |
| 141 | +# return Response({'error': 'No file or file format provided'}, status=status.HTTP_400_BAD_REQUEST) |
| 142 | +# except Exception as e: |
| 143 | +# return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) |
| 144 | + |
| 145 | + |
| 146 | +# def _handle_uploaded_file(file) -> Response: |
| 147 | +# """Handle format detection for uploaded files""" |
| 148 | +# detector = FormatDetector(content=file.read(), filename=file.name) |
| 149 | +# file_format = detector.detect() |
| 150 | + |
| 151 | +# return Response( |
| 152 | +# { |
| 153 | +# 'type': file_format.type, |
| 154 | +# 'hasHeader': file_format.has_header, |
| 155 | +# 'quoteChar': file_format.quote_char, |
| 156 | +# 'recordSeparator': file_format.record_separator, |
| 157 | +# 'fieldSeparator': file_format.field_separator, |
| 158 | +# 'status': 0, |
| 159 | +# } |
| 160 | +# ) |
| 161 | + |
| 162 | + |
| 163 | +# def _handle_hdfs_file(request: Request) -> Response: |
| 164 | +# """Handle format detection for HDFS files""" |
| 165 | +# file_format = request.data.get('fileFormat', {}) |
| 166 | +# path = file_format.get('path') |
| 167 | + |
| 168 | +# if not path: |
| 169 | +# return Response({'error': 'No path provided'}, status=status.HTTP_400_BAD_REQUEST) |
| 170 | + |
| 171 | +# if not request.fs.isfile(path): |
| 172 | +# return Response({'error': f'Path {path} is not a file'}, status=status.HTTP_400_BAD_REQUEST) |
| 173 | + |
| 174 | +# with request.fs.open(path) as stream: |
| 175 | +# detector = FormatDetector(content=stream.read(FormatDetector.SAMPLE_SIZE), filename=path) |
| 176 | +# file_format = detector.detect() |
| 177 | + |
| 178 | +# return Response( |
| 179 | +# { |
| 180 | +# 'type': file_format.type, |
| 181 | +# 'hasHeader': file_format.has_header, |
| 182 | +# 'quoteChar': file_format.quote_char, |
| 183 | +# 'recordSeparator': file_format.record_separator, |
| 184 | +# 'fieldSeparator': file_format.field_separator, |
| 185 | +# 'status': 0, |
| 186 | +# } |
| 187 | +# ) |
| 188 | + |
| 189 | + |
@api_view(['POST'])
@parser_classes([MultiPartParser])
def upload_local_file(request: Request) -> Response:
    """
    Upload a CSV or Excel file and store it on the server as a CSV file.

    The multipart payload is validated with LocalFileUploadSerializer, a unique
    target filename is built from the username, a short random id and the
    sanitised original filename, and the conversion itself is delegated to
    process_uploaded_file.

    Args:
      request: REST framework request whose data carries the uploaded 'file'.

    Returns:
      Response with:
        - 201 and the metadata dict returned by process_uploaded_file on success
        - 400 and the serializer errors when validation fails
        - 500 and an 'error' message when processing raises
    """
    serializer = LocalFileUploadSerializer(data=request.data)
    if not serializer.is_valid():
        return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)

    try:
        upload_file = serializer.validated_data['file']
        # Extension without the leading dot, e.g. 'csv' or 'xlsx'.
        file_extension = Path(upload_file.name).suffix.lower()[1:]

        # The generated name is later joined onto a directory path, so neither
        # component may contain path separators. Ordinary usernames (letters,
        # digits and _.@+-) pass through unchanged.
        safe_username = re.sub(r'[^0-9a-zA-Z_.@+-]+', '_', request.user.username)
        safe_original_name = re.sub(r'[^0-9a-zA-Z]+', '_', upload_file.name)
        unique_id = uuid.uuid4().hex[:8]

        filename = f"{safe_username}_{unique_id}_{safe_original_name}"

        # Process the file based on its type
        result = process_uploaded_file(upload_file, filename, file_extension)

        return Response(result, status=status.HTTP_201_CREATED)

    except Exception as e:
        # Keep the traceback in the server log; the client only gets the message.
        logging.exception("Error processing uploaded file")
        return Response({"error": f"Error processing file: {str(e)}"}, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
| 222 | + |
| 223 | + |
def process_uploaded_file(upload_file, filename: str, file_extension: str) -> Dict[str, Any]:
    """
    Read an uploaded CSV or Excel file and write it out as a CSV file in the temp directory.

    Args:
      upload_file: The uploaded file object; its whole content is read with read()
      filename: The base filename to use; '.csv' is appended to it
      file_extension: The file extension without the dot. 'csv' is parsed as CSV,
        any other value is parsed as Excel

    Returns:
      Dict containing file metadata: filename, file_path, row_count,
      column_count and file_size_bytes

    Raises:
      Exception: Any error raised while reading, converting or writing the file is
        re-raised after the output file (if any) has been removed
    """
    output_path = os.path.join(tempfile.gettempdir(), f"{filename}.csv")

    try:
        content = upload_file.read()

        if file_extension == 'csv':
            df = pl.read_csv(content)
        else:
            # For Excel files, use Polars and its default Calamine engine to read and convert to CSV.
            # TODO: Currently reads the first sheet. Check if we need to support multiple sheets or specific sheets as input.
            df = pl.read_excel(content)

        df.write_csv(output_path)

        # Return metadata about the processed file
        file_stats = os.stat(output_path)

        # TODO: Verify response fields
        return {
            'filename': os.path.basename(output_path),
            'file_path': output_path,
            'row_count': len(df),
            'column_count': len(df.columns),
            'file_size_bytes': file_stats.st_size,
        }

    except Exception:
        # Best-effort cleanup: the output may never have been created, and a
        # failure to delete it must not hide the error that brought us here.
        try:
            os.remove(output_path)
        except OSError:
            pass
        raise
0 commit comments