7 changes: 0 additions & 7 deletions deepset_cloud_sdk/_api/files.py

@@ -18,7 +18,6 @@

 from deepset_cloud_sdk._api.deepset_cloud_api import DeepsetCloudAPI
 from deepset_cloud_sdk._api.upload_sessions import WriteMode
-from deepset_cloud_sdk._utils.constants import SUPPORTED_TYPE_SUFFIXES
 from deepset_cloud_sdk._utils.datetime import from_isoformat

 logger = structlog.get_logger(__name__)

@@ -225,12 +224,6 @@ async def direct_upload_in_memory(
             FAIL - fails to upload the file with the same name.
         :return: ID of the uploaded file.
         """
-        file_name_suffix = Path(file_name).suffix
-        if file_name_suffix not in SUPPORTED_TYPE_SUFFIXES:
-            raise NotMatchingFileTypeException(
-                f"File name {file_name} is not a supported file type. Please use one of {', '.join(SUPPORTED_TYPE_SUFFIXES)} for text uploads."
-            )
-
         response = await self._deepset_cloud_api.post(
             workspace_name,
             "files",
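With this guard removed, `direct_upload_in_memory` no longer rejects unsupported suffixes client-side before the POST; validation is left to the server. A caller who still wants to fail fast can keep a local check — a minimal sketch, assuming a caller-chosen allow-list (`ALLOWED_SUFFIXES` and `ensure_supported` are illustrative, not SDK names):

    from pathlib import Path

    # Illustrative allow-list chosen by the caller; not an SDK constant.
    ALLOWED_SUFFIXES = {".txt", ".pdf", ".docx", ".md"}

    def ensure_supported(file_name: str) -> None:
        """Fail fast before calling direct_upload_in_memory."""
        suffix = Path(file_name).suffix
        if suffix not in ALLOWED_SUFFIXES:
            raise ValueError(f"Unsupported file type {suffix!r}; expected one of {sorted(ALLOWED_SUFFIXES)}")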
67 changes: 13 additions & 54 deletions deepset_cloud_sdk/_service/files_service.py

@@ -9,17 +9,7 @@
 from collections import defaultdict
 from contextlib import asynccontextmanager
 from pathlib import Path
-from typing import (
-    Any,
-    AsyncGenerator,
-    Dict,
-    List,
-    Optional,
-    Sequence,
-    Set,
-    Tuple,
-    Union,
-)
+from typing import Any, AsyncGenerator, Dict, List, Optional, Sequence, Union
 from uuid import UUID

 import structlog

@@ -41,7 +31,6 @@
     WriteMode,
 )
 from deepset_cloud_sdk._s3.upload import S3, S3UploadResult, S3UploadSummary
-from deepset_cloud_sdk._utils.constants import SUPPORTED_TYPE_SUFFIXES
 from deepset_cloud_sdk.models import DeepsetCloudFileBase

 logger = structlog.get_logger(__name__)

@@ -226,11 +215,7 @@ async def upload_file_paths(
         if len(file_paths) <= DIRECT_UPLOAD_THRESHOLD:
             logger.info("Uploading files to deepset Cloud.", file_paths=file_paths)
             _coroutines = []
-            _raw_files = [
-                path
-                for path in file_paths
-                if path.suffix in SUPPORTED_TYPE_SUFFIXES and not path.name.endswith(META_SUFFIX)
-            ]
+            _raw_files = [path for path in file_paths if not path.name.endswith(META_SUFFIX)]
             for file_path in _raw_files:
                 meta: Dict[str, Any] = {}
                 meta_path = Path(str(file_path) + META_SUFFIX)

@@ -315,12 +300,6 @@ def _validate_file_paths(file_paths: List[Path]) -> None:
         :raises ValueError: If the file paths are invalid.
         """
         logger.info("Validating file paths and metadata.")
-        for file_path in file_paths:
-            if file_path.suffix not in SUPPORTED_TYPE_SUFFIXES:
-                raise ValueError(
-                    f"Invalid file extension: {file_path.suffix}. Refer to the list of supported file types in `SUPPORTED_TYPE_SUFFIXES`. "
-                    "Metadata files should have the `.meta.json` extension."
-                )
         meta_file_names = list(
             map(
                 lambda fp: os.path.basename(fp),

@@ -363,45 +342,26 @@ def _remove_duplicates(file_paths: List[Path]) -> List[Path]:

         return most_recent_files

-    @staticmethod
-    def _get_allowed_file_types(desired_file_types: Optional[List[Any]]) -> List[str]:
-        """Filter `SUPPORTED_TYPE_SUFFIXES` by `desired_file_types`.
-
-        If desired_file_types is empty, all supported file types are returned.
-        :param desired_file_types: A list of desired file types.
-        :return: A list of desired file types that can be processed by deepset Cloud.
-        """
-        if not desired_file_types:
-            return SUPPORTED_TYPE_SUFFIXES
-
-        desired_types_processed: Set[str] = {
-            str(file_type) if str(file_type).startswith(".") else f".{str(file_type)}"
-            for file_type in desired_file_types
-        }
-        allowed_types: Set[str] = {
-            file_type for file_type in SUPPORTED_TYPE_SUFFIXES if file_type in desired_types_processed
-        }
-
-        return list(allowed_types)
-
     @staticmethod
     def _preprocess_paths(
         paths: List[Path],
         spinner: yaspin.Spinner = None,
         recursive: bool = False,
-        desired_file_types: Optional[List[str]] = None,
+        desired_file_types: List[str] | None = None,
Review thread on `desired_file_types: List[str] | None = None`:

Contributor: Nit: we could use something like the following so that you can get rid of all the None checks:

    class AlwaysContains:
        def __contains__(self, item):
            return True

    fake_list = AlwaysContains()

    print(42 in fake_list)       # True
    print("hello" in fake_list)  # True
    print(None in fake_list)     # True

Contributor: (it's in my opinion easy to miss the check somewhere)

Member (author): You mean replacing the native List objects with a class? I would need to adjust it within the typer Argument => CLI parsing as well, but it might work.

Contributor: More like replacing the None with an object that behaves like a list (basically simulating the null object pattern: https://refactoring.guru/introduce-null-object).
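A minimal sketch of how that null object could slot in, assuming the code only ever uses `in` on `desired_file_types` (the `AllFileTypes` and `keep_desired` names are hypothetical, not part of this PR):

    from pathlib import Path
    from typing import Container, Iterable, List

    class AllFileTypes(Container[str]):
        """Null object: a 'list' of suffixes that contains everything."""

        def __contains__(self, item: object) -> bool:
            return True

    def keep_desired(names: Iterable[str], desired: Container[str] = AllFileTypes()) -> List[str]:
        # No `if desired is not None` branch needed: the default contains every suffix.
        return [name for name in names if Path(name).suffix in desired]

    print(keep_desired(["a.pdf", "b.docx"]))                    # ['a.pdf', 'b.docx']
    print(keep_desired(["a.pdf", "b.docx"], desired=[".pdf"]))  # ['a.pdf']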

     ) -> List[Path]:
         all_files = FilesService._get_file_paths(paths, recursive=recursive)

-        allowed_file_types: List[str] = FilesService._get_allowed_file_types(desired_file_types)
-        allowed_meta_types: Tuple = tuple(f"{file_type}.meta.json" for file_type in allowed_file_types)
-        file_paths = [
-            path
-            for path in all_files
-            if path.is_file() and (path.suffix in allowed_file_types and not str(path).endswith(META_SUFFIX))
-        ]
-        meta_file_path = [path for path in all_files if path.is_file() and str(path).endswith(allowed_meta_types)]
+        file_paths = [path for path in all_files if path.is_file() and not str(path).endswith(META_SUFFIX)]
+        if desired_file_types is not None:
+            file_paths = [path for path in file_paths if path.suffix in desired_file_types]

+        meta_file_path = [path for path in all_files if path.is_file() and str(path).endswith(".meta.json")]
+        if desired_file_types is not None:
+            meta_file_path = [
+                path
+                for path in meta_file_path
+                if str(path).endswith(tuple(f"{file_type}.meta.json" for file_type in desired_file_types))
+            ]
         combined_paths = meta_file_path + file_paths

         combined_paths = FilesService._remove_duplicates(combined_paths)
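As a quick worked check of the new filtering expressions (a standalone sketch that drops the `is_file()` checks, since these paths are not real files): for a folder containing `a.pdf`, `a.pdf.meta.json`, and `b.docx`, passing `desired_file_types=[".pdf"]` keeps the PDF plus its metadata file and drops the rest:

    from pathlib import Path

    META_SUFFIX = ".meta.json"
    all_files = [Path("a.pdf"), Path("a.pdf.meta.json"), Path("b.docx")]
    desired = [".pdf"]

    file_paths = [p for p in all_files if not str(p).endswith(META_SUFFIX) and p.suffix in desired]
    meta_paths = [p for p in all_files if str(p).endswith(tuple(f"{t}{META_SUFFIX}" for t in desired))]
    print(meta_paths + file_paths)  # [PosixPath('a.pdf.meta.json'), PosixPath('a.pdf')] on POSIX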
@@ -456,7 +416,6 @@ async def upload(
             Use this to speed up the upload process and if you are not running concurrent uploads for the same files.
         :raises TimeoutError: If blocking is True and the ingestion takes longer than timeout_s.
         """
-        desired_file_types = desired_file_types or SUPPORTED_TYPE_SUFFIXES
         logger.info("Getting valid files from file path. This may take a few minutes.", recursive=recursive)

         if show_progress:
1 change: 0 additions & 1 deletion deepset_cloud_sdk/_utils/constants.py

This file was deleted.

1 change: 0 additions & 1 deletion deepset_cloud_sdk/cli.py

@@ -63,7 +63,6 @@ def upload(  # pylint: disable=too-many-arguments
         Use this to speed up the upload process. Make sure you are not running concurrent uploads for the same files.
     :param safe_mode: If `True`, disables ingesting files in parallel.
     """
-    use_type = use_type or [".txt", ".pdf"]
    sync_upload(
        paths=paths,
        api_key=api_key,
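With this default removed, running the CLI without a file-type restriction now uploads every file type rather than only `.txt` and `.pdf`. Assuming typer exposes the `use_type` parameter as a repeatable `--use-type` option (an inference from the parameter name, not verified against the released CLI), the old behavior could be restored with:

    deepset-cloud upload ./data --use-type .txt --use-type .pdf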
6 changes: 2 additions & 4 deletions deepset_cloud_sdk/workflows/async_client/files.py

@@ -20,7 +20,6 @@
 )
 from deepset_cloud_sdk._s3.upload import S3UploadSummary
 from deepset_cloud_sdk._service.files_service import FilesService
-from deepset_cloud_sdk._utils.constants import SUPPORTED_TYPE_SUFFIXES
 from deepset_cloud_sdk.models import DeepsetCloudFile, DeepsetCloudFileBytes


@@ -147,13 +146,12 @@ async def upload(
     :param timeout_s: Timeout in seconds for the upload.
     :param show_progress: Shows the upload progress.
     :param recursive: Uploads files from subdirectories as well.
-    :param desired_file_types: A list of allowed file types to upload, defaults to
-        `[".txt", ".pdf", ".docx", ".pptx", ".xlsx", ".xml", ".csv", ".html", ".md", ".json"]`
+    :param desired_file_types: A list of allowed file types to upload. If not provided, all
+        files are uploaded.
     :param enable_parallel_processing: If `True`, the deepset Cloud will ingest the files in parallel.
         Use this to speed up the upload process and if you are not running concurrent uploads for the same files.
     :param safe_mode: If `True`, the deepset Cloud will not ingest the files in parallel.
     """
-    desired_file_types = desired_file_types or SUPPORTED_TYPE_SUFFIXES
     async with FilesService.factory(_get_config(api_key=api_key, api_url=api_url, safe_mode=safe_mode)) as file_service:
         return await file_service.upload(
             workspace_name=workspace_name,
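Under the new semantics, omitting `desired_file_types` uploads everything found under the given paths. A usage sketch, assuming API credentials and the workspace are configured via the SDK's usual environment defaults (only the parameters visible in this diff are shown; `./data` is illustrative):

    import asyncio
    from pathlib import Path

    from deepset_cloud_sdk.workflows.async_client.files import upload

    async def main() -> None:
        # None (the default) now means "upload every file found" ...
        await upload(paths=[Path("./data")], desired_file_types=None)
        # ... while a list still restricts by suffix (plus matching .meta.json files).
        await upload(paths=[Path("./data")], desired_file_types=[".pdf"])

    asyncio.run(main())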
6 changes: 2 additions & 4 deletions deepset_cloud_sdk/workflows/sync_client/files.py

@@ -15,7 +15,6 @@
     WriteMode,
 )
 from deepset_cloud_sdk._s3.upload import S3UploadSummary
-from deepset_cloud_sdk._utils.constants import SUPPORTED_TYPE_SUFFIXES
 from deepset_cloud_sdk.models import DeepsetCloudFile, DeepsetCloudFileBytes
 from deepset_cloud_sdk.workflows.async_client.files import download as async_download
 from deepset_cloud_sdk.workflows.async_client.files import (

@@ -69,13 +68,12 @@ def upload(  # pylint: disable=too-many-arguments
     :param timeout_s: Timeout in seconds for the `blocking` parameter.
     :param show_progress: Shows the upload progress.
     :param recursive: Uploads files from subfolders as well.
-    :param desired_file_types: A list of allowed file types to upload, defaults to
-        `[".txt", ".pdf", ".docx", ".pptx", ".xlsx", ".xml", ".csv", ".html", ".md", ".json"]`
+    :param desired_file_types: A list of allowed file types to upload. If not provided, all
+        files are uploaded.
     :param enable_parallel_processing: If `True`, deepset Cloud ingests files in parallel.
         Use this to speed up the upload process. Make sure you are not running concurrent uploads for the same files.
     :param safe_mode: If `True`, disables ingesting files in parallel.
     """
-    desired_file_types = desired_file_types or SUPPORTED_TYPE_SUFFIXES
     return asyncio.run(
         async_upload(
             paths=paths,
10 changes: 3 additions & 7 deletions tests/integration/service/test_integration_files_service.py

@@ -9,11 +9,7 @@
 from deepset_cloud_sdk._api.config import CommonConfig
 from deepset_cloud_sdk._api.files import File
 from deepset_cloud_sdk._api.upload_sessions import WriteMode
-from deepset_cloud_sdk._service.files_service import (
-    META_SUFFIX,
-    SUPPORTED_TYPE_SUFFIXES,
-    FilesService,
-)
+from deepset_cloud_sdk._service.files_service import META_SUFFIX, FilesService
 from deepset_cloud_sdk.models import DeepsetCloudFile, DeepsetCloudFileBytes


@@ -68,7 +64,7 @@ async def test_direct_upload_path_multiple_file_types(
         blocking=True,
         write_mode=WriteMode.KEEP,
         timeout_s=timeout,
-        desired_file_types=SUPPORTED_TYPE_SUFFIXES,
+        desired_file_types=[".csv", ".docx", ".html", ".json", ".md", ".txt", ".pdf", ".pptx", ".xlsx", ".xml"],
     )
     assert result.total_files == 10
     assert result.successful_upload_count == 10

@@ -159,7 +155,7 @@ async def test_async_upload_multiple_file_types(
         blocking=True,
         write_mode=WriteMode.KEEP,
         timeout_s=timeout,
-        desired_file_types=SUPPORTED_TYPE_SUFFIXES,
+        desired_file_types=[".csv", ".docx", ".html", ".json", ".md", ".txt", ".pdf", ".pptx", ".xlsx", ".xml"],
     )
     assert result.total_files == 22
     assert result.successful_upload_count == 22
9 changes: 0 additions & 9 deletions tests/unit/api/test_files.py

@@ -376,15 +376,6 @@ async def test_direct_upload_with_path_as_string(self, files_api: FilesAPI, mocked_deepset_cloud_api: Mock

 @pytest.mark.asyncio
 class TestDirectUploadText:
-    async def test_direct_upload_file_for_wrong_file_type_name(self, files_api: FilesAPI) -> None:
-        with pytest.raises(NotMatchingFileTypeException):
-            await files_api.direct_upload_in_memory(
-                workspace_name="test_workspace",
-                file_name="basic.xls",
-                content=b"some text",
-                meta={},
-            )
-
     @pytest.mark.parametrize("error_code", [httpx.codes.NOT_FOUND, httpx.codes.SERVICE_UNAVAILABLE])
     async def test_direct_upload_file_failed(
         self, files_api: FilesAPI, mocked_deepset_cloud_api: Mock, error_code: int
34 changes: 3 additions & 31 deletions tests/unit/service/test_files_service.py

@@ -28,10 +28,7 @@
     WriteMode,
 )
 from deepset_cloud_sdk._s3.upload import S3UploadResult, S3UploadSummary
-from deepset_cloud_sdk._service.files_service import (
-    SUPPORTED_TYPE_SUFFIXES,
-    FilesService,
-)
+from deepset_cloud_sdk._service.files_service import FilesService
 from deepset_cloud_sdk.models import DeepsetCloudFile, UserInfo


@@ -212,7 +209,7 @@ async def test_upload_paths_to_folder(
         paths=[Path("./tests/data/upload_folder")],
         blocking=True,
         timeout_s=300,
-        desired_file_types=SUPPORTED_TYPE_SUFFIXES,
+        desired_file_types=[".csv", ".docx", ".html", ".json", ".md", ".txt", ".pdf", ".pptx", ".xlsx", ".xml"],
     )
     assert mocked_upload_file_paths.called
     assert "test_workspace" == mocked_upload_file_paths.call_args[1]["workspace_name"]

@@ -251,7 +248,7 @@ async def test_upload_paths_to_folder_skips_incompatible_file_and_logs_file_name
         paths=[Path("./tests/data/upload_folder")],
         blocking=True,
         timeout_s=300,
-        desired_file_types=SUPPORTED_TYPE_SUFFIXES,
+        desired_file_types=[".csv", ".docx", ".html", ".json", ".md", ".txt", ".pdf", ".pptx", ".xlsx", ".xml"],
     )
     skip_log_line = next((log for log in cap_logs if log.get("event", None) == "Skipping file"), None)
     assert skip_log_line is not None

@@ -1022,9 +1019,6 @@ async def test_validate_file_paths(self, file_paths: List[Path], monkeypatch: MonkeyPatch
     @pytest.mark.parametrize(
         "file_paths",
         [
-            [Path("/home/user/.DS_Store")],
-            [Path("/home/user/file2.jpg")],
-            [Path("/home/user/file1.exe")],
             [Path("/home/user/file1.pdf"), Path("/home/user/file2.pdf.meta.json")],
             [Path("/home/user/file1.pdf"), Path("/home/user/file1.txt.meta.json")],
             [Path("/home/user/file1.txt"), Path("/home/user/file1.pdf.meta.json")],

@@ -1109,28 +1103,6 @@ def test_no_spinner_does_not_cause_error(self) -> None:
             assert False, f"No error should have been thrown but got error of type '{type(e).__name__}'"


-class TestGetAllowedFileTypes:
-    @pytest.mark.parametrize("input", [[], None])
-    def test_get_allowed_file_types_empty_values(self, input: List[object] | None) -> None:
-        file_types = FilesService._get_allowed_file_types(input)
-        assert file_types == SUPPORTED_TYPE_SUFFIXES
-
-    def test_get_allowed_file_types(self) -> None:
-        desired = [".pdf", ".txt", ".xml"]
-        file_types = sorted(FilesService._get_allowed_file_types(desired))
-        assert file_types == desired
-
-    def test_get_allowed_file_types_unsupported_types(self) -> None:
-        desired = [".pdf", ".foo", "jpg", 2]
-        file_types = sorted(FilesService._get_allowed_file_types(desired))
-        assert file_types == [".pdf"]
-
-    def test_get_allowed_file_types_manages_formatting(self) -> None:
-        desired = [".pdf", "txt", "xml", "XML", "PDF"]
-        file_types = sorted(FilesService._get_allowed_file_types(desired))
-        assert file_types == [".pdf", ".txt", ".xml"]
-
-
 class TestGetFilePaths:
     def test_directories_excluded_from_path_recursive(self) -> None:
         paths = [Path("tests/data/upload_folder_nested")]
4 changes: 2 additions & 2 deletions tests/unit/test_cli.py

@@ -69,7 +69,7 @@ def test_upload_only_desired_file_types_defaults_to_text(self, async_upload_mock: AsyncMock
         timeout_s=None,
         show_progress=True,
         recursive=False,
-        desired_file_types=[".txt", ".pdf"],
+        desired_file_types=None,
         enable_parallel_processing=True,
         safe_mode=False,
     )

@@ -132,7 +132,7 @@ def test_upload_safe_mode(self, async_upload_mock: AsyncMock) -> None:
         timeout_s=None,
         show_progress=True,
         recursive=False,
-        desired_file_types=[".txt", ".pdf"],
+        desired_file_types=None,
         enable_parallel_processing=False,
         safe_mode=True,
     )
tests/unit/workflows/async_client/test_files.py (path inferred from the imports below)

@@ -19,7 +19,6 @@
     WriteMode,
 )
 from deepset_cloud_sdk._service.files_service import FilesService
-from deepset_cloud_sdk._utils.constants import SUPPORTED_TYPE_SUFFIXES
 from deepset_cloud_sdk.models import DeepsetCloudFile, UserInfo
 from deepset_cloud_sdk.workflows.async_client.files import (
     download,

@@ -69,7 +68,7 @@ async def test_upload(self, monkeypatch: MonkeyPatch) -> None:
         timeout_s=None,
         show_progress=True,
         recursive=False,
-        desired_file_types=SUPPORTED_TYPE_SUFFIXES,
+        desired_file_types=None,
         enable_parallel_processing=False,
     )

@@ -87,7 +86,7 @@ async def test_upload_with_timeout(self, monkeypatch: MonkeyPatch) -> None:
         timeout_s=123,
         show_progress=True,
         recursive=False,
-        desired_file_types=SUPPORTED_TYPE_SUFFIXES,
+        desired_file_types=None,
         enable_parallel_processing=False,
     )
tests/unit/workflows/sync_client/test_files.py (path inferred from the imports below)

@@ -14,7 +14,6 @@
     UploadSessionWriteModeEnum,
     WriteMode,
 )
-from deepset_cloud_sdk._utils.constants import SUPPORTED_TYPE_SUFFIXES
 from deepset_cloud_sdk.models import DeepsetCloudFile, UserInfo
 from deepset_cloud_sdk.workflows.sync_client.files import (
     download,

@@ -39,7 +38,7 @@ def test_upload_folder(async_upload_mock: AsyncMock) -> None:
         timeout_s=None,
         show_progress=True,
         recursive=False,
-        desired_file_types=SUPPORTED_TYPE_SUFFIXES,
+        desired_file_types=None,
         enable_parallel_processing=True,
         safe_mode=False,
     )

@@ -58,7 +57,7 @@ def test_upload_folder_safe_mode(async_upload_mock: AsyncMock) -> None:
         timeout_s=None,
         show_progress=True,
         recursive=False,
-        desired_file_types=SUPPORTED_TYPE_SUFFIXES,
+        desired_file_types=None,
         enable_parallel_processing=True,
         safe_mode=True,
     )