9 changes: 4 additions & 5 deletions openml/_api_calls.py
@@ -473,18 +473,17 @@ def __parse_server_exception(
     code = int(server_error["oml:code"])
     message = server_error["oml:message"]
     additional_information = server_error.get("oml:additional_information")
-    if code in [372, 512, 500, 482, 542, 674]:
+    if code in [111, 372, 512, 500, 482, 542, 674]:
         if additional_information:
             full_message = f"{message} - {additional_information}"
         else:
             full_message = message

         # 512 for runs, 372 for datasets, 500 for flows
         # 482 for tasks, 542 for evaluations, 674 for setups
-        return OpenMLServerNoResult(
-            code=code,
-            message=full_message,
-        )
+        # 111 for dataset descriptions
+        return OpenMLServerNoResult(code=code, message=full_message, url=url)

     # 163: failure to validate flow XML (https://www.openml.org/api_docs#!/flow/post_flow)
     if code in [163] and file_elements is not None and "description" in file_elements:
         # file_elements['description'] is the XML file description of the flow
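In effect, a server reply with error code 111 (unknown dataset description) is now translated into OpenMLServerNoResult rather than the generic OpenMLServerException, so callers can handle "no such dataset" separately. A minimal sketch of the resulting behavior (the ID is the deliberately invalid one used in the new test below; the code and message attributes are assumed to carry the server's error code and text, as they do for the other no-result codes):

import openml
from openml.exceptions import OpenMLServerException, OpenMLServerNoResult

try:
    openml.datasets.get_dataset(123819023109238)  # ID assumed not to exist on the server
except OpenMLServerNoResult as err:
    # Code 111 (unknown dataset description) now lands here, like the other
    # "empty result" codes (372, 512, 500, 482, 542, 674).
    print(f"No such dataset: code={err.code}, message={err.message}")
except OpenMLServerException as err:
    # Any other server-side error still surfaces as the generic exception.
    raise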
139 changes: 40 additions & 99 deletions tests/test_datasets/test_dataset_functions.py
@@ -43,6 +43,7 @@
     OpenMLNotAuthorizedError,
     OpenMLPrivateDatasetError,
     OpenMLServerException,
+    OpenMLServerNoResult,
 )
 from openml.tasks import TaskType, create_task
 from openml.testing import TestBase, create_request_response
@@ -274,9 +275,7 @@ def test_get_dataset_cannot_access_private_data(self):
     @pytest.mark.skip("Need to find dataset name of private dataset")
     def test_dataset_by_name_cannot_access_private_data(self):
         openml.config.server = self.production_server
-        self.assertRaises(
-            OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE"
-        )
+        self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE")

     def test_get_dataset_lazy_all_functions(self):
         """Test that all expected functionality is available without downloading the dataset."""
@@ -285,9 +284,7 @@ def test_get_dataset_lazy_all_functions(self):

         def ensure_absence_of_real_data():
             assert not os.path.exists(
-                os.path.join(
-                    openml.config.get_cache_directory(), "datasets", "1", "dataset.arff"
-                )
+                os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")
             )

         tag = "test_lazy_tag_%d" % random.randint(1, 1000000)
@@ -509,12 +506,8 @@ def test_deletion_of_cache_dir(self):
     @mock.patch("openml.datasets.functions._get_dataset_description")
     def test_deletion_of_cache_dir_faulty_download(self, patch):
         patch.side_effect = Exception("Boom!")
-        self.assertRaisesRegex(
-            Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1
-        )
-        datasets_cache_dir = os.path.join(
-            self.workdir, "org", "openml", "test", "datasets"
-        )
+        self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1)
+        datasets_cache_dir = os.path.join(self.workdir, "org", "openml", "test", "datasets")
         assert len(os.listdir(datasets_cache_dir)) == 0

     def test_publish_dataset(self):
@@ -555,9 +548,7 @@ def test__retrieve_class_labels(self):
         # Test workaround for string-typed class labels
         custom_ds = openml.datasets.get_dataset(2)
         custom_ds.features[31].data_type = "string"
-        labels = custom_ds.retrieve_class_labels(
-            target_name=custom_ds.features[31].name
-        )
+        labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name)
         assert labels == ["COIL", "SHEET"]

     def test_upload_dataset_with_url(self):
@@ -600,9 +591,7 @@ def test_data_status(self):
         )
         dataset.publish()
         TestBase._mark_entity_for_removal("data", dataset.id)
-        TestBase.logger.info(
-            "collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
-        )
+        TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
         did = dataset.id

         # admin key for test server (only adminds can activate datasets.
@@ -678,8 +667,7 @@ def test_attributes_arff_from_df_unknown_dtype(self):
         for arr, dt in zip(data, dtype):
             df = pd.DataFrame(arr)
             err_msg = (
-                f"The dtype '{dt}' of the column '0' is not currently "
-                "supported by liac-arff"
+                f"The dtype '{dt}' of the column '0' is not currently " "supported by liac-arff"
             )
             with pytest.raises(ValueError, match=err_msg):
                 attributes_arff_from_df(df)
@@ -710,16 +698,12 @@ def test_create_dataset_numpy(self):

         dataset.publish()
         TestBase._mark_entity_for_removal("data", dataset.id)
-        TestBase.logger.info(
-            "collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
-        )
+        TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))

         assert (
             _get_online_dataset_arff(dataset.id) == dataset._dataset
         ), "Uploaded arff does not match original one"
-        assert (
-            _get_online_dataset_format(dataset.id) == "arff"
-        ), "Wrong format for dataset"
+        assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"

     def test_create_dataset_list(self):
         data = [
@@ -769,15 +753,11 @@ def test_create_dataset_list(self):

         dataset.publish()
         TestBase._mark_entity_for_removal("data", dataset.id)
-        TestBase.logger.info(
-            "collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
-        )
+        TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
         assert (
             _get_online_dataset_arff(dataset.id) == dataset._dataset
         ), "Uploaded ARFF does not match original one"
-        assert (
-            _get_online_dataset_format(dataset.id) == "arff"
-        ), "Wrong format for dataset"
+        assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"

     def test_create_dataset_sparse(self):
         # test the scipy.sparse.coo_matrix
@@ -974,9 +954,7 @@ def test_create_dataset_pandas(self):
         )
         dataset.publish()
         TestBase._mark_entity_for_removal("data", dataset.id)
-        TestBase.logger.info(
-            "collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
-        )
+        TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
         assert (
             _get_online_dataset_arff(dataset.id) == dataset._dataset
         ), "Uploaded ARFF does not match original one"
@@ -991,9 +969,7 @@ def test_create_dataset_pandas(self):
         column_names = ["input1", "input2", "y"]
         df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names)
         # meta-information
-        description = (
-            "Synthetic dataset created from a Pandas DataFrame with Sparse columns"
-        )
+        description = "Synthetic dataset created from a Pandas DataFrame with Sparse columns"
         dataset = openml.datasets.functions.create_dataset(
             name=name,
             description=description,
@@ -1014,15 +990,11 @@
         )
         dataset.publish()
         TestBase._mark_entity_for_removal("data", dataset.id)
-        TestBase.logger.info(
-            "collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
-        )
+        TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
         assert (
             _get_online_dataset_arff(dataset.id) == dataset._dataset
         ), "Uploaded ARFF does not match original one"
-        assert (
-            _get_online_dataset_format(dataset.id) == "sparse_arff"
-        ), "Wrong format for dataset"
+        assert _get_online_dataset_format(dataset.id) == "sparse_arff", "Wrong format for dataset"

         # Check that we can overwrite the attributes
         data = [["a"], ["b"], ["c"], ["d"], ["e"]]
@@ -1050,13 +1022,9 @@ def test_create_dataset_pandas(self):
         )
         dataset.publish()
         TestBase._mark_entity_for_removal("data", dataset.id)
-        TestBase.logger.info(
-            "collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
-        )
+        TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
         downloaded_data = _get_online_dataset_arff(dataset.id)
-        assert (
-            downloaded_data == dataset._dataset
-        ), "Uploaded ARFF does not match original one"
+        assert downloaded_data == dataset._dataset, "Uploaded ARFF does not match original one"
         assert "@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}" in downloaded_data

     def test_ignore_attributes_dataset(self):
@@ -1217,9 +1185,7 @@ def test_publish_fetch_ignore_attribute(self):
         # publish dataset
         dataset.publish()
         TestBase._mark_entity_for_removal("data", dataset.id)
-        TestBase.logger.info(
-            "collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
-        )
+        TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
         # test if publish was successful
         assert isinstance(dataset.id, int)

@@ -1403,9 +1369,7 @@ def test_get_dataset_cache_format_feather(self):
         cache_dir = openml.config.get_cache_directory()
         cache_dir_for_id = os.path.join(cache_dir, "datasets", "128")
         feather_file = os.path.join(cache_dir_for_id, "dataset.feather")
-        pickle_file = os.path.join(
-            cache_dir_for_id, "dataset.feather.attributes.pkl.py3"
-        )
+        pickle_file = os.path.join(cache_dir_for_id, "dataset.feather.attributes.pkl.py3")
         data = pd.read_feather(feather_file)
         assert os.path.isfile(feather_file), "Feather file is missing"
         assert os.path.isfile(pickle_file), "Attributes pickle file is missing"
@@ -1450,19 +1414,15 @@ def test_data_edit_critical_field(self):
         # for this, we need to first clone a dataset to do changes
         did = fork_dataset(1)
         self._wait_for_dataset_being_processed(did)
-        result = edit_dataset(
-            did, default_target_attribute="shape", ignore_attribute="oil"
-        )
+        result = edit_dataset(did, default_target_attribute="shape", ignore_attribute="oil")
         assert did == result

         n_tries = 10
         # we need to wait for the edit to be reflected on the server
         for i in range(n_tries):
             edited_dataset = openml.datasets.get_dataset(did)
             try:
-                assert (
-                    edited_dataset.default_target_attribute == "shape"
-                ), edited_dataset
+                assert edited_dataset.default_target_attribute == "shape", edited_dataset
                 assert edited_dataset.ignore_attribute == ["oil"], edited_dataset
                 break
             except AssertionError as e:
@@ -1471,9 +1431,7 @@ def test_data_edit_critical_field(self):
                 time.sleep(10)
                 # Delete the cache dir to get the newer version of the dataset
                 shutil.rmtree(
-                    os.path.join(
-                        self.workdir, "org", "openml", "test", "datasets", str(did)
-                    ),
+                    os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did)),
                 )

     def test_data_edit_requires_field(self):
@@ -1564,9 +1522,7 @@ def test_list_datasets_with_high_size_parameter(self):
         openml.config.server = self.production_server

         datasets_a = openml.datasets.list_datasets(output_format="dataframe")
-        datasets_b = openml.datasets.list_datasets(
-            output_format="dataframe", size=np.inf
-        )
+        datasets_b = openml.datasets.list_datasets(output_format="dataframe", size=np.inf)

         # Reverting to test server
         openml.config.server = self.test_server
@@ -1646,9 +1602,7 @@ def test_invalid_attribute_validations(
         (None, None, ["outlook", "windy"]),
     ],
 )
-def test_valid_attribute_validations(
-    default_target_attribute, row_id_attribute, ignore_attribute
-):
+def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute):
     data = [
         ["a", "sunny", 85.0, 85.0, "FALSE", "no"],
         ["b", "sunny", 80.0, 90.0, "TRUE", "no"],
@@ -1749,10 +1703,7 @@ def test_delete_dataset(self):
 def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_key):
     openml.config.start_using_configuration_for_example()
     content_file = (
-        test_files_directory
-        / "mock_responses"
-        / "datasets"
-        / "data_delete_not_owned.xml"
+        test_files_directory / "mock_responses" / "datasets" / "data_delete_not_owned.xml"
     )
     mock_delete.return_value = create_request_response(
         status_code=412,
@@ -1774,10 +1725,7 @@ def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_key):
 def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key):
     openml.config.start_using_configuration_for_example()
     content_file = (
-        test_files_directory
-        / "mock_responses"
-        / "datasets"
-        / "data_delete_has_tasks.xml"
+        test_files_directory / "mock_responses" / "datasets" / "data_delete_has_tasks.xml"
     )
     mock_delete.return_value = create_request_response(
         status_code=412,
@@ -1799,10 +1747,7 @@ def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key):
 def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key):
     openml.config.start_using_configuration_for_example()
     content_file = (
-        test_files_directory
-        / "mock_responses"
-        / "datasets"
-        / "data_delete_successful.xml"
+        test_files_directory / "mock_responses" / "datasets" / "data_delete_successful.xml"
     )
     mock_delete.return_value = create_request_response(
         status_code=200,
@@ -1821,10 +1766,7 @@ def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key):
 def test_delete_unknown_dataset(mock_delete, test_files_directory, test_api_key):
     openml.config.start_using_configuration_for_example()
     content_file = (
-        test_files_directory
-        / "mock_responses"
-        / "datasets"
-        / "data_delete_not_exist.xml"
+        test_files_directory / "mock_responses" / "datasets" / "data_delete_not_exist.xml"
     )
     mock_delete.return_value = create_request_response(
         status_code=412,
@@ -1861,9 +1803,7 @@ def test_list_datasets(all_datasets: pd.DataFrame):


 def test_list_datasets_by_tag(all_datasets: pd.DataFrame):
-    tag_datasets = openml.datasets.list_datasets(
-        tag="study_14", output_format="dataframe"
-    )
+    tag_datasets = openml.datasets.list_datasets(tag="study_14", output_format="dataframe")
     assert 0 < len(tag_datasets) < len(all_datasets)
     _assert_datasets_have_id_and_valid_status(tag_datasets)

@@ -2001,15 +1941,16 @@ def test_get_dataset_lazy_behavior(
         with_features=with_features,
         with_data=with_data,
     )
-    assert (
-        dataset.features
-    ), "Features should be downloaded on-demand if not during get_dataset"
-    assert (
-        dataset.qualities
-    ), "Qualities should be downloaded on-demand if not during get_dataset"
-    assert (
-        dataset.get_data()
-    ), "Data should be downloaded on-demand if not during get_dataset"
+    assert dataset.features, "Features should be downloaded on-demand if not during get_dataset"
+    assert dataset.qualities, "Qualities should be downloaded on-demand if not during get_dataset"
+    assert dataset.get_data(), "Data should be downloaded on-demand if not during get_dataset"
     _assert_datasets_retrieved_successfully(
         [1], with_qualities=True, with_features=True, with_data=True
     )
+
+
+def test_get_dataset_with_invalid_id() -> None:
+    INVALID_ID = 123819023109238  # Well, at some point this will probably be valid...
+    with pytest.raises(OpenMLServerNoResult, match="Unknown dataset") as e:
+        openml.datasets.get_dataset(INVALID_ID)
+    assert e.value.code == 111
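As a sanity check, the new test can be run on its own, e.g. with pytest tests/test_datasets/test_dataset_functions.py -k invalid_id (assuming a standard pytest setup pointed at the test server).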