Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
e0af15e
Making some unit tests work
Neeratyoy Nov 10, 2020
14aa11d
Waiting for dataset to be processed
Neeratyoy Nov 16, 2020
31d48d8
Minor test collection fix
Neeratyoy Nov 16, 2020
431447c
Template to handle missing tasks
Neeratyoy Nov 30, 2020
cc3199e
Accounting for more missing tasks:
Neeratyoy Nov 30, 2020
8a29668
Fixing some more unit tests
Neeratyoy Nov 30, 2020
405e03c
Simplifying check_task_existence
Neeratyoy Nov 30, 2020
caf4f46
black changes
Neeratyoy Dec 4, 2020
b308e71
Minor formatting
Neeratyoy Dec 8, 2020
436a9fe
Handling task exists check
Neeratyoy Dec 9, 2020
ddd8b04
Testing edited check task func
Neeratyoy Dec 14, 2020
74ae622
Merge branch 'fix_unit_tests' of https://github.com/openml/openml-pyt…
Neeratyoy Dec 14, 2020
50ce90e
Flake fix
Neeratyoy Dec 15, 2020
56cd639
More retries on connection error
Neeratyoy Dec 16, 2020
8e8ea2e
Adding max_retries to config default
Neeratyoy Dec 17, 2020
d518beb
Update database retry unit test
Neeratyoy Dec 17, 2020
37d9f6b
Print to debug hash exception
Neeratyoy Dec 17, 2020
9bd4892
Fixing checksum unit test
Neeratyoy Dec 17, 2020
dc41b5d
Retry on _download_text_file
Neeratyoy Dec 18, 2020
396cb8d
Update datasets_tutorial.py
mfeurer Dec 21, 2020
8f380de
Update custom_flow_tutorial.py
mfeurer Dec 21, 2020
bc1745e
Update test_study_functions.py
mfeurer Dec 21, 2020
d95b5e6
Update test_dataset_functions.py
mfeurer Dec 21, 2020
91c6cf5
more retries, but also more time between retries
mfeurer Dec 21, 2020
a9430b3
allow for even more retries on get calls
mfeurer Dec 21, 2020
e9cfba8
Catching failed get task
Neeratyoy Dec 21, 2020
c13f6ce
Merge branch 'fix_unit_tests' of https://github.com/openml/openml-pyt…
Neeratyoy Dec 21, 2020
3d7abc2
undo stupid change
mfeurer Dec 21, 2020
94576b1
Merge branch 'fix_unit_tests' of https://github.com/openml/openml-pyt…
Neeratyoy Dec 21, 2020
b5e1242
fix one more test
mfeurer Dec 21, 2020
f5e4a3e
Refactoring md5 hash check inside _send_request
Neeratyoy Dec 21, 2020
07ce722
Fixing a fairly common unit test fail
Neeratyoy Dec 22, 2020
82e1b72
Reverting loose check on unit test
Neeratyoy Dec 23, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions examples/30_extended/custom_flow_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,10 @@
# This allows people to specify auto-sklearn hyperparameters used in this flow.
# In general, using a subflow is not required.
#
# Note: flow 15275 is not actually the right flow on the test server,
# Note: flow 9313 is not actually the right flow on the test server,
# but that does not matter for this demonstration.

autosklearn_flow = openml.flows.get_flow(15275) # auto-sklearn 0.5.1
autosklearn_flow = openml.flows.get_flow(9313) # auto-sklearn 0.5.1
subflow = dict(components=OrderedDict(automl_tool=autosklearn_flow),)

####################################################################################################
Expand Down Expand Up @@ -120,7 +120,7 @@
OrderedDict([("oml:name", "time"), ("oml:value", 120), ("oml:component", flow_id)]),
]

task_id = 1408 # Iris Task
task_id = 1965 # Iris Task
task = openml.tasks.get_task(task_id)
dataset_id = task.get_dataset().dataset_id

Expand Down
15 changes: 10 additions & 5 deletions examples/30_extended/datasets_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@

############################################################################
# Edit a created dataset
# =================================================
# ======================
# This example uses the test server, to avoid editing a dataset on the main server.
openml.config.start_using_configuration_for_example()
############################################################################
Expand Down Expand Up @@ -143,18 +143,23 @@
# tasks associated with it. To edit critical fields of a dataset (without tasks) owned by you,
# configure the API key:
# openml.config.apikey = 'FILL_IN_OPENML_API_KEY'
data_id = edit_dataset(564, default_target_attribute="y")
print(f"Edited dataset ID: {data_id}")

# This example here only shows a failure when trying to work on a dataset not owned by you:
try:
data_id = edit_dataset(1, default_target_attribute="shape")
except openml.exceptions.OpenMLServerException as e:
print(e)

############################################################################
# Fork dataset
# ============
# Used to create a copy of the dataset with you as the owner.
# Use this API only if you are unable to edit the critical fields (default_target_attribute,
# ignore_attribute, row_id_attribute) of a dataset through the edit_dataset API.
# After the dataset is forked, you can edit the new version of the dataset using edit_dataset.

data_id = fork_dataset(564)
data_id = fork_dataset(1)
print(data_id)
data_id = edit_dataset(data_id, default_target_attribute="shape")
print(f"Forked dataset ID: {data_id}")

openml.config.stop_using_configuration_for_example()
72 changes: 43 additions & 29 deletions openml/_api_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import hashlib
import logging
import requests
import xml
import xmltodict
from typing import Dict, Optional

Expand Down Expand Up @@ -105,20 +106,9 @@ def _download_text_file(

logging.info("Starting [%s] request for the URL %s", "get", source)
start = time.time()
response = __read_url(source, request_method="get")
response = __read_url(source, request_method="get", md5_checksum=md5_checksum)
downloaded_file = response.text

if md5_checksum is not None:
md5 = hashlib.md5()
md5.update(downloaded_file.encode("utf-8"))
md5_checksum_download = md5.hexdigest()
if md5_checksum != md5_checksum_download:
raise OpenMLHashException(
"Checksum {} of downloaded file is unequal to the expected checksum {}.".format(
md5_checksum_download, md5_checksum
)
)

if output_path is None:
logging.info(
"%.7fs taken for [%s] request for the URL %s", time.time() - start, "get", source,
Expand Down Expand Up @@ -163,22 +153,33 @@ def _read_url_files(url, data=None, file_elements=None):
return response


def __read_url(url, request_method, data=None):
def __read_url(url, request_method, data=None, md5_checksum=None):
    """Issue an API call to ``url``, attaching the configured API key.

    Note: when an API key is configured, it is inserted into ``data``
    in place (the caller's dict is mutated).
    """
    if data is None:
        data = {}
    if config.apikey is not None:
        data["api_key"] = config.apikey
    return _send_request(
        request_method=request_method, url=url, data=data, md5_checksum=md5_checksum
    )


return _send_request(request_method=request_method, url=url, data=data)
def __is_checksum_equal(downloaded_file, md5_checksum=None):
if md5_checksum is None:
return True
md5 = hashlib.md5()
md5.update(downloaded_file.encode("utf-8"))
md5_checksum_download = md5.hexdigest()
if md5_checksum == md5_checksum_download:
return True
return False


def _send_request(
request_method, url, data, files=None,
):
n_retries = config.connection_n_retries
def _send_request(request_method, url, data, files=None, md5_checksum=None):
n_retries = max(1, min(config.connection_n_retries, config.max_retries))

response = None
with requests.Session() as session:
# Start at one to have a non-zero multiplier for the sleep
for i in range(1, n_retries + 1):
for retry_counter in range(1, n_retries + 1):
try:
if request_method == "get":
response = session.get(url, params=data)
Expand All @@ -189,25 +190,36 @@ def _send_request(
else:
raise NotImplementedError()
__check_response(response=response, url=url, file_elements=files)
if request_method == "get" and not __is_checksum_equal(response.text, md5_checksum):
raise OpenMLHashException(
"Checksum of downloaded file is unequal to the expected checksum {} "
"when downloading {}.".format(md5_checksum, url)
)
break
except (
requests.exceptions.ConnectionError,
requests.exceptions.SSLError,
OpenMLServerException,
xml.parsers.expat.ExpatError,
OpenMLHashException,
) as e:
if isinstance(e, OpenMLServerException):
if e.code != 107:
# 107 is a database connection error - only then do retries
if e.code not in [107, 500]:
# 107: database connection error
# 500: internal server error
raise
else:
wait_time = 0.3
else:
wait_time = 0.1
if i == n_retries:
raise e
elif isinstance(e, xml.parsers.expat.ExpatError):
if request_method != "get" or retry_counter >= n_retries:
raise OpenMLServerError(
"Unexpected server error when calling {}. Please contact the "
"developers!\nStatus code: {}\n{}".format(
url, response.status_code, response.text,
)
)
if retry_counter >= n_retries:
raise
else:
time.sleep(wait_time * i)
continue
time.sleep(retry_counter)
if response is None:
raise ValueError("This should never happen!")
return response
Expand All @@ -230,6 +242,8 @@ def __parse_server_exception(
raise OpenMLServerError("URI too long! ({})".format(url))
try:
server_exception = xmltodict.parse(response.text)
except xml.parsers.expat.ExpatError:
raise
except Exception:
# OpenML has a sophisticated error system
# where information about failures is provided. try to parse this
Expand Down
12 changes: 8 additions & 4 deletions openml/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ def set_file_log_level(file_output_level: int):
"server": "https://www.openml.org/api/v1/xml",
"cachedir": os.path.expanduser(os.path.join("~", ".openml", "cache")),
"avoid_duplicate_runs": "True",
"connection_n_retries": 2,
"connection_n_retries": 10,
"max_retries": 20,
}

config_file = os.path.expanduser(os.path.join("~", ".openml", "config"))
Expand Down Expand Up @@ -116,6 +117,7 @@ def get_server_base_url() -> str:

# Number of retries if the connection breaks
connection_n_retries = _defaults["connection_n_retries"]
max_retries = _defaults["max_retries"]


class ConfigurationForExamples:
Expand Down Expand Up @@ -183,6 +185,7 @@ def _setup():
global cache_directory
global avoid_duplicate_runs
global connection_n_retries
global max_retries

# read config file, create cache directory
try:
Expand All @@ -207,10 +210,11 @@ def _setup():

avoid_duplicate_runs = config.getboolean("FAKE_SECTION", "avoid_duplicate_runs")
connection_n_retries = config.get("FAKE_SECTION", "connection_n_retries")
if connection_n_retries > 20:
max_retries = config.get("FAKE_SECTION", "max_retries")
if connection_n_retries > max_retries:
raise ValueError(
"A higher number of retries than 20 is not allowed to keep the "
"server load reasonable"
"A higher number of retries than {} is not allowed to keep the "
"server load reasonable".format(max_retries)
)


Expand Down
55 changes: 53 additions & 2 deletions openml/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
import shutil
import sys
import time
from typing import Dict
from typing import Dict, Union, cast
import unittest
import warnings
import pandas as pd

# Currently, importing oslo raises a lot of warning that it will stop working
# under python3.8; remove this once they disappear
Expand All @@ -18,6 +19,7 @@

import openml
from openml.tasks import TaskType
from openml.exceptions import OpenMLServerException

import logging

Expand Down Expand Up @@ -252,6 +254,55 @@ def _check_fold_timing_evaluations(
self.assertLessEqual(evaluation, max_val)


def check_task_existence(
    task_type: TaskType, dataset_id: int, target_name: str, **kwargs
) -> Union[int, None]:
    """Check whether a task matching the given meta-data exists on the test server.

    Parameters
    ----------
    task_type : openml.tasks.TaskType
        Task type used to filter the task listing.
    dataset_id : int
        Id of the dataset the task must be defined on.
    target_name : str
        Name of the target feature the task must use.
    kwargs
        Additional task attributes that must match exactly, compared via
        ``getattr(task, key) == value``.

    Returns
    -------
    int or None
        Id of the first task matching all criteria, or None if none matches.
    """
    tasks = openml.tasks.list_tasks(task_type=task_type, output_format="dataframe")
    if len(tasks) == 0:
        return None
    tasks = cast(pd.DataFrame, tasks).loc[tasks["did"] == dataset_id]
    if len(tasks) == 0:
        return None
    tasks = tasks.loc[tasks["target_feature"] == target_name]
    if len(tasks) == 0:
        return None
    for task_id in tasks["tid"].to_list():
        try:
            task = openml.tasks.get_task(task_id)
        except OpenMLServerException:
            # The task may have been deleted by another unit test running in parallel.
            continue
        # First task whose attributes satisfy every extra criterion is the match;
        # this replaces the original append/pop bookkeeping with the same result.
        if all(getattr(task, key) == value for key, value in kwargs.items()):
            return task_id
    return None


try:
from sklearn.impute import SimpleImputer
except ImportError:
Expand All @@ -275,4 +326,4 @@ def cat(X):
return X.dtypes == "category"


__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "cat", "cont"]
__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "cat", "cont", "check_task_existence"]
1 change: 1 addition & 0 deletions openml/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from functools import wraps
import collections

import openml
import openml._api_calls
import openml.exceptions
from . import config
Expand Down
28 changes: 23 additions & 5 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
DATASETS_CACHE_DIR_NAME,
)
from openml.datasets import fork_dataset, edit_dataset
from openml.tasks import TaskType, create_task


class TestOpenMLDataset(TestBase):
Expand Down Expand Up @@ -414,9 +415,8 @@ def test__getarff_md5_issue(self):
}
self.assertRaisesRegex(
OpenMLHashException,
"Checksum ad484452702105cbf3d30f8deaba39a9 of downloaded file "
"is unequal to the expected checksum abc. "
"Raised when downloading dataset 5.",
"Checksum of downloaded file is unequal to the expected checksum abc when downloading "
"https://www.openml.org/data/download/61. Raised when downloading dataset 5.",
_get_dataset_arff,
description,
)
Expand Down Expand Up @@ -498,6 +498,7 @@ def test_upload_dataset_with_url(self):
)
self.assertIsInstance(dataset.dataset_id, int)

@pytest.mark.flaky()
def test_data_status(self):
dataset = OpenMLDataset(
"%s-UploadTestWithURL" % self._get_sentinel(),
Expand Down Expand Up @@ -1350,7 +1351,7 @@ def test_data_edit_errors(self):
"original_data_url, default_target_attribute, row_id_attribute, "
"ignore_attribute or paper_url to edit.",
edit_dataset,
data_id=564,
data_id=64, # blood-transfusion-service-center
)
# Check server exception when unknown dataset is provided
self.assertRaisesRegex(
Expand All @@ -1360,15 +1361,32 @@ def test_data_edit_errors(self):
data_id=999999,
description="xor operation dataset",
)

# Need to own a dataset to be able to edit meta-data
# Will be creating a forked version of an existing dataset to allow the unit test user
# to edit meta-data of a dataset
did = fork_dataset(1)
self._wait_for_dataset_being_processed(did)
TestBase._mark_entity_for_removal("data", did)
# Need to upload a task attached to this data to test edit failure
task = create_task(
task_type=TaskType.SUPERVISED_CLASSIFICATION,
dataset_id=did,
target_name="class",
estimation_procedure_id=1,
)
task = task.publish()
TestBase._mark_entity_for_removal("task", task.task_id)
# Check server exception when owner/admin edits critical fields of dataset with tasks
self.assertRaisesRegex(
OpenMLServerException,
"Critical features default_target_attribute, row_id_attribute and ignore_attribute "
"can only be edited for datasets without any tasks.",
edit_dataset,
data_id=223,
data_id=did,
default_target_attribute="y",
)

# Check server exception when a non-owner or non-admin tries to edit critical fields
self.assertRaisesRegex(
OpenMLServerException,
Expand Down
Loading