Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
e0af15e
Making some unit tests work
Neeratyoy Nov 10, 2020
14aa11d
Waiting for dataset to be processed
Neeratyoy Nov 16, 2020
31d48d8
Minor test collection fix
Neeratyoy Nov 16, 2020
431447c
Template to handle missing tasks
Neeratyoy Nov 30, 2020
cc3199e
Accounting for more missing tasks:
Neeratyoy Nov 30, 2020
8a29668
Fixing some more unit tests
Neeratyoy Nov 30, 2020
405e03c
Simplifying check_task_existence
Neeratyoy Nov 30, 2020
caf4f46
black changes
Neeratyoy Dec 4, 2020
b308e71
Minor formatting
Neeratyoy Dec 8, 2020
436a9fe
Handling task exists check
Neeratyoy Dec 9, 2020
ddd8b04
Testing edited check task func
Neeratyoy Dec 14, 2020
74ae622
Merge branch 'fix_unit_tests' of https://github.com/openml/openml-pyt…
Neeratyoy Dec 14, 2020
50ce90e
Flake fix
Neeratyoy Dec 15, 2020
56cd639
More retries on connection error
Neeratyoy Dec 16, 2020
8e8ea2e
Adding max_retries to config default
Neeratyoy Dec 17, 2020
d518beb
Update database retry unit test
Neeratyoy Dec 17, 2020
37d9f6b
Print to debug hash exception
Neeratyoy Dec 17, 2020
9bd4892
Fixing checksum unit test
Neeratyoy Dec 17, 2020
dc41b5d
Retry on _download_text_file
Neeratyoy Dec 18, 2020
396cb8d
Update datasets_tutorial.py
mfeurer Dec 21, 2020
8f380de
Update custom_flow_tutorial.py
mfeurer Dec 21, 2020
bc1745e
Update test_study_functions.py
mfeurer Dec 21, 2020
d95b5e6
Update test_dataset_functions.py
mfeurer Dec 21, 2020
91c6cf5
more retries, but also more time between retries
mfeurer Dec 21, 2020
a9430b3
allow for even more retries on get calls
mfeurer Dec 21, 2020
e9cfba8
Catching failed get task
Neeratyoy Dec 21, 2020
c13f6ce
Merge branch 'fix_unit_tests' of https://github.com/openml/openml-pyt…
Neeratyoy Dec 21, 2020
3d7abc2
undo stupid change
mfeurer Dec 21, 2020
94576b1
Merge branch 'fix_unit_tests' of https://github.com/openml/openml-pyt…
Neeratyoy Dec 21, 2020
b5e1242
fix one more test
mfeurer Dec 21, 2020
f5e4a3e
Refactoring md5 hash check inside _send_request
Neeratyoy Dec 21, 2020
07ce722
Fixing a fairly common unit test fail
Neeratyoy Dec 22, 2020
82e1b72
Reverting loose check on unit test
Neeratyoy Dec 23, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions examples/30_extended/custom_flow_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,10 @@
# This allows people to specify auto-sklearn hyperparameters used in this flow.
# In general, using a subflow is not required.
#
# Note: flow 15275 is not actually the right flow on the test server,
# Note: flow 9313 is not actually the right flow on the test server,
# but that does not matter for this demonstration.

autosklearn_flow = openml.flows.get_flow(15275) # auto-sklearn 0.5.1
autosklearn_flow = openml.flows.get_flow(9313) # auto-sklearn 0.5.1
subflow = dict(components=OrderedDict(automl_tool=autosklearn_flow),)

####################################################################################################
Expand Down Expand Up @@ -120,7 +120,7 @@
OrderedDict([("oml:name", "time"), ("oml:value", 120), ("oml:component", flow_id)]),
]

task_id = 1408 # Iris Task
task_id = 1965 # Iris Task
task = openml.tasks.get_task(task_id)
dataset_id = task.get_dataset().dataset_id

Expand Down
15 changes: 10 additions & 5 deletions examples/30_extended/datasets_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@

############################################################################
# Edit a created dataset
# =================================================
# ======================
# This example uses the test server, to avoid editing a dataset on the main server.
openml.config.start_using_configuration_for_example()
############################################################################
Expand Down Expand Up @@ -143,18 +143,23 @@
# tasks associated with it. To edit critical fields of a dataset (without tasks) owned by you,
# configure the API key:
# openml.config.apikey = 'FILL_IN_OPENML_API_KEY'
data_id = edit_dataset(564, default_target_attribute="y")
print(f"Edited dataset ID: {data_id}")

# This example here only shows a failure when trying to work on a dataset not owned by you:
try:
data_id = edit_dataset(1, default_target_attribute="shape")
except openml.exceptions.OpenMLServerException as e:
print(e)

############################################################################
# Fork dataset
# ============
# Used to create a copy of the dataset with you as the owner.
# Use this API only if you are unable to edit the critical fields (default_target_attribute,
# ignore_attribute, row_id_attribute) of a dataset through the edit_dataset API.
# After the dataset is forked, you can edit the new version of the dataset using edit_dataset.

data_id = fork_dataset(564)
data_id = fork_dataset(1)
print(data_id)
data_id = edit_dataset(data_id, default_target_attribute="shape")
print(f"Forked dataset ID: {data_id}")

openml.config.stop_using_configuration_for_example()
72 changes: 43 additions & 29 deletions openml/_api_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import hashlib
import logging
import requests
import xml
import xmltodict
from typing import Dict, Optional

Expand Down Expand Up @@ -105,20 +106,9 @@ def _download_text_file(

logging.info("Starting [%s] request for the URL %s", "get", source)
start = time.time()
response = __read_url(source, request_method="get")
response = __read_url(source, request_method="get", md5_checksum=md5_checksum)
downloaded_file = response.text

if md5_checksum is not None:
md5 = hashlib.md5()
md5.update(downloaded_file.encode("utf-8"))
md5_checksum_download = md5.hexdigest()
if md5_checksum != md5_checksum_download:
raise OpenMLHashException(
"Checksum {} of downloaded file is unequal to the expected checksum {}.".format(
md5_checksum_download, md5_checksum
)
)

if output_path is None:
logging.info(
"%.7fs taken for [%s] request for the URL %s", time.time() - start, "get", source,
Expand Down Expand Up @@ -163,22 +153,33 @@ def _read_url_files(url, data=None, file_elements=None):
return response


def __read_url(url, request_method, data=None):
def __read_url(url, request_method, data=None, md5_checksum=None):
    """Issue an API call to ``url``, attaching the configured API key.

    Note: when an API key is configured, it is inserted into ``data``
    in place (the caller's dict is mutated).
    """
    if data is None:
        data = {}
    if config.apikey is not None:
        data["api_key"] = config.apikey
    return _send_request(
        request_method=request_method, url=url, data=data, md5_checksum=md5_checksum
    )


return _send_request(request_method=request_method, url=url, data=data)
def __is_checksum_equal(downloaded_file, md5_checksum=None):
if md5_checksum is None:
return True
md5 = hashlib.md5()
md5.update(downloaded_file.encode("utf-8"))
md5_checksum_download = md5.hexdigest()
if md5_checksum == md5_checksum_download:
return True
return False


def _send_request(
request_method, url, data, files=None,
):
n_retries = config.connection_n_retries
def _send_request(request_method, url, data, files=None, md5_checksum=None):
n_retries = max(1, min(config.connection_n_retries, config.max_retries))

response = None
with requests.Session() as session:
# Start at one to have a non-zero multiplier for the sleep
for i in range(1, n_retries + 1):
for retry_counter in range(1, n_retries + 1):
try:
if request_method == "get":
response = session.get(url, params=data)
Expand All @@ -189,25 +190,36 @@ def _send_request(
else:
raise NotImplementedError()
__check_response(response=response, url=url, file_elements=files)
if request_method == "get" and not __is_checksum_equal(response.text, md5_checksum):
raise OpenMLHashException(
"Checksum of downloaded file is unequal to the expected checksum {} "
"when downloading {}.".format(md5_checksum, url)
)
break
except (
requests.exceptions.ConnectionError,
requests.exceptions.SSLError,
OpenMLServerException,
xml.parsers.expat.ExpatError,
OpenMLHashException,
) as e:
if isinstance(e, OpenMLServerException):
if e.code != 107:
# 107 is a database connection error - only then do retries
if e.code not in [107, 500]:
# 107: database connection error
# 500: internal server error
raise
else:
wait_time = 0.3
else:
wait_time = 0.1
if i == n_retries:
raise e
elif isinstance(e, xml.parsers.expat.ExpatError):
if request_method != "get" or retry_counter >= n_retries:
raise OpenMLServerError(
"Unexpected server error when calling {}. Please contact the "
"developers!\nStatus code: {}\n{}".format(
url, response.status_code, response.text,
)
)
if retry_counter >= n_retries:
raise
else:
time.sleep(wait_time * i)
continue
time.sleep(retry_counter)
if response is None:
raise ValueError("This should never happen!")
return response
Expand All @@ -230,6 +242,8 @@ def __parse_server_exception(
raise OpenMLServerError("URI too long! ({})".format(url))
try:
server_exception = xmltodict.parse(response.text)
except xml.parsers.expat.ExpatError:
raise
except Exception:
# OpenML has a sophisticated error system
# where information about failures is provided. try to parse this
Expand Down
12 changes: 8 additions & 4 deletions openml/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ def set_file_log_level(file_output_level: int):
"server": "https://www.openml.org/api/v1/xml",
"cachedir": os.path.expanduser(os.path.join("~", ".openml", "cache")),
"avoid_duplicate_runs": "True",
"connection_n_retries": 2,
"connection_n_retries": 10,
"max_retries": 20,
}

config_file = os.path.expanduser(os.path.join("~", ".openml", "config"))
Expand Down Expand Up @@ -116,6 +117,7 @@ def get_server_base_url() -> str:

# Number of retries if the connection breaks
connection_n_retries = _defaults["connection_n_retries"]
max_retries = _defaults["max_retries"]


class ConfigurationForExamples:
Expand Down Expand Up @@ -183,6 +185,7 @@ def _setup():
global cache_directory
global avoid_duplicate_runs
global connection_n_retries
global max_retries

# read config file, create cache directory
try:
Expand All @@ -207,10 +210,11 @@ def _setup():

avoid_duplicate_runs = config.getboolean("FAKE_SECTION", "avoid_duplicate_runs")
connection_n_retries = config.get("FAKE_SECTION", "connection_n_retries")
if connection_n_retries > 20:
max_retries = config.get("FAKE_SECTION", "max_retries")
if connection_n_retries > max_retries:
raise ValueError(
"A higher number of retries than 20 is not allowed to keep the "
"server load reasonable"
"A higher number of retries than {} is not allowed to keep the "
"server load reasonable".format(max_retries)
)


Expand Down
55 changes: 53 additions & 2 deletions openml/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
import shutil
import sys
import time
from typing import Dict
from typing import Dict, Union, cast
import unittest
import warnings
import pandas as pd

# Currently, importing oslo raises a lot of warning that it will stop working
# under python3.8; remove this once they disappear
Expand All @@ -18,6 +19,7 @@

import openml
from openml.tasks import TaskType
from openml.exceptions import OpenMLServerException

import logging

Expand Down Expand Up @@ -252,6 +254,55 @@ def _check_fold_timing_evaluations(
self.assertLessEqual(evaluation, max_val)


def check_task_existence(
    task_type: TaskType, dataset_id: int, target_name: str, **kwargs
) -> Union[int, None]:
    """Check whether a task matching the given meta-data exists on the test server.

    Parameters
    ----------
    task_type : openml.tasks.TaskType
        Task type used to filter the task listing.
    dataset_id : int
        Id of the dataset the task must be defined on.
    target_name : str
        Name of the target feature the task must use.
    kwargs
        Additional task attributes that must match exactly, compared via
        ``getattr(task, key) == value``.

    Returns
    -------
    int or None
        Id of the first task matching all criteria, or None if none matches.
    """
    tasks = openml.tasks.list_tasks(task_type=task_type, output_format="dataframe")
    if len(tasks) == 0:
        return None
    tasks = cast(pd.DataFrame, tasks).loc[tasks["did"] == dataset_id]
    if len(tasks) == 0:
        return None
    tasks = tasks.loc[tasks["target_feature"] == target_name]
    if len(tasks) == 0:
        return None
    for task_id in tasks["tid"].to_list():
        try:
            task = openml.tasks.get_task(task_id)
        except OpenMLServerException:
            # The task may have been deleted by another unit test running in parallel.
            continue
        # First task whose attributes satisfy every extra criterion is the match;
        # this replaces the original append/pop bookkeeping with the same result.
        if all(getattr(task, key) == value for key, value in kwargs.items()):
            return task_id
    return None


try:
from sklearn.impute import SimpleImputer
except ImportError:
Expand All @@ -275,4 +326,4 @@ def cat(X):
return X.dtypes == "category"


__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "cat", "cont"]
__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "cat", "cont", "check_task_existence"]
1 change: 1 addition & 0 deletions openml/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from functools import wraps
import collections

import openml
import openml._api_calls
import openml.exceptions
from . import config
Expand Down
28 changes: 23 additions & 5 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
DATASETS_CACHE_DIR_NAME,
)
from openml.datasets import fork_dataset, edit_dataset
from openml.tasks import TaskType, create_task


class TestOpenMLDataset(TestBase):
Expand Down Expand Up @@ -414,9 +415,8 @@ def test__getarff_md5_issue(self):
}
self.assertRaisesRegex(
OpenMLHashException,
"Checksum ad484452702105cbf3d30f8deaba39a9 of downloaded file "
"is unequal to the expected checksum abc. "
"Raised when downloading dataset 5.",
"Checksum of downloaded file is unequal to the expected checksum abc when downloading "
"https://www.openml.org/data/download/61. Raised when downloading dataset 5.",
_get_dataset_arff,
description,
)
Expand Down Expand Up @@ -498,6 +498,7 @@ def test_upload_dataset_with_url(self):
)
self.assertIsInstance(dataset.dataset_id, int)

@pytest.mark.flaky()
def test_data_status(self):
dataset = OpenMLDataset(
"%s-UploadTestWithURL" % self._get_sentinel(),
Expand Down Expand Up @@ -1350,7 +1351,7 @@ def test_data_edit_errors(self):
"original_data_url, default_target_attribute, row_id_attribute, "
"ignore_attribute or paper_url to edit.",
edit_dataset,
data_id=564,
data_id=64, # blood-transfusion-service-center
)
# Check server exception when unknown dataset is provided
self.assertRaisesRegex(
Expand All @@ -1360,15 +1361,32 @@ def test_data_edit_errors(self):
data_id=999999,
description="xor operation dataset",
)

# Need to own a dataset to be able to edit meta-data
# Will be creating a forked version of an existing dataset to allow the unit test user
# to edit meta-data of a dataset
did = fork_dataset(1)
self._wait_for_dataset_being_processed(did)
TestBase._mark_entity_for_removal("data", did)
# Need to upload a task attached to this data to test edit failure
task = create_task(
task_type=TaskType.SUPERVISED_CLASSIFICATION,
dataset_id=did,
target_name="class",
estimation_procedure_id=1,
)
task = task.publish()
TestBase._mark_entity_for_removal("task", task.task_id)
# Check server exception when owner/admin edits critical fields of dataset with tasks
self.assertRaisesRegex(
OpenMLServerException,
"Critical features default_target_attribute, row_id_attribute and ignore_attribute "
"can only be edited for datasets without any tasks.",
edit_dataset,
data_id=223,
data_id=did,
default_target_attribute="y",
)

# Check server exception when a non-owner or non-admin tries to edit critical fields
self.assertRaisesRegex(
OpenMLServerException,
Expand Down
Loading