Merged
Changes from all commits
41 commits
5a19931
init feather implementation
sahithyaravi Nov 6, 2019
87907d9
Merge remote-tracking branch 'origin/develop' into feather_investigation
sahithyaravi Nov 8, 2019
6d2f5c3
sparse matrix
sahithyaravi Nov 8, 2019
33881ea
test notebook
sahithyaravi Nov 8, 2019
55743bd
feather pickle compare
sahithyaravi Nov 8, 2019
1437005
test arrow vs feather
sahithyaravi Nov 11, 2019
ef461d7
Merge remote-tracking branch 'origin/develop' into feather_investigation
sahithyaravi Nov 11, 2019
5c27237
add columns condition
sahithyaravi Nov 11, 2019
f61d9b5
Testing
sahithyaravi Nov 14, 2019
484869e
Merge branch 'develop' into feather_investigation
sahithyaravi Jan 6, 2020
3c513b0
get_dataset add cache format
sahithyaravi Jan 8, 2020
0b3d781
add pyarrow
sahithyaravi Jan 8, 2020
a9becf1
sparse matrix check
sahithyaravi Jan 8, 2020
aff8aff
pep8 and remove files
sahithyaravi Jan 8, 2020
48e2a16
return type
sahithyaravi Jan 8, 2020
19c22fe
fix type annotation
sahithyaravi Jan 8, 2020
98be055
value check
sahithyaravi Jan 8, 2020
112eb1d
change feather condition
sahithyaravi Jan 10, 2020
99fac3d
fixes and test
sahithyaravi Jan 11, 2020
cf3cbad
fix errors
sahithyaravi Jan 13, 2020
7583e88
Merge branch 'develop' into feather_investigation
sahithyaravi Jan 13, 2020
09d6bdb
testing file
sahithyaravi Jan 13, 2020
3aff927
feather new file for attributes
sahithyaravi Jan 14, 2020
b521534
change feather attribute file path
sahithyaravi Jan 14, 2020
8eb77cf
delete testing file
sahithyaravi Jan 14, 2020
4894bbd
testing changes
sahithyaravi Jan 14, 2020
b6839b1
delete pkls
sahithyaravi Jan 14, 2020
131bdad
fixes
sahithyaravi Jan 14, 2020
aeb9b98
fixes
sahithyaravi Jan 14, 2020
865d4dc
add comments
sahithyaravi Jan 15, 2020
701496f
change default caching
sahithyaravi Jan 22, 2020
f689897
pip version
sahithyaravi Jan 27, 2020
74f359e
review comment fixes
sahithyaravi Jan 29, 2020
19272e5
newline
sahithyaravi Jan 29, 2020
09a5469
fix if condition
sahithyaravi Jan 29, 2020
f0da5a1
Update install.sh
sahithyaravi Feb 3, 2020
ed8ca7b
pandas verison due to sparse data
sahithyaravi Feb 3, 2020
d7488f7
review #2
sahithyaravi Feb 11, 2020
d09c431
Update appveyor.yml
sahithyaravi Feb 17, 2020
bf44356
Update appveyor.yml
sahithyaravi Feb 18, 2020
e6bc0b0
rename cache dir
sahithyaravi Feb 18, 2020
6 changes: 3 additions & 3 deletions appveyor.yml
@@ -5,10 +5,10 @@ environment:
 # CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\appveyor\\scikit-learn-contrib\\run_with_env.cmd"

 matrix:
-  - PYTHON: "C:\\Python35-x64"
-    PYTHON_VERSION: "3.5"
+  - PYTHON: "C:\\Python3-x64"
+    PYTHON_VERSION: "3.6"
     PYTHON_ARCH: "64"
-    MINICONDA: "C:\\Miniconda35-x64"
+    MINICONDA: "C:\\Miniconda36-x64"

 matrix:
   fast_finish: true
2 changes: 1 addition & 1 deletion ci_scripts/install.sh
@@ -35,7 +35,7 @@ fi
 python --version

 if [[ "$TEST_DIST" == "true" ]]; then
-    pip install twine nbconvert jupyter_client matplotlib pytest pytest-xdist pytest-timeout \
+    pip install twine nbconvert jupyter_client matplotlib pyarrow pytest pytest-xdist pytest-timeout \
         nbformat oslo.concurrency flaky
     python setup.py sdist
     # Find file which was modified last as done in https://stackoverflow.com/a/4561987
1 change: 1 addition & 0 deletions doc/progress.rst
@@ -15,6 +15,7 @@ Changelog
   logging to console and file.
 * MAINT #767: Source distribution installation is now unit-tested.
 * MAINT #865: OpenML no longer bundles test files in the source distribution.
+* ADD #894: Support caching of datasets using feather format as an option.

 0.10.2
 ~~~~~~
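A minimal usage sketch of the new option added by this PR (dataset id 61, 'iris', is chosen only for illustration):

import openml

# Cache the dataset on disk in feather format instead of the default pickle.
dataset = openml.datasets.get_dataset(61, cache_format='feather')
# Subsequent loads read the cached .feather file instead of re-parsing the ARFF.
X, y, categorical, attribute_names = dataset.get_data()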
80 changes: 61 additions & 19 deletions openml/datasets/dataset.py
@@ -36,6 +36,8 @@ class OpenMLDataset(OpenMLBase):
         Description of the dataset.
     format : str
         Format of the dataset which can be either 'arff' or 'sparse_arff'.
+    cache_format : str
+        Format for caching the dataset which can be either 'feather' or 'pickle'.
     dataset_id : int, optional
         Id autogenerated by the server.
     version : int, optional
@@ -99,7 +101,8 @@ class OpenMLDataset(OpenMLBase):
         Serialized arff dataset string.
     """
     def __init__(self, name, description, format=None,
Review comment (Collaborator):
I'm somewhat surprised that this one isn't annotated. @Neeratyoy could you please add to your stack to figure out why this is legal given that we have mypy running?

Reply (Contributor):
Sure. Just to confirm the task: I have to check why the missing annotation for cache_format was never caught.

Reply (Collaborator):
Yes, that's correct.

-                 data_format='arff', dataset_id=None, version=None,
+                 data_format='arff', cache_format='pickle',
+                 dataset_id=None, version=None,
                  creator=None, contributor=None, collection_date=None,
                  upload_date=None, language=None, licence=None,
                  url=None, default_target_attribute=None,
@@ -127,6 +130,11 @@ def __init__(self, name, description, format=None,
         self.name = name
         self.version = int(version) if version is not None else None
         self.description = description
+        if cache_format not in ['feather', 'pickle']:
+            raise ValueError("cache_format must be one of 'feather' or 'pickle'. "
+                             "Invalid format specified: {}".format(cache_format))
+
+        self.cache_format = cache_format
         if format is None:
             self.format = data_format
         else:
@@ -180,9 +188,11 @@ def __init__(self, name, description, format=None,
         self.qualities = _check_qualities(qualities)

         if data_file is not None:
-            self.data_pickle_file = self._create_pickle_in_cache(data_file)
+            self.data_pickle_file, self.data_feather_file,\
+                self.feather_attribute_file = self._create_pickle_in_cache(data_file)
         else:
-            self.data_pickle_file = None
+            self.data_pickle_file, self.data_feather_file, \
+                self.feather_attribute_file = None, None, None

     @property
     def id(self) -> Optional[int]:
@@ -396,18 +406,20 @@ def _parse_data_from_arff(

         return X, categorical, attribute_names

-    def _create_pickle_in_cache(self, data_file: str) -> str:
+    def _create_pickle_in_cache(self, data_file: str) -> Tuple[str, str, str]:
         """ Parse the arff and pickle the result. Update any old pickle objects. """
         data_pickle_file = data_file.replace('.arff', '.pkl.py3')
-        if os.path.exists(data_pickle_file):
+        data_feather_file = data_file.replace('.arff', '.feather')
+        feather_attribute_file = data_file.replace('.arff', '.feather.attributes.pkl.py3')
+        if os.path.exists(data_pickle_file) and self.cache_format == 'pickle':
             # Load the data to check if the pickle file is outdated (i.e. contains numpy array)
             with open(data_pickle_file, "rb") as fh:
                 try:
                     data, categorical, attribute_names = pickle.load(fh)
                 except EOFError:
                     # The file is likely corrupt, see #780.
                     # We deal with this when loading the data in `_load_data`.
-                    return data_pickle_file
+                    return data_pickle_file, data_feather_file, feather_attribute_file

             # Between v0.8 and v0.9 the format of pickled data changed from
             # np.ndarray to pd.DataFrame. This breaks some backwards compatibility,
@@ -416,32 +428,62 @@ def _create_pickle_in_cache(self, data_file: str) -> str:
             # pd.DataFrame blob. See also #646.
             if isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data):
                 logger.debug("Data pickle file already exists and is up to date.")
-                return data_pickle_file
+                return data_pickle_file, data_feather_file, feather_attribute_file
+        elif os.path.exists(data_feather_file) and self.cache_format == 'feather':
+            # Load the data to check if the pickle file is outdated (i.e. contains numpy array)
+            try:
+                data = pd.read_feather(data_feather_file)
+            except EOFError:
+                # The file is likely corrupt, see #780.
+                # We deal with this when loading the data in `_load_data`.
+                return data_pickle_file, data_feather_file, feather_attribute_file
+
+            logger.debug("Data feather file already exists and is up to date.")
+            return data_pickle_file, data_feather_file, feather_attribute_file

         # At this point either the pickle file does not exist, or it had outdated formatting.
         # We parse the data from arff again and populate the cache with a recent pickle file.
         X, categorical, attribute_names = self._parse_data_from_arff(data_file)

-        with open(data_pickle_file, "wb") as fh:
-            pickle.dump((X, categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
-        logger.debug("Saved dataset {did}: {name} to file {path}"
-                     .format(did=int(self.dataset_id or -1),
-                             name=self.name,
-                             path=data_pickle_file)
-                     )
-
-        return data_pickle_file
+        # Feather format does not work for sparse datasets, so we use pickle for sparse datasets
+        if self.cache_format == "feather" and not scipy.sparse.issparse(X):
+            logger.info("feather write {}".format(self.name))
+            X.to_feather(data_feather_file)
+            with open(feather_attribute_file, "wb") as fh:
+                pickle.dump((categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
+        else:
+            logger.info("pickle write {}".format(self.name))
+            self.cache_format = 'pickle'
+            with open(data_pickle_file, "wb") as fh:
+                pickle.dump((X, categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
+        logger.debug("Saved dataset {did}: {name} to file {path}"
+                     .format(did=int(self.dataset_id or -1),
+                             name=self.name,
+                             path=data_pickle_file)
+                     )
+        return data_pickle_file, data_feather_file, feather_attribute_file

     def _load_data(self):
         """ Load data from pickle or arff. Download data first if not present on disk. """
-        if self.data_pickle_file is None:
+        if (self.cache_format == 'pickle' and self.data_pickle_file is None) or \
+                (self.cache_format == 'feather' and self.data_feather_file is None):
             if self.data_file is None:
                 self._download_data()
-            self.data_pickle_file = self._create_pickle_in_cache(self.data_file)
+            self.data_pickle_file, self.data_feather_file, self.feather_attribute_file = \
+                self._create_pickle_in_cache(self.data_file)

         try:
-            with open(self.data_pickle_file, "rb") as fh:
-                data, categorical, attribute_names = pickle.load(fh)
+            if self.cache_format == 'feather':
+                logger.info("feather load data {}".format(self.name))
+                data = pd.read_feather(self.data_feather_file)
+
+                with open(self.feather_attribute_file, "rb") as fh:
+                    categorical, attribute_names = pickle.load(fh)
+            else:
+                logger.info("pickle load data {}".format(self.name))
+                with open(self.data_pickle_file, "rb") as fh:
+                    data, categorical, attribute_names = pickle.load(fh)
         except EOFError:
             logger.warning(
                 "Detected a corrupt cache file loading dataset %d: '%s'. "
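The hunks above split the feather cache across two files because feather serializes only the DataFrame itself; the categorical mask and attribute names still go through pickle. A self-contained sketch of that read path, with illustrative file names mirroring the .feather / .feather.attributes.pkl.py3 pair used above:

import pickle

import pandas as pd

def load_feather_cache(feather_file: str, attribute_file: str):
    # The DataFrame lives in the feather file...
    data = pd.read_feather(feather_file)
    # ...while the metadata that feather cannot store lives in a side pickle.
    with open(attribute_file, "rb") as fh:
        categorical, attribute_names = pickle.load(fh)
    return data, categorical, attribute_names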
18 changes: 15 additions & 3 deletions openml/datasets/functions.py
@@ -451,7 +451,8 @@ def get_dataset(
     dataset_id: Union[int, str],
     download_data: bool = True,
     version: int = None,
-    error_if_multiple: bool = False
+    error_if_multiple: bool = False,
+    cache_format: str = 'pickle'
 ) -> OpenMLDataset:
     """ Download the OpenML dataset representation, optionally also download actual data file.

@@ -479,12 +480,19 @@ def get_dataset(
         If no version is specified, retrieve the least recent still active version.
     error_if_multiple : bool, optional (default=False)
         If ``True`` raise an error if multiple datasets are found with matching criteria.
+    cache_format : str, optional (default='pickle')
+        Format for caching the dataset: may be 'feather' or 'pickle'.
+        Note that the default 'pickle' option may load slower than 'feather'
+        when the number of rows is very large.

     Returns
     -------
     dataset : :class:`openml.OpenMLDataset`
         The downloaded dataset.
     """
+    if cache_format not in ['feather', 'pickle']:
+        raise ValueError("cache_format must be one of 'feather' or 'pickle'. "
+                         "Invalid format specified: {}".format(cache_format))

     if isinstance(dataset_id, str):
         try:
             dataset_id = int(dataset_id)
@@ -527,7 +535,7 @@
                                                           did_cache_dir)

     dataset = _create_dataset_from_description(
-        description, features, qualities, arff_file
+        description, features, qualities, arff_file, cache_format
     )
     return dataset

@@ -975,6 +983,7 @@ def _create_dataset_from_description(
     features: Dict,
     qualities: List,
     arff_file: str = None,
+    cache_format: str = 'pickle',
 ) -> OpenMLDataset:
     """Create a dataset object from a description dict.

@@ -988,6 +997,8 @@
         Description of a dataset qualities.
     arff_file : string, optional
         Path of dataset ARFF file.
+    cache_format : string, optional
+        Caching format for the dataset ('feather' or 'pickle').

     Returns
     -------
@@ -1019,6 +1030,7 @@
         update_comment=description.get("oml:update_comment"),
         md5_checksum=description.get("oml:md5_checksum"),
         data_file=arff_file,
+        cache_format=cache_format,
         features=features,
         qualities=qualities,
     )
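The docstring above claims the default 'pickle' option may load slower than 'feather' for tall datasets; a rough, machine-dependent way to compare the two cache formats (dataset id 61 is an arbitrary small example, and the first call per format only warms the cache):

import time

import openml

for fmt in ('pickle', 'feather'):
    openml.datasets.get_dataset(61, cache_format=fmt)  # populate the cache
    start = time.perf_counter()
    openml.datasets.get_dataset(61, cache_format=fmt).get_data()
    print('{}: {:.3f}s'.format(fmt, time.perf_counter() - start))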
5 changes: 3 additions & 2 deletions setup.py
@@ -49,9 +49,9 @@
         'requests',
         'scikit-learn>=0.18',
         'python-dateutil',  # Installed through pandas anyway.
-        'pandas>=0.19.2',
+        'pandas>=0.19.2, <1.0.0',
         'scipy>=0.13.3',
-        'numpy>=1.6.2'
+        'numpy>=1.6.2',
     ],
     extras_require={
         'test': [
@@ -64,6 +64,7 @@
             'nbformat',
             'oslo.concurrency',
             'flaky',
+            'pyarrow'
         ],
         'examples': [
             'matplotlib',
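Note that pyarrow is only added under the 'test' extra, so cache_format='feather' depends on an optional package at runtime. A defensive import along these lines (an illustration, not code from this PR) makes the failure mode explicit:

# pandas uses pyarrow as the backend for pd.read_feather / DataFrame.to_feather.
try:
    import pyarrow  # noqa: F401
except ImportError as e:
    raise ImportError(
        "cache_format='feather' requires pyarrow; "
        "install it with `pip install pyarrow`."
    ) from e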
39 changes: 39 additions & 0 deletions tests/test_datasets/test_dataset_functions.py
@@ -1316,3 +1316,42 @@ def test_list_qualities(self):
         qualities = openml.datasets.list_qualities()
         self.assertEqual(isinstance(qualities, list), True)
         self.assertEqual(all([isinstance(q, str) for q in qualities]), True)
+
+    def test_get_dataset_cache_format_pickle(self):
+        dataset = openml.datasets.get_dataset(1)
+        self.assertEqual(type(dataset), OpenMLDataset)
+        self.assertEqual(dataset.name, 'anneal')
+        self.assertGreater(len(dataset.features), 1)
+        self.assertGreater(len(dataset.qualities), 4)
+
+        X, y, categorical, attribute_names = dataset.get_data()
+        self.assertIsInstance(X, pd.DataFrame)
+        self.assertEqual(X.shape, (898, 39))
+        self.assertEqual(len(categorical), X.shape[1])
+        self.assertEqual(len(attribute_names), X.shape[1])
+
+    def test_get_dataset_cache_format_feather(self):
+
+        dataset = openml.datasets.get_dataset(128, cache_format='feather')
+
+        # Check if dataset is written to cache directory using feather
+        cache_dir = openml.config.get_cache_directory()
+        cache_dir_for_id = os.path.join(cache_dir, 'datasets', '128')
+        feather_file = os.path.join(cache_dir_for_id, 'dataset.feather')
+        pickle_file = os.path.join(cache_dir_for_id, 'dataset.feather.attributes.pkl.py3')
+        data = pd.read_feather(feather_file)
+        self.assertTrue(os.path.isfile(feather_file), msg='Feather file is missing')
+        self.assertTrue(os.path.isfile(pickle_file), msg='Attributes pickle file is missing')
+        self.assertEqual(data.shape, (150, 5))
+
+        # Check if get_data is able to retrieve feather data
+        self.assertEqual(type(dataset), OpenMLDataset)
+        self.assertEqual(dataset.name, 'iris')
+        self.assertGreater(len(dataset.features), 1)
+        self.assertGreater(len(dataset.qualities), 4)
+
+        X, y, categorical, attribute_names = dataset.get_data()
+        self.assertIsInstance(X, pd.DataFrame)
+        self.assertEqual(X.shape, (150, 5))
+        self.assertEqual(len(categorical), X.shape[1])
+        self.assertEqual(len(attribute_names), X.shape[1])