Skip to content
Merged
1 change: 0 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ env:
- TEST_DIR=/tmp/test_dir/
- MODULE=openml
matrix:
- DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.21.2"
- DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" TEST_DIST="true"
- DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" RUN_FLAKE8="true" SKIP_TESTS="true"
- DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true"
Expand Down
4 changes: 3 additions & 1 deletion doc/progress.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@ Changelog
~~~~~~

* FIX #873: Fixes an issue which resulted in incorrect URLs when printing OpenML objects after
switching the server
switching the server.
* FIX #885: Logger no longer registered by default. Added utility functions to easily register
logging to console and file.
* MAINT #767: Source distribution installation is now unit-tested.
* MAINT #836: OpenML supports only pandas version 1.0.0 or above.
* MAINT #865: OpenML no longer bundles test files in the source distribution.
* MAINT #897: Dropping support for Python 3.5.
* ADD #894: Support caching of datasets using feather format as an option.

0.10.2
Expand Down
8 changes: 4 additions & 4 deletions examples/30_extended/create_upload_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,15 +283,15 @@


############################################################################
# Dataset is a pandas sparse dataframe
# ====================================
# Dataset is a pandas dataframe with sparse columns
# =================================================

sparse_data = coo_matrix((
[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
[1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0],
([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
))
column_names = ['input1', 'input2', 'y']
df = pd.SparseDataFrame(sparse_data, columns=column_names)
df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names)
print(df.info())

xor_dataset = create_dataset(
Expand Down
2 changes: 1 addition & 1 deletion examples/30_extended/datasets_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
# Get the actual data.
#
# The dataset can be returned in 3 possible formats: as a NumPy array, a SciPy
# sparse matrix, or as a Pandas DataFrame (or SparseDataFrame). The format is
# sparse matrix, or as a Pandas DataFrame. The format is
# controlled with the parameter ``dataset_format`` which can be either 'array'
# or 'dataframe' (default). Let's first build our dataset from a NumPy array
# and manually create a dataframe.
Expand Down
6 changes: 2 additions & 4 deletions openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,9 +551,7 @@ def _encode_if_category(column):
)
elif array_format == "dataframe":
if scipy.sparse.issparse(data):
return pd.SparseDataFrame(data, columns=attribute_names)
else:
return data
return pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names)
else:
data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data"
logger.warning(
Expand Down Expand Up @@ -602,7 +600,7 @@ def get_data(
dataset_format : string (default='dataframe')
The format of returned dataset.
If ``array``, the returned dataset will be a NumPy array or a SciPy sparse matrix.
If ``dataframe``, the returned dataset will be a Pandas DataFrame or SparseDataFrame.
If ``dataframe``, the returned dataset will be a Pandas DataFrame.

Returns
-------
Expand Down
9 changes: 4 additions & 5 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -672,7 +672,7 @@ def create_dataset(name, description, creator, contributor,
class:`openml.OpenMLDataset`
Dataset description."""

if isinstance(data, (pd.DataFrame, pd.SparseDataFrame)):
if isinstance(data, pd.DataFrame):
# infer the row id from the index of the dataset
if row_id_attribute is None:
row_id_attribute = data.index.name
Expand All @@ -684,8 +684,7 @@ def create_dataset(name, description, creator, contributor,
if attributes == 'auto' or isinstance(attributes, dict):
if not hasattr(data, "columns"):
raise ValueError("Automatically inferring attributes requires "
"a pandas DataFrame or SparseDataFrame. "
"A {!r} was given instead.".format(data))
"a pandas DataFrame. A {!r} was given instead.".format(data))
# infer the type of data for each column of the DataFrame
attributes_ = attributes_arff_from_df(data)
if isinstance(attributes, dict):
Expand All @@ -708,8 +707,8 @@ def create_dataset(name, description, creator, contributor,
)

if hasattr(data, "columns"):
if isinstance(data, pd.SparseDataFrame):
data = data.to_coo()
if all(isinstance(dtype, pd.SparseDtype) for dtype in data.dtypes):
data = data.sparse.to_coo()
# liac-arff only supports COO matrices with sorted rows
row_idx_sorted = np.argsort(data.row)
data.row = data.row[row_idx_sorted]
Expand Down
9 changes: 4 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
with open("openml/__version__.py") as fh:
version = fh.readlines()[-1].split()[-1].strip("\"'")

if sys.version_info < (3, 5):
if sys.version_info < (3, 6):
raise ValueError(
'Unsupported Python version {}.{}.{} found. OpenML requires Python 3.5 or higher.'
'Unsupported Python version {}.{}.{} found. OpenML requires Python 3.6 or higher.'
.format(sys.version_info.major, sys.version_info.minor, sys.version_info.micro)
)

Expand Down Expand Up @@ -42,14 +42,14 @@
exclude=["*.tests", "*.tests.*", "tests.*", "tests"],
),
package_data={'': ['*.txt', '*.md']},
python_requires=">=3.5",
python_requires=">=3.6",
install_requires=[
'liac-arff>=2.4.0',
'xmltodict',
'requests',
'scikit-learn>=0.18',
'python-dateutil', # Installed through pandas anyway.
'pandas>=0.19.2, <1.0.0',
'pandas>=1.0.0',
'scipy>=0.13.3',
'numpy>=1.6.2',
],
Expand Down Expand Up @@ -92,6 +92,5 @@
'Operating System :: Unix',
'Operating System :: MacOS',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7'])
4 changes: 3 additions & 1 deletion tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,9 @@ def test_get_sparse_dataset(self):

def test_get_sparse_dataframe(self):
rval, *_ = self.sparse_dataset.get_data()
self.assertTrue(isinstance(rval, pd.SparseDataFrame))
self.assertIsInstance(rval, pd.DataFrame)
np.testing.assert_array_equal(
[pd.SparseDtype(np.float32, fill_value=0.0)] * len(rval.dtypes), rval.dtypes)
self.assertEqual((600, 20001), rval.shape)

def test_get_sparse_dataset_with_rowid(self):
Expand Down
17 changes: 7 additions & 10 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,12 +561,9 @@ def test_attributes_arff_from_df(self):
('string', 'STRING'),
('category', ['A', 'B']),
('boolean', ['True', 'False'])])
# SparseDataFrame case
df = pd.SparseDataFrame([[1, 1.0],
[2, 2.0],
[0, 0]],
columns=['integer', 'floating'],
default_fill_value=0)
# DataFrame with Sparse columns case
df = pd.DataFrame({"integer": pd.arrays.SparseArray([1, 2, 0], fill_value=0),
"floating": pd.arrays.SparseArray([1.0, 2.0, 0], fill_value=0.0)})
df['integer'] = df['integer'].astype(np.int64)
attributes = attributes_arff_from_df(df)
self.assertEqual(attributes, [('integer', 'INTEGER'),
Expand Down Expand Up @@ -925,15 +922,15 @@ def test_create_dataset_pandas(self):
"Uploaded ARFF does not match original one"
)

# Check that SparseDataFrame are supported properly
# Check that DataFrames with sparse columns are supported properly
sparse_data = scipy.sparse.coo_matrix((
[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
))
column_names = ['input1', 'input2', 'y']
df = pd.SparseDataFrame(sparse_data, columns=column_names)
df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names)
# meta-information
description = 'Synthetic dataset created from a Pandas SparseDataFrame'
description = 'Synthetic dataset created from a Pandas DataFrame with Sparse columns'
dataset = openml.datasets.functions.create_dataset(
name=name,
description=description,
Expand Down