Fix file upload: rename the dataset attribute `data_format` to `format`

openml · mfeurer · Nov 3, 2020 · Oct 30, 2020 · Oct 30, 2020 · Oct 30, 2020
commit 0babe6ab48c537e0487e248388cc046e958cc49c
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -176,7 +176,8 @@ def find_invalid_characters(string, pattern):
             )
 
         self.cache_format = cache_format
-        self.data_format = data_format
+        # Must be named `format`, not `data_format`; otherwise the XML upload fails.
+        self.format = data_format
         self.creator = creator
         self.contributor = contributor
         self.collection_date = collection_date
@@ -244,7 +245,7 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
         fields = {
             "Name": self.name,
             "Version": self.version,
-            "Format": self.data_format,
+            "Format": self.format,
             "Licence": self.licence,
             "Download URL": self.url,
             "Data file": self.data_file,
@@ -377,7 +378,7 @@ def _parse_data_from_arff(
             List[str]: List of column names.
         """
         try:
-            data = self._get_arff(self.data_format)
+            data = self._get_arff(self.format)
         except OSError as e:
             logger.critical(
                 "Please check that the data file {} is "
@@ -398,7 +399,7 @@ def _parse_data_from_arff(
         for i, (name, type_) in enumerate(data["attributes"]):
             # if the feature is nominal and a sparse matrix is
             # requested, the categories need to be numeric
-            if isinstance(type_, list) and self.data_format.lower() == "sparse_arff":
+            if isinstance(type_, list) and self.format.lower() == "sparse_arff":
                 try:
                     # checks if the strings which should be the class labels
                     # can be encoded into integers
@@ -408,7 +409,7 @@ def _parse_data_from_arff(
                         "Categorical data needs to be numeric when " "using sparse ARFF."
                     )
             # string can only be supported with pandas DataFrame
-            elif type_ == "STRING" and self.data_format.lower() == "sparse_arff":
+            elif type_ == "STRING" and self.format.lower() == "sparse_arff":
                 raise ValueError("Dataset containing strings is not supported " "with sparse ARFF.")
 
             # infer the dtype from the ARFF header
@@ -431,12 +432,12 @@ def _parse_data_from_arff(
                 attribute_dtype[name] = ARFF_DTYPES_TO_PD_DTYPE[type_]
             attribute_names.append(name)
 
-        if self.data_format.lower() == "sparse_arff":
+        if self.format.lower() == "sparse_arff":
             X = data["data"]
             X_shape = (max(X[1]) + 1, max(X[2]) + 1)
             X = scipy.sparse.coo_matrix((X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
             X = X.tocsr()
-        elif self.data_format.lower() == "arff":
+        elif self.format.lower() == "arff":
             X = pd.DataFrame(data["data"], columns=attribute_names)
 
             col = []
@@ -460,7 +461,7 @@ def _parse_data_from_arff(
                     col.append(X[column_name])
             X = pd.concat(col, axis=1)
         else:
-            raise ValueError("Dataset format '{}' is not a valid format.".format(self.data_format))
+            raise ValueError("Dataset format '{}' is not a valid format.".format(self.format))
 
         return X, categorical, attribute_names
 

diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -860,7 +860,7 @@ def test_get_online_dataset_arff(self):
         decoder = arff.ArffDecoder()
         # check if the arff from the dataset is
         # the same as the arff from _get_arff function
-        d_format = (dataset.data_format).lower()
+        d_format = (dataset.format).lower()
 
         self.assertEqual(
             dataset._get_arff(d_format),
@@ -879,7 +879,7 @@ def test_get_online_dataset_format(self):
         dataset = openml.datasets.get_dataset(dataset_id, download_data=False)
 
         self.assertEqual(
-            (dataset.data_format).lower(),
+            (dataset.format).lower(),
             _get_online_dataset_format(dataset_id),
             "The format of the ARFF files is different",
         )