@@ -901,7 +901,6 @@ def test_create_dataset_pandas(self):
901901 collection_date = "01-01-2018"
902902 language = "English"
903903 licence = "MIT"
904- default_target_attribute = "play"
905904 citation = "None"
906905 original_data_url = "http://openml.github.io/openml-python"
907906 paper_url = "http://openml.github.io/openml-python"
@@ -913,7 +912,7 @@ def test_create_dataset_pandas(self):
913912 collection_date = collection_date ,
914913 language = language ,
915914 licence = licence ,
916- default_target_attribute = default_target_attribute ,
915+ default_target_attribute = "play" ,
917916 row_id_attribute = None ,
918917 ignore_attribute = None ,
919918 citation = citation ,
@@ -948,7 +947,7 @@ def test_create_dataset_pandas(self):
948947 collection_date = collection_date ,
949948 language = language ,
950949 licence = licence ,
951- default_target_attribute = default_target_attribute ,
950+ default_target_attribute = "y" ,
952951 row_id_attribute = None ,
953952 ignore_attribute = None ,
954953 citation = citation ,
@@ -984,7 +983,7 @@ def test_create_dataset_pandas(self):
984983 collection_date = collection_date ,
985984 language = language ,
986985 licence = licence ,
987- default_target_attribute = default_target_attribute ,
986+ default_target_attribute = "rnd_str" ,
988987 row_id_attribute = None ,
989988 ignore_attribute = None ,
990989 citation = citation ,
@@ -1420,3 +1419,118 @@ def test_data_fork(self):
14201419 self .assertRaisesRegex (
14211420 OpenMLServerException , "Unknown dataset" , fork_dataset , data_id = 999999 ,
14221421 )
1422+
1423+
@pytest.mark.parametrize(
    "default_target_attribute,row_id_attribute,ignore_attribute",
    [
        ("wrong", None, None),
        (None, "wrong", None),
        (None, None, "wrong"),
        ("wrong,sunny", None, None),
        (None, None, "wrong,sunny"),
        (["wrong", "sunny"], None, None),
        (None, None, ["wrong", "sunny"]),
    ],
)
def test_invalid_attribute_validations(
    default_target_attribute, row_id_attribute, ignore_attribute
):
    """Check that ``create_dataset`` rejects attribute names absent from the data.

    Each parametrized case passes at least one name ("wrong" or "sunny") that
    is not a column of the frame, given as a single string, a comma-separated
    string, or a list. The call is expected to raise a ``ValueError`` whose
    message matches "should be one of the data attribute".
    """
    # Build the frame and enforce the type of each column in one step.
    # NOTE(review): casting the "TRUE"/"FALSE" strings with astype("bool")
    # presumably yields True for every row; only the column names matter here.
    frame = pd.DataFrame(
        [
            ["a", "sunny", 85.0, 85.0, "FALSE", "no"],
            ["b", "sunny", 80.0, 90.0, "TRUE", "no"],
            ["c", "overcast", 83.0, 86.0, "FALSE", "yes"],
            ["d", "rainy", 70.0, 96.0, "FALSE", "yes"],
            ["e", "rainy", 68.0, 80.0, "FALSE", "yes"],
        ],
        columns=["rnd_str", "outlook", "temperature", "humidity", "windy", "play"],
    ).astype({"outlook": "category", "windy": "bool", "play": "category"})

    # Meta-information that is identical for every parametrized case.
    metadata = {
        "name": "pandas_testing_dataset",
        "description": "Synthetic dataset created from a Pandas DataFrame",
        "creator": "OpenML tester",
        "contributor": None,
        "collection_date": "01-01-2018",
        "language": "English",
        "licence": "MIT",
        "citation": "None",
        "version_label": "test",
        "original_data_url": "http://openml.github.io/openml-python",
        "paper_url": "http://openml.github.io/openml-python",
    }

    with pytest.raises(ValueError, match="should be one of the data attribute"):
        openml.datasets.functions.create_dataset(
            default_target_attribute=default_target_attribute,
            row_id_attribute=row_id_attribute,
            ignore_attribute=ignore_attribute,
            attributes="auto",
            data=frame,
            **metadata,
        )
1481+
1482+
@pytest.mark.parametrize(
    "default_target_attribute,row_id_attribute,ignore_attribute",
    [
        ("outlook", None, None),
        (None, "outlook", None),
        (None, None, "outlook"),
        ("outlook,windy", None, None),
        (None, None, "outlook,windy"),
        (["outlook", "windy"], None, None),
        (None, None, ["outlook", "windy"]),
    ],
)
def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute):
    """Check that ``create_dataset`` accepts attribute names present in the data.

    Each parametrized case names only existing columns ("outlook", "windy"),
    given as a single string, a comma-separated string, or a list. The test
    passes when the call completes without raising; the returned object is
    not inspected.
    """
    # Build the frame and enforce the type of each column in one step.
    # NOTE(review): casting the "TRUE"/"FALSE" strings with astype("bool")
    # presumably yields True for every row; only the column names matter here.
    frame = pd.DataFrame(
        [
            ["a", "sunny", 85.0, 85.0, "FALSE", "no"],
            ["b", "sunny", 80.0, 90.0, "TRUE", "no"],
            ["c", "overcast", 83.0, 86.0, "FALSE", "yes"],
            ["d", "rainy", 70.0, 96.0, "FALSE", "yes"],
            ["e", "rainy", 68.0, 80.0, "FALSE", "yes"],
        ],
        columns=["rnd_str", "outlook", "temperature", "humidity", "windy", "play"],
    ).astype({"outlook": "category", "windy": "bool", "play": "category"})

    # Meta-information that is identical for every parametrized case.
    metadata = {
        "name": "pandas_testing_dataset",
        "description": "Synthetic dataset created from a Pandas DataFrame",
        "creator": "OpenML tester",
        "contributor": None,
        "collection_date": "01-01-2018",
        "language": "English",
        "licence": "MIT",
        "citation": "None",
        "version_label": "test",
        "original_data_url": "http://openml.github.io/openml-python",
        "paper_url": "http://openml.github.io/openml-python",
    }

    openml.datasets.functions.create_dataset(
        default_target_attribute=default_target_attribute,
        row_id_attribute=row_id_attribute,
        ignore_attribute=ignore_attribute,
        attributes="auto",
        data=frame,
        **metadata,
    )
0 commit comments