1+ package test_dataframes
2+
3+ import com.google.common.base.Stopwatch
4+ import krangl.*
5+
/**
 * Test the API of krangl to do some basic dataframe manipulations.
 *
 * Reads `urb_cpop1_1_Data.csv`, drops the rows whose `Value` is the missing-value
 * marker `:`, parses the remaining values as integers, averages duplicated
 * (CITIES, INDIC_UR, TIME) rows and spreads `TIME` into one column per year.
 * It then prints the ten rows with the largest `2017` value and the ten rows with
 * the largest 2010 -> 2016 growth, and hands the growth-ordered city names to
 * `CheckResult.checkResult`.
 *
 * https://github.com/holgerbrandl/krangl
 *
 * See https://medium.com/@thijser/doing-cool-data-science-in-java-how-3-dataframe-libraries-stack-up-5e6ccb7b437
 * for more information.
 */
fun main() {
    val data = DataFrame.readCSV("urb_cpop1_1_Data.csv")

    // The stopwatch is started only after the CSV has been read, so the reported
    // time covers the dataframe manipulations below but not the file load.
    val watch = Stopwatch.createStarted()

    // Remove missing values indicated with ":", then replace the "Value" column
    // with its integer-parsed contents.
    val filtered = data
        .filter { !(it["Value"] eq ":") }
        .addColumn("Value") { it["Value"].map(String::toInt) }

    // Replace duplicated rows with their mean value and create the pivot table
    // (one column per TIME entry), keeping only the "January, total" indicators.
    val cities = filtered
        .groupBy("CITIES", "INDIC_UR", "TIME")
        .summarize("Value" to { it["Value"].mean() })
        .spread("TIME", "Value")
        .filter { it["INDIC_UR"].isMatching<String> { endsWith("January, total") } }

    println(cities.select("CITIES", "2017").sortedByDescending("2017").head(10))

    // Relative change between the 2010 and 2016 columns, expressed in percent.
    val highestGrowthTable = cities
        .addColumn("growth") { (it["2016"] / it["2010"] - 1.0) * 100.0 }
        .sortedByDescending("growth")

    println(highestGrowthTable.select("CITIES", "growth").head(10))

    CheckResult.checkResult(highestGrowthTable["CITIES"].asStrings().toList())
    println("Total time: $watch")
}
0 commit comments