[KDF] Finish tests and implementation for the `parse` operation
diff --git a/plugins/kotlin-dataframe/kotlin-dataframe.k2/src/org/jetbrains/kotlinx/dataframe/plugin/impl/api/parse.kt b/plugins/kotlin-dataframe/kotlin-dataframe.k2/src/org/jetbrains/kotlinx/dataframe/plugin/impl/api/parse.kt index 9690f12..6a35a25 100644 --- a/plugins/kotlin-dataframe/kotlin-dataframe.k2/src/org/jetbrains/kotlinx/dataframe/plugin/impl/api/parse.kt +++ b/plugins/kotlin-dataframe/kotlin-dataframe.k2/src/org/jetbrains/kotlinx/dataframe/plugin/impl/api/parse.kt
@@ -5,38 +5,90 @@ package org.jetbrains.kotlinx.dataframe.plugin.impl.api -import org.jetbrains.kotlinx.dataframe.plugin.impl.AbstractSchemaModificationInterpreter -import org.jetbrains.kotlinx.dataframe.plugin.impl.Arguments -import org.jetbrains.kotlinx.dataframe.plugin.impl.PluginDataFrameSchema -import org.jetbrains.kotlinx.dataframe.plugin.impl.Present -import org.jetbrains.kotlinx.dataframe.plugin.impl.dataFrame -import org.jetbrains.kotlinx.dataframe.plugin.impl.ignore +import org.jetbrains.kotlin.fir.SessionHolder +import org.jetbrains.kotlin.fir.types.isCharOrNullableChar +import org.jetbrains.kotlin.fir.types.isMarkedNullable +import org.jetbrains.kotlin.fir.types.isNullableString +import org.jetbrains.kotlin.fir.types.isString +import org.jetbrains.kotlinx.dataframe.columns.toColumnSet +import org.jetbrains.kotlinx.dataframe.plugin.extensions.wrap +import org.jetbrains.kotlinx.dataframe.plugin.impl.* +/** + * `df.parse { ... }` + */ class Parse : AbstractSchemaModificationInterpreter() { val Arguments.receiver: PluginDataFrameSchema by dataFrame() val Arguments.options by ignore() val Arguments.columns: ColumnsResolver by arg() - override fun Arguments.interpret(): PluginDataFrameSchema { - TODO("Not yet implemented") - } + override fun Arguments.interpret(): PluginDataFrameSchema = + receiver.convertAsColumn(columns) { it.changeParsableType() } } -class StringParse : AbstractSchemaModificationInterpreter() { +/** + * `df.parse("a", "b")` + */ +class ParseString : AbstractSchemaModificationInterpreter() { val Arguments.receiver: PluginDataFrameSchema by dataFrame() val Arguments.options by ignore() val Arguments.columns: List<String> by arg(defaultValue = Present(emptyList())) - override fun Arguments.interpret(): PluginDataFrameSchema { - TODO("Not yet implemented") - } + override fun Arguments.interpret(): PluginDataFrameSchema = + receiver + .insertImpliedColumns(columns) + .convertAsColumn(columnsResolver { columns.toColumnSet() }) { + 
it.changeParsableType() + } } +/** + * `df.parse()` + */ class ParseDefault : AbstractSchemaModificationInterpreter() { val Arguments.receiver: PluginDataFrameSchema by dataFrame() val Arguments.options by ignore() - override fun Arguments.interpret(): PluginDataFrameSchema { - TODO("Not yet implemented") + override fun Arguments.interpret(): PluginDataFrameSchema = + PluginDataFrameSchema( + // We simply take the top-level columns. + // In the library implementation `parse()` calls `colsAtAnyDepth()` + // but since `changeParsableType()` is recursive anyway, the result is the same + columns = receiver.columns().map { it.changeParsableType() }, + ) +} + +private fun SimpleDataColumn.canBeParsed() = + type.coneType.let { + it.isString || it.isNullableString || it.isCharOrNullableChar } -} \ No newline at end of file + +/** + * Changes column types: + * - `String(?)` -> `Any(?)` + * - `Char(?)` -> `Any(?)` + * + * For this column, or for all columns under this column at any depth. + */ +context(context: SessionHolder) +private fun SimpleCol.changeParsableType(): SimpleCol = + when (this) { + is SimpleColumnGroup -> + this.copy( + columns = columns().map { it.changeParsableType() }, + ) + is SimpleFrameColumn -> + this.copy( + columns = columns().map { it.changeParsableType() }, + ) + is SimpleDataColumn if canBeParsed() -> + this.copy( + type = + when (type.coneType.isMarkedNullable) { + true -> context.session.builtinTypes.nullableAnyType + false -> context.session.builtinTypes.anyType + }.coneType.wrap(), + ) + + is SimpleDataColumn -> this + } \ No newline at end of file
diff --git a/plugins/kotlin-dataframe/kotlin-dataframe.k2/src/org/jetbrains/kotlinx/dataframe/plugin/loadInterpreter.kt b/plugins/kotlin-dataframe/kotlin-dataframe.k2/src/org/jetbrains/kotlinx/dataframe/plugin/loadInterpreter.kt index 745ed4b..a5b29e1 100644 --- a/plugins/kotlin-dataframe/kotlin-dataframe.k2/src/org/jetbrains/kotlinx/dataframe/plugin/loadInterpreter.kt +++ b/plugins/kotlin-dataframe/kotlin-dataframe.k2/src/org/jetbrains/kotlinx/dataframe/plugin/loadInterpreter.kt
@@ -249,6 +249,7 @@ import org.jetbrains.kotlinx.dataframe.plugin.impl.api.PairConstructor import org.jetbrains.kotlinx.dataframe.plugin.impl.api.PairToConstructor import org.jetbrains.kotlinx.dataframe.plugin.impl.api.Parse +import org.jetbrains.kotlinx.dataframe.plugin.impl.api.ParseDefault import org.jetbrains.kotlinx.dataframe.plugin.impl.api.PathOf import org.jetbrains.kotlinx.dataframe.plugin.impl.api.PerRowCol import org.jetbrains.kotlinx.dataframe.plugin.impl.api.Percentile0 @@ -298,7 +299,7 @@ import org.jetbrains.kotlinx.dataframe.plugin.impl.api.StringInvokeUntyped import org.jetbrains.kotlinx.dataframe.plugin.impl.api.StringNestedCol import org.jetbrains.kotlinx.dataframe.plugin.impl.api.StringNestedColUntyped -import org.jetbrains.kotlinx.dataframe.plugin.impl.api.StringParse +import org.jetbrains.kotlinx.dataframe.plugin.impl.api.ParseString import org.jetbrains.kotlinx.dataframe.plugin.impl.api.StringSelect import org.jetbrains.kotlinx.dataframe.plugin.impl.api.Sum0 import org.jetbrains.kotlinx.dataframe.plugin.impl.api.Sum1 @@ -691,7 +692,8 @@ "ColumnPathSelect" -> ColumnPathSelect() "PathOf" -> PathOf() "Parse" -> Parse() - "StringParse" -> StringParse() + "ParseString" -> ParseString() + "ParseDefault" -> ParseDefault() else -> if (isTest) error(this) else null } }
diff --git a/plugins/kotlin-dataframe/testData/box/parse.kt b/plugins/kotlin-dataframe/testData/box/parse.kt index e69de29..9d1d016 100644 --- a/plugins/kotlin-dataframe/testData/box/parse.kt +++ b/plugins/kotlin-dataframe/testData/box/parse.kt
@@ -0,0 +1,115 @@ +import org.jetbrains.kotlinx.dataframe.api.* +import org.jetbrains.kotlinx.dataframe.schema.* +import kotlin.reflect.typeOf + +fun box(): String { + val nestedDf = dataFrameOf( + "doubles" to columnOf("1.0", "2.4", null), + "chars" to columnOf('4', '5', '6'), + "nullableChars" to columnOf('7', null, '8'), + "nonParse" to columnOf(7, 8, 9), + ) + + val df = dataFrameOf( + "nonParse" to columnOf(1), + "booleans" to columnOf("true"), + "group" to columnOf( + "nonParse" to columnOf(1), + "url" to columnOf("https://example.com"), + "long" to columnOf("1234567890"), + "nested" to columnOf(nestedDf), + ), + ) + + df.parse().compileTimeSchema().columns.let { + assert(it["nonParse"]!!.type == typeOf<Int>()) + assert(it["booleans"]!!.type == typeOf<Any>()) + (it["group"] as ColumnSchema.Group).schema.columns.let { + assert(it["nonParse"]!!.type == typeOf<Int>()) + assert(it["url"]!!.type == typeOf<Any>()) + assert(it["long"]!!.type == typeOf<Any>()) + (it["nested"] as ColumnSchema.Frame).schema.columns.let { + assert(it["doubles"]!!.type == typeOf<Any?>()) + assert(it["chars"]!!.type == typeOf<Any>()) + assert(it["nullableChars"]!!.type == typeOf<Any?>()) + assert(it["nonParse"]!!.type == typeOf<Int>()) + } + } + } + df.parse(options = ParserOptions()).compileTimeSchema().columns.let { + assert(it["nonParse"]!!.type == typeOf<Int>()) + assert(it["booleans"]!!.type == typeOf<Any>()) + (it["group"] as ColumnSchema.Group).schema.columns.let { + assert(it["nonParse"]!!.type == typeOf<Int>()) + assert(it["url"]!!.type == typeOf<Any>()) + assert(it["long"]!!.type == typeOf<Any>()) + (it["nested"] as ColumnSchema.Frame).schema.columns.let { + assert(it["doubles"]!!.type == typeOf<Any?>()) + assert(it["chars"]!!.type == typeOf<Any>()) + assert(it["nullableChars"]!!.type == typeOf<Any?>()) + assert(it["nonParse"]!!.type == typeOf<Int>()) + } + } + } + df.parse { valueCols() }.compileTimeSchema().columns.let { + assert(it["nonParse"]!!.type == typeOf<Int>()) + 
assert(it["booleans"]!!.type == typeOf<Any>()) + (it["group"] as ColumnSchema.Group).schema.columns.let { + assert(it["nonParse"]!!.type == typeOf<Int>()) + assert(it["url"]!!.type == typeOf<String>()) + assert(it["long"]!!.type == typeOf<String>()) + (it["nested"] as ColumnSchema.Frame).schema.columns.let { + assert(it["doubles"]!!.type == typeOf<String?>()) + assert(it["chars"]!!.type == typeOf<Char>()) + assert(it["nullableChars"]!!.type == typeOf<Char?>()) + assert(it["nonParse"]!!.type == typeOf<Int>()) + } + } + } + df.parse(ParserOptions()) { booleans and group.nested }.compileTimeSchema().columns.let { + assert(it["nonParse"]!!.type == typeOf<Int>()) + assert(it["booleans"]!!.type == typeOf<Any>()) + (it["group"] as ColumnSchema.Group).schema.columns.let { + assert(it["nonParse"]!!.type == typeOf<Int>()) + assert(it["url"]!!.type == typeOf<String>()) + assert(it["long"]!!.type == typeOf<String>()) + (it["nested"] as ColumnSchema.Frame).schema.columns.let { + assert(it["doubles"]!!.type == typeOf<Any?>()) + assert(it["chars"]!!.type == typeOf<Any>()) + assert(it["nullableChars"]!!.type == typeOf<Any?>()) + assert(it["nonParse"]!!.type == typeOf<Int>()) + } + } + } + df.parse("booleans", "nonParse").compileTimeSchema().columns.let { + assert(it["nonParse"]!!.type == typeOf<Int>()) // nothing to parse + assert(it["booleans"]!!.type == typeOf<Any>()) + (it["group"] as ColumnSchema.Group).schema.columns.let { + assert(it["nonParse"]!!.type == typeOf<Int>()) + assert(it["url"]!!.type == typeOf<String>()) + assert(it["long"]!!.type == typeOf<String>()) + (it["nested"] as ColumnSchema.Frame).schema.columns.let { + assert(it["doubles"]!!.type == typeOf<String?>()) + assert(it["chars"]!!.type == typeOf<Char>()) + assert(it["nullableChars"]!!.type == typeOf<Char?>()) + assert(it["nonParse"]!!.type == typeOf<Int>()) + } + } + } + df.parse("group", options = ParserOptions()).compileTimeSchema().columns.let { + assert(it["nonParse"]!!.type == typeOf<Int>()) + 
assert(it["booleans"]!!.type == typeOf<String>()) + (it["group"] as ColumnSchema.Group).schema.columns.let { + assert(it["nonParse"]!!.type == typeOf<Int>()) + assert(it["url"]!!.type == typeOf<Any>()) + assert(it["long"]!!.type == typeOf<Any>()) + (it["nested"] as ColumnSchema.Frame).schema.columns.let { + assert(it["doubles"]!!.type == typeOf<Any?>()) + assert(it["chars"]!!.type == typeOf<Any>()) + assert(it["nullableChars"]!!.type == typeOf<Any?>()) + assert(it["nonParse"]!!.type == typeOf<Int>()) + } + } + } + return "OK" +}