Arjen Markus (4 december 2018) I was inspired to write the program below when using the Python extension pandas during a workshop on data science. My first implementation was a Tcl-only program, but I quickly turned to tarray as the underlying library, simply because it makes the various manipulations much easier to implement. I have been contemplating a similar implementation using SQLite, though that would probably put some strain on my knowledge of SQL ;).
Of course the code below is merely a humble beginning - it is nowhere near the feature richness that would be needed for a "true" package for data manipulation, but I think it is a good start. What is more, the data file I have been using to develop and mildly test this program, contains 54000 rows and the time it takes 1.3 seconds - most of which is spent on the two selection steps at the end. Without these two it take 0.3 seconds. Not all that bad, I'd say.
Note that the heavy duty work is all done by tarray and perhaps even more can be done by this library or its companion, xtal. So the program below, the class that is defined, is a front-end mostly.
Some caveats:
# dataobj_tarray.tcl -- # Class to hold and manipulate columnar data # (Inspired by Python's pandas. The underlying implementation # is based on Tarray) # # Issues: # - "join [::tarray::table get -list $tableValue $r] \t" does not produce a string # containing tabs - is a proper list returned by [table get -list]? # - index to [table get] cannot be "end" # - an empty value for a column of type double causes an error # # To do: # - add or delete a column # - correlation between two columns # - correlation matrix # lappend auto_path ../tarray package require tarray oo::class create dataObj { variable tableValue ;# tarray table stores names and values constructor {} { variable tableValue set tableValue {} } method readcsv {csvfile} { variable column variable tableValue set infile [open $csvfile] # # Read the first two lines: # - The first line contains the column names # - The second line is used to guess the type of values # (not a foolproof method, but useable as a simple heuristic) # gets $infile line set columnNames [split $line ,] gets $infile line set firstValues [split $line ,] set tableDefinition {} foreach c $columnNames f $firstValues { lappend tableDefinition $c if { [string is integer $f] } { #lappend tableDefinition wide -- the heuristic failed on dirty_diamonds.csv lappend tableDefinition double } elseif { [string is double $f] } { lappend tableDefinition double } else { lappend tableDefinition string } } set tableValue [::tarray::table create $tableDefinition] while { [gets $infile line] >= 0 } { if { [catch { ::tarray::table vfill tableValue [split $line ,] end+1 }] } { set inf [expr {1.0/0.0}] set row {} foreach {column type} $tableDefinition v [split $line ,] { if { $v eq "" && $type eq "double" } { lappend row $inf } else { lappend row $v } } ::tarray::table vfill tableValue $row end+1 } } } method print {{number {}}} { variable tableValue puts [join [::tarray::table cnames $tableValue] \t] if { $number eq {} } { set rows [::tarray::table size $tableValue] } else { set rows $number } for {set r 0} {$r < $rows} {incr r} { puts [join [::tarray::table index $tableValue $r] \t] } } # Return the unique category names method categories {columnName} { variable tableValue return [lsort -unique \ [::tarray::column range -list [::tarray::table column $tableValue $columnName] 0 end]] } # Return the extremes for a numerical column method extremes {columnName} { variable tableValue return [::tarray::column minmax [::tarray::column search -inline -all -lt \ [::tarray::table column $tableValue $columnName] Inf]] } # Return a histogram method histogram {columnName number {min {}} {max {}}} { variable tableValue set options {} if { $min ne {} } { set options "-min $min" } if { $max ne {} } { set options "$options -max $max" } set histogram [::tarray::column histogram {*}$options [::tarray::column search -inline -all -lt \ [::tarray::table column $tableValue $columnName] Inf] $number] set overview {} for {set row 0} {$row < $number} {incr row} { lappend overview [::tarray::table index $histogram $row] } return $overview } method normalise {columnName} { set column [::tarray::table column $tableValue $columnName] set column1 [::tarray::column search -inline -all -lt $column Inf] set sum1 [::tarray::column sum $column1] set sum2 [::tarray::column sum [::tarray::column math * $column1 $column1]] set offset [expr {$sum1 / double([::tarray::column size $column1])}] set scale [expr {sqrt($sum2 / ([::tarray::column size $column1]-1.0))}] set column [::tarray::column math / [::tarray::column math - $column $offset] $scale] ::tarray::table vcolumn tableValue $columnName $column } method normScaled {columnName} { set column [::tarray::table column $tableValue $columnName] set column1 [::tarray::column search -inline -all -lt $column Inf] lassign [::tarray::column minmax $column1] min max set offset [expr {$min}] set scale [expr {$max - $min}] set column [::tarray::column math / [::tarray::column math - $column $offset] $scale] ::tarray::table vcolumn tableValue $columnName $column } method quantile {columnName quant} { set column [::tarray::table column $tableValue $columnName] set column1 [::tarray::column search -inline -all -lt $column Inf] set qindex [expr { int($quant/100.0 * [::tarray::column size $column1]) }] return [::tarray::column get -list [::tarray::column sort $column1] $qindex] } method copy {{newname {}}} { if { $newname eq {} } { return [oo::copy [self object]] } else { return [oo::copy [self object] $newname] } } method info {} { set columns {} foreach column [::tarray::table cnames $tableValue] { lappend columns [list $column [::tarray::table ctype $tableValue $column]] } set info [dict create "rows" [::tarray::table size $tableValue] "columns" $columns] return $info } # # tarray does not support "greater-equal" or "lower-equal" method selectrows {columnName operation limit} { switch -- $operation { "equals" - "-eq" - "==" { set operation "-not" } "notequal" - "-ne" - "-not" - "!=" { set operation "-eq" } "greater" - "-gt" - ">" { set operation "-lt" ;# should have been -le } "greater-equal" - "-ge" - ">=" { set operation "-lt" } "lower" - "-lt" - "<" { set operation "-gt" ;# should have been -ge } "lower-equal" - "-le" - "<=" { set operation "-gt" } "between" { set operation "between" } default { return -code error "Unknown operation: $operation" } } if { $operation != "between"} { set indices [::tarray::column search -all $operation [::tarray::table column $tableValue $columnName] $limit] ::tarray::table vdelete tableValue $indices } else { lassign $limit min max set indices [::tarray::column search -all -lt [::tarray::table column $tableValue $columnName] $min] ::tarray::table vdelete tableValue $indices set indices [::tarray::column search -all -gt [::tarray::table column $tableValue $columnName] $max] ::tarray::table vdelete tableValue $indices } } } dataObj create myData #myData readcsv "testje.csv" myData readcsv "dirty_diamonds.csv" ;#-- does not work because of empty value myData print 100 puts "Values for \"cut\": [myData categories "cut"]" puts "Values for \"clarity\": [myData categories "clarity"]" puts "50%: [myData quantile price 50]" puts "Extremes: [myData extremes "price"]" puts "Histogram: [myData histogram "price" 10]" puts "Histogram: [myData histogram "price" 10 100 2000]" puts "Normalise by standard deviation:" myData normalise price puts "Extremes: [myData extremes "price"]" puts "Normalise by min/max:" myData normScaled price puts "Extremes: [myData extremes "price"]" # Copy the object set newData [myData copy] puts "Extremes newData: [$newData extremes "price"]" puts "Information on the table:" set info [$newData info] foreach key [dict keys $info] { puts "$key: [join [dict get $info $key] \n]" } # Select all rows where "cut" is ideal ... #myData selectrows cut equals "Ideal" puts "Rows with cut = Ideal: [dict get [myData info] rows]" #myData selectrows carat greater 0.5 puts "Rows with cut = Ideal and carat > 0.5: [dict get [myData info] rows]"
The data file that I refer to looks like this:
no,carat,cut,color,clarity,depth,table,price,x,y,z 1,0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43 2,0.21,Premium,E,SI1,59.8,61,326,3.89,3.84,2.31 3,0.23,n/a,E,VS1,56.9,65,327,4.05,4.07,2.31 4,0.29,,I,VS2,62.4,58,334,4.2,4.23,2.63 5,0.31,Good,J,SI2,63.3,58,335,4.34,4.35,2.75 6,0.24,Very Good,J,VVS2,62.8,57,336,3.94,3.96,2.48 7,0.24,Very Good,I,VVS1,62.3,57,336,3.95,3.98,2.47 8,0.26,Very Good,H,SI1,61.9,55,337,4.07,4.11,2.53 9,0.22,Fair,E,VS2,65.1,61,337,3.87,3.78,2.49 ...
The output from the program, using the complete file is:
no carat cut color clarity depth table price x y z 2.0 0.21 Premium E SI1 59.8 61.0 326.0 3.89 3.84 2.31 3.0 0.23 n/a E VS1 56.9 65.0 327.0 4.05 4.07 2.31 4.0 0.29 I VS2 62.4 58.0 334.0 4.2 4.23 2.63 5.0 0.31 Good J SI2 63.3 58.0 335.0 4.34 4.35 2.75 6.0 0.24 Very Good J VVS2 62.8 57.0 336.0 3.94 3.96 2.48 7.0 0.24 Very Good I VVS1 62.3 57.0 336.0 3.95 3.98 2.47 8.0 0.26 Very Good H SI1 61.9 55.0 337.0 4.07 4.11 2.53 9.0 0.22 Fair E VS2 65.1 61.0 337.0 3.87 3.78 2.49 10.0 0.23 Very Good H VS1 59.4 61.0 338.0 4.0 4.05 2.39 11.0 0.3 Good J SI1 64.0 55.0 339.0 4.25 4.28 2.73 12.0 0.23 Ideal J VS1 62.8 56.0 340.0 3.93 3.9 2.46 13.0 0.22 Premium F SI1 60.4 61.0 342.0 3.88 3.84 2.33 14.0 0.31 Ideal J SI2 62.2 54.0 344.0 4.35 4.37 2.71 15.0 0.2 Premium E SI2 60.2 62.0 345.0 3.79 3.75 2.27 16.0 0.32 Premium E I1 60.9 58.0 345.0 4.38 4.42 2.68 17.0 0.3 Ideal I SI2 62.0 54.0 348.0 4.31 4.34 2.68 18.0 0.3 Good J SI1 63.4 54.0 351.0 4.23 4.29 2.7 19.0 0.3 Good J SI1 63.8 56.0 351.0 4.23 4.26 2.71 20.0 0.3 Very Good J SI1 62.7 59.0 351.0 4.21 4.27 2.66 21.0 0.3 Good I SI2 63.3 56.0 351.0 4.26 4.3 2.71 22.0 0.23 Very Good E VS2 63.8 55.0 352.0 3.85 3.92 2.48 23.0 0.23 Very Good H VS1 61.0 57.0 353.0 3.94 3.96 2.41 24.0 0.31 Very Good J SI1 59.4 62.0 353.0 4.39 4.43 2.62 25.0 0.31 Very Good J SI1 58.1 62.0 353.0 4.44 4.47 2.59 26.0 0.23 Very Good G VVS2 60.4 58.0 354.0 3.97 4.01 2.41 27.0 0.24 Premium I VS1 62.5 57.0 355.0 3.97 3.94 2.47 28.0 0.3 Very Good J VS2 62.2 57.0 357.0 4.28 4.3 2.67 29.0 0.23 Very Good D VS2 60.5 61.0 357.0 3.96 3.97 2.4 30.0 0.23 Very Good F VS1 60.9 57.0 357.0 3.96 3.99 2.42 31.0 0.23 Good F VS1 60.0 57.0 402.0 4.0 4.03 2.41 32.0 0.23 Very Good F VS1 59.8 57.0 402.0 4.04 4.06 2.42 33.0 0.23 Very Good E VS1 60.7 59.0 402.0 3.97 4.01 2.42 34.0 0.23 Very Good E VS1 59.5 58.0 402.0 4.01 4.06 2.4 35.0 0.23 Very Good D VS1 61.9 58.0 402.0 3.92 3.96 2.44 36.0 0.23 Good F VS1 58.2 59.0 402.0 4.06 4.08 2.37 37.0 0.23 Good E VS1 64.1 59.0 402.0 3.83 3.85 2.46 38.0 0.31 Good H SI1 64.0 54.0 402.0 4.29 4.31 2.75 39.0 0.26 Very Good D VS2 60.8 59.0 403.0 4.13 4.16 2.52 40.0 0.33 Ideal I SI2 61.8 55.0 403.0 4.49 4.51 2.78 41.0 0.33 Ideal I SI2 61.2 56.0 403.0 4.49 4.5 2.75 42.0 0.33 Ideal J SI1 61.1 56.0 403.0 4.49 4.55 2.76 43.0 0.26 Good D VS2 65.2 56.0 403.0 3.99 4.02 2.61 44.0 0.26 Good D VS1 58.4 63.0 403.0 4.19 4.24 2.46 45.0 0.32 Good H SI2 63.1 56.0 403.0 4.34 4.37 2.75 46.0 0.29 Premium F SI1 62.4 58.0 403.0 4.24 4.26 2.65 47.0 0.32 Very Good H SI2 61.8 55.0 403.0 4.35 4.42 2.71 48.0 0.32 Good H SI2 63.8 56.0 403.0 4.36 4.38 2.79 49.0 0.25 Very Good E VS2 63.3 60.0 404.0 4.0 4.03 2.54 50.0 0.29 Very Good H SI2 60.7 60.0 404.0 4.33 4.37 2.64 51.0 0.24 Very Good F SI1 60.9 61.0 404.0 4.02 4.03 2.45 52.0 0.23 Ideal G VS1 61.9 54.0 404.0 3.93 3.95 2.44 53.0 0.32 Ideal I SI1 60.9 55.0 404.0 4.45 4.48 2.72 54.0 0.22 Premium E VS2 61.6 58.0 404.0 3.93 3.89 2.41 55.0 0.22 Premium D VS2 59.3 62.0 404.0 3.91 3.88 2.31 56.0 0.3 Ideal I SI2 61.0 59.0 405.0 4.3 4.33 2.63 57.0 0.3 Premium J SI2 59.3 61.0 405.0 4.43 4.38 2.61 58.0 0.3 Very Good I SI1 62.6 57.0 405.0 4.25 4.28 2.67 59.0 0.3 Very Good I SI1 63.0 57.0 405.0 4.28 4.32 2.71 60.0 0.3 Good I SI1 63.2 55.0 405.0 4.25 4.29 2.7 61.0 0.35 Ideal I VS1 60.9 57.0 552.0 4.54 4.59 2.78 62.0 0.3 Premium D SI1 62.6 59.0 552.0 4.23 4.27 2.66 63.0 0.3 Ideal D SI1 62.5 57.0 552.0 4.29 4.32 2.69 64.0 0.3 Ideal D SI1 62.1 56.0 552.0 4.3 4.33 2.68 65.0 0.42 Premium I SI2 61.5 59.0 552.0 4.78 4.84 2.96 66.0 0.28 Ideal G VVS2 61.4 56.0 553.0 4.19 4.22 2.58 67.0 0.32 Ideal I VVS1 62.0 55.3 553.0 4.39 4.42 2.73 68.0 0.31 Very Good G SI1 63.3 57.0 553.0 4.33 4.3 2.73 69.0 0.31 Premium G SI1 61.8 58.0 553.0 4.35 4.32 2.68 70.0 0.24 Premium E VVS1 60.7 58.0 553.0 4.01 4.03 2.44 71.0 0.24 Very Good D VVS1 61.5 60.0 553.0 3.97 4.0 2.45 72.0 0.3 Very Good H SI1 63.1 56.0 554.0 4.29 4.27 2.7 73.0 0.3 Premium H SI1 62.9 59.0 554.0 4.28 4.24 2.68 74.0 0.3 Premium H SI1 62.5 57.0 554.0 4.29 4.25 2.67 75.0 0.3 Good H SI1 63.7 57.0 554.0 4.28 4.26 2.72 76.0 0.26 Very Good F VVS2 59.2 60.0 554.0 4.19 4.22 2.49 77.0 0.26 Very Good E VVS2 59.9 58.0 554.0 4.15 4.23 2.51 78.0 0.26 Very Good D VVS2 62.4 54.0 554.0 4.08 4.13 2.56 79.0 0.26 Very Good D VVS2 62.8 60.0 554.0 4.01 4.05 2.53 80.0 0.26 Very Good E VVS1 62.6 59.0 554.0 4.06 4.09 2.55 81.0 0.26 Very Good E VVS1 63.4 59.0 554.0 4.0 4.04 2.55 82.0 0.26 Very Good D VVS1 62.1 60.0 554.0 4.03 4.12 2.53 83.0 0.26 Ideal E VVS2 62.9 58.0 554.0 4.02 4.06 2.54 84.0 0.38 Ideal I SI2 61.6 56.0 554.0 4.65 4.67 2.87 85.0 0.26 Good E VVS1 57.9 60.0 554.0 4.22 4.25 2.45 86.0 0.24 Premium G VVS1 62.3 59.0 554.0 3.95 3.92 2.45 87.0 0.24 Premium H VVS1 61.2 58.0 554.0 4.01 3.96 2.44 88.0 0.24 Premium H VVS1 60.8 59.0 554.0 4.02 4.0 2.44 89.0 0.24 Premium H VVS2 60.7 58.0 554.0 4.07 4.04 2.46 90.0 0.32 Premium I SI1 62.9 58.0 554.0 4.35 4.33 2.73 91.0 0.7 Ideal E SI1 62.5 57.0 2757.0 5.7 5.72 3.57 92.0 0.86 Fair E SI2 55.1 69.0 2757.0 6.45 6.33 3.52 93.0 0.7 Ideal G VS2 61.6 56.0 2757.0 5.7 5.67 3.5 94.0 0.71 Very Good E VS2 62.4 57.0 2759.0 5.68 5.73 3.56 95.0 0.78 Very Good G SI2 63.8 56.0 2759.0 5.81 5.85 3.72 96.0 0.7 Good E VS2 57.5 58.0 2759.0 5.85 5.9 3.38 97.0 0.7 Good F VS1 59.4 62.0 2759.0 5.71 5.76 3.4 98.0 0.96 Fair F SI2 66.3 62.0 2759.0 6.27 5.95 4.07 99.0 0.73 Very Good E SI1 61.6 59.0 2760.0 5.77 5.78 3.56 100.0 0.8 Premium H SI1 61.5 58.0 2760.0 5.97 5.93 3.66 101.0 0.75 Very Good D SI1 63.2 56.0 2760.0 5.8 5.75 3.65 Values for "cut": {} Fair Goed Good Ideal Premium {Very Good} n/a Values for "clarity": I1 IF SI1 SI2 VS1 VS2 VVS1 VVS2 50%: 2401.0 Extremes: 326.0 18823.0 Histogram: {326.0 25334} {2175.7 9328} {4025.4 7393} {5875.1 3878} {7724.8 2364} {9574.5 1745} {11424.2 1306} {13273.9 1002} {15123.6 863} {16973.3 726} Histogram: {100.0 0} {290.0 1368} {480.0 4672} {670.0 5332} {860.0 4069} {1050.0 2525} {1240.0 1742} {1430.0 988} {1620.0 1900} {1810.0 1610} Normalise by standard deviation: Extremes: -0.6438430557412449 2.657960519401242 Normalise by min/max: Extremes: 0.0 1.0 Extremes newData: 0.0 1.0 Information on the table: rows: 53939 columns: no double carat double cut string color string clarity string depth double table double price double x double y double z double Rows with cut = Ideal: 53939 Rows with cut = Ideal and carat > 0.5: 53939