PyData Berlin 2016
Edouard Fouché
IBM Research & Development

Out-of-core: Data does not fit in the main memory
Advantageous for several reasons:
Task: Compute the mean for each column given the class in the IRIS data set
SELECT AVG(PETALLENGTH), AVG(PETALWIDTH), AVG(SEPALLENGTH), AVG(SEPALWIDTH) FROM IRIS GROUP BY SPECIES
from ibmdbpy.sampledata import iris
iris.groupby("species").mean()
| sepal_length | sepal_width | petal_length | petal_width | |
|---|---|---|---|---|
| species | ||||
| setosa | 5.006 | 3.428 | 1.462 | 0.246 |
| versicolor | 5.936 | 2.770 | 4.260 | 1.326 |
| virginica | 6.588 | 2.974 | 5.552 | 2.026 |

However
We translate higher level syntax into SQL
We push them to the underlying database
Everything happens transparently

from ibmdbpy import IdaDataBase
idadb = IdaDataBase("DASHDB", verbose=True)
IdaDataBase instances are an abstraction layer for the connection to dashDB
from ibmdbpy import IdaDataFrame
iris = IdaDataFrame(idadb, "IRIS", indexer = "ID") # IRIS dataset
> SELECT distinct TABSCHEMA, TABNAME, OWNER, TYPE from SYSCAT.TABLES WHERE (OWNERTYPE = 'U') ORDER BY "TABSCHEMA","TABNAME" > SELECT TRIM(CURRENT_SCHEMA) FROM SYSIBM.SYSDUMMY1
iris.head()
> SELECT * FROM DB2INST1.IRIS ORDER BY "ID" ASC FETCH FIRST 5 ROWS ONLY
| ID | SEPALLENGTH | SEPALWIDTH | PETALLENGTH | PETALWIDTH | CLASS | |
|---|---|---|---|---|---|---|
| 0 | 1 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 2 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 3 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
iris.corr()
> SELECT COLNAME, TYPENAME FROM SYSCAT.COLUMNS WHERE TABNAME='IRIS' AND TABSCHEMA='DB2INST1' ORDER BY COLNO
> SELECT CORRELATION("SEPALLENGTH","SEPALWIDTH"), CORRELATION("SEPALLENGTH","PETALLENGTH"), CORRELATION("SEPALLENGTH","PETALWIDTH"), CORRELATION("SEPALWIDTH","PETALLENGTH"), CORRELATION("SEPALWIDTH","PETALWIDTH"), CORRELATION("PETALLENGTH","PETALWIDTH") FROM DB2INST1.IRIS
Execution time: 0.16952085494995117 seconds.
| SEPALLENGTH | SEPALWIDTH | PETALLENGTH | PETALWIDTH | |
|---|---|---|---|---|
| SEPALLENGTH | 1.000000 | -0.109369 | 0.871754 | 0.817954 |
| SEPALWIDTH | -0.109369 | 1.000000 | -0.420516 | -0.356544 |
| PETALLENGTH | 0.871754 | -0.420516 | 1.000000 | 0.962757 |
| PETALWIDTH | 0.817954 | -0.356544 | 0.962757 | 1.000000 |
iris.describe()
> SELECT COUNT("ID"), COUNT("SEPALLENGTH"), COUNT("SEPALWIDTH"), COUNT("PETALLENGTH"), COUNT("PETALWIDTH") FROM DB2INST1.IRIS
> SELECT AVG(CAST("ID" AS FLOAT)), AVG(CAST("SEPALLENGTH" AS FLOAT)), AVG(CAST("SEPALWIDTH" AS FLOAT)), AVG(CAST("PETALLENGTH" AS FLOAT)), AVG(CAST("PETALWIDTH" AS FLOAT)) FROM DB2INST1.IRIS
> SELECT COUNT("ID"), COUNT("SEPALLENGTH"), COUNT("SEPALWIDTH"), COUNT("PETALLENGTH"), COUNT("PETALWIDTH") FROM DB2INST1.IRIS
> SELECT STDDEV("ID")*(SQRT(150.0)/SQRT(149.0)), STDDEV("SEPALLENGTH")*(SQRT(150.0)/SQRT(149.0)), STDDEV("SEPALWIDTH")*(SQRT(150.0)/SQRT(149.0)), STDDEV("PETALLENGTH")*(SQRT(150.0)/SQRT(149.0)), STDDEV("PETALWIDTH")*(SQRT(150.0)/SQRT(149.0)) FROM DB2INST1.IRIS
> SELECT MIN("ID"), MIN("SEPALLENGTH"), MIN("SEPALWIDTH"), MIN("PETALLENGTH"), MIN("PETALWIDTH") FROM DB2INST1.IRIS
> SELECT * FROM (SELECT COUNT(*) AS "ID" FROM DB2INST1.IRIS WHERE "ID" IS NULL), (SELECT COUNT(*) AS "SEPALLENGTH" FROM DB2INST1.IRIS WHERE "SEPALLENGTH" IS NULL), (SELECT COUNT(*) AS "SEPALWIDTH" FROM DB2INST1.IRIS WHERE "SEPALWIDTH" IS NULL), (SELECT COUNT(*) AS "PETALLENGTH" FROM DB2INST1.IRIS WHERE "PETALLENGTH" IS NULL), (SELECT COUNT(*) AS "PETALWIDTH" FROM DB2INST1.IRIS WHERE "PETALWIDTH" IS NULL)
> SELECT CAST(COUNT(*) AS INTEGER) FROM DB2INST1.IRIS
> (SELECT "ID" AS "ID" FROM (SELECT ROW_NUMBER() OVER(ORDER BY "ID") as rn, "ID" FROM (SELECT * FROM DB2INST1.IRIS)) WHERE rn in(38,39,75,76,112,113))
> (SELECT "SEPALLENGTH" AS "SEPALLENGTH" FROM (SELECT ROW_NUMBER() OVER(ORDER BY "SEPALLENGTH") as rn, "SEPALLENGTH" FROM (SELECT * FROM DB2INST1.IRIS)) WHERE rn in(38,39,75,76,112,113))
> (SELECT "SEPALWIDTH" AS "SEPALWIDTH" FROM (SELECT ROW_NUMBER() OVER(ORDER BY "SEPALWIDTH") as rn, "SEPALWIDTH" FROM (SELECT * FROM DB2INST1.IRIS)) WHERE rn in(38,39,75,76,112,113))
> (SELECT "PETALLENGTH" AS "PETALLENGTH" FROM (SELECT ROW_NUMBER() OVER(ORDER BY "PETALLENGTH") as rn, "PETALLENGTH" FROM (SELECT * FROM DB2INST1.IRIS)) WHERE rn in(38,39,75,76,112,113))
> (SELECT "PETALWIDTH" AS "PETALWIDTH" FROM (SELECT ROW_NUMBER() OVER(ORDER BY "PETALWIDTH") as rn, "PETALWIDTH" FROM (SELECT * FROM DB2INST1.IRIS)) WHERE rn in(38,39,75,76,112,113))
> SELECT MAX("ID"), MAX("SEPALLENGTH"), MAX("SEPALWIDTH"), MAX("PETALLENGTH"), MAX("PETALWIDTH") FROM DB2INST1.IRIS
Execution time: 1.8747379779815674 seconds.
| ID | SEPALLENGTH | SEPALWIDTH | PETALLENGTH | PETALWIDTH | |
|---|---|---|---|---|---|
| count | 150.000000 | 150.000000 | 150.000000 | 150.000000 | 150.000000 |
| mean | 75.500000 | 5.843333 | 3.054000 | 3.758667 | 1.198667 |
| std | 43.445368 | 0.828066 | 0.433594 | 1.764420 | 0.763161 |
| min | 1.000000 | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
| 25% | 38.500000 | 5.100000 | 2.800000 | 1.600000 | 0.300000 |
| 50% | 75.500000 | 5.800000 | 3.000000 | 4.350000 | 1.300000 |
| 75% | 112.500000 | 6.400000 | 3.300000 | 5.100000 | 1.800000 |
| max | 150.000000 | 7.900000 | 4.400000 | 6.900000 | 2.500000 |
iris = iris[["ID","SEPALLENGTH", "SEPALWIDTH"]]
iris['new'] = iris['SEPALLENGTH'] + iris['SEPALWIDTH'].mean()
> CREATE VIEW "TEMP_VIEW_IRIS_99527_1463749353" AS (SELECT "ID","SEPALLENGTH","SEPALWIDTH" FROM DB2INST1.IRIS)
<< AUTOCOMMIT >>
> DROP VIEW "TEMP_VIEW_IRIS_99527_1463749353"
<< AUTOCOMMIT >>
> SELECT AVG(CAST("SEPALWIDTH" AS FLOAT)) FROM (SELECT "SEPALWIDTH" FROM DB2INST1.IRIS)
Execution time: 0.14851880073547363 seconds.
iris.head()
> CREATE VIEW "TEMP_VIEW_IRIS_42406_1463749360" AS (SELECT "ID","SEPALLENGTH","SEPALWIDTH",("SEPALLENGTH" + 3.054) AS "new" FROM DB2INST1.IRIS)
<< AUTOCOMMIT >>
> DROP VIEW "TEMP_VIEW_IRIS_42406_1463749360"
<< AUTOCOMMIT >>
> SELECT * FROM (SELECT "ID","SEPALLENGTH","SEPALWIDTH",("SEPALLENGTH" + 3.054) AS "new" FROM DB2INST1.IRIS) ORDER BY "ID" ASC FETCH FIRST 5 ROWS ONLY
| ID | SEPALLENGTH | SEPALWIDTH | new | |
|---|---|---|---|---|
| 0 | 1 | 5.1 | 3.5 | 8.154 |
| 1 | 2 | 4.9 | 3.0 | 7.954 |
| 2 | 3 | 4.7 | 3.2 | 7.754 |
| 3 | 4 | 4.6 | 3.1 | 7.654 |
| 4 | 5 | 5.0 | 3.6 | 8.054 |
iris
<ibmdbpy.frame.IdaDataFrame at 0xa3d91d0>
iris.print()
SELECT "ID","SEPALLENGTH","SEPALWIDTH",("SEPALLENGTH" + 3.054) AS "new" FROM DB2INST1.IRIS
from ibmdbpy.learn import KMeans
kmeans = KMeans(3) # clustering with 3 clusters
iris = IdaDataFrame(idadb, "IRIS", indexer = "ID")
kmeans.fit_predict(iris).head()
> SELECT COUNT(*) FROM SYSCAT.ROUTINES WHERE ROUTINENAME='KMEANS' AND ROUTINEMODULENAME = 'IDAX'
> CALL IDAX.KMEANS ('idbased=False,model="KMEANS_31778_1463749495",k=3,distance="euclidean",randseed=12345,maxiter=5,intable=IRIS,id="ID"')
<< AUTOCOMMIT >>
<< COMMIT >>
> SELECT * FROM "DB2INST1"."KMEANS_31778_1463749495_MODEL"
> SELECT * FROM "DB2INST1"."KMEANS_31778_1463749495_COLUMNS"
> SELECT * FROM "DB2INST1"."KMEANS_31778_1463749495_COLUMN_STATISTICS"
> SELECT * FROM "DB2INST1"."KMEANS_31778_1463749495_CLUSTERS"
> CALL IDAX.PREDICT_KMEANS ('model="KMEANS_31778_1463749495",outtable="PREDICT_KMEANS_15239_1463749505",intable=IRIS,id="ID"')
<< AUTOCOMMIT >>
<< COMMIT >>
> SELECT distinct TABSCHEMA, TABNAME, OWNER, TYPE from SYSCAT.TABLES WHERE (OWNERTYPE = 'U') ORDER BY "TABSCHEMA","TABNAME"
> SELECT * FROM DB2INST1.PREDICT_KMEANS_15239_1463749505 ORDER BY "ID" ASC FETCH FIRST 5 ROWS ONLY
| ID | CLUSTER_ID | DISTANCE | |
|---|---|---|---|
| 0 | 1 | 1 | 0.332119 |
| 1 | 2 | 2 | 0.228125 |
| 2 | 3 | 2 | 0.138505 |
| 3 | 4 | 2 | 0.141902 |
| 4 | 5 | 1 | 0.382005 |
kmeans.describe()
KMeans clustering with 3 clusters of sizes 21, 32, 97
Cluster means:
CLUSTERID SEPALLENGTH SEPALWIDTH PETALLENGTH PETALWIDTH CLASS
0 1 5.209375 3.531250 1.671875 0.353125 setosa
1 2 4.695238 3.100000 1.395238 0.190476 setosa
2 3 6.301031 2.886598 4.958763 1.695876 virginica
Within cluster sum of squares by cluster:
[ 2.75907407 22.02570934 170.84912849]
> CALL IDAX.PRINT_MODEL ('model="KMEANS_31778_1463749495"')
<< AUTOCOMMIT >>
iris = IdaDataFrame(idadb, "IRIS_DISC", indexer="ID")
from ibmdbpy.feature_selection import info_gain
info_gain(iris)
> SELECT CAST(COUNT(*) AS INTEGER) FROM DB2INST1.IRIS_DISC > SELECT SUM(-a*LOG(a)) FROM(SELECT COUNT(*) AS a FROM DB2INST1.IRIS_DISC GROUP BY "SEPALLENGTH") > SELECT SUM(-a*LOG(a)) FROM(SELECT COUNT(*) AS a FROM DB2INST1.IRIS_DISC GROUP BY "SEPALWIDTH") > SELECT SUM(-a*LOG(a)) FROM(SELECT COUNT(*) AS a FROM DB2INST1.IRIS_DISC GROUP BY "SEPALLENGTH","SEPALWIDTH") > SELECT SUM(-a*LOG(a)) FROM(SELECT COUNT(*) AS a FROM DB2INST1.IRIS_DISC GROUP BY "PETALLENGTH") > SELECT SUM(-a*LOG(a)) FROM(SELECT COUNT(*) AS a FROM DB2INST1.IRIS_DISC GROUP BY "SEPALLENGTH","PETALLENGTH") > SELECT SUM(-a*LOG(a)) FROM(SELECT COUNT(*) AS a FROM DB2INST1.IRIS_DISC GROUP BY "PETALWIDTH") > SELECT SUM(-a*LOG(a)) FROM(SELECT COUNT(*) AS a FROM DB2INST1.IRIS_DISC GROUP BY "SEPALLENGTH","PETALWIDTH") > SELECT SUM(-a*LOG(a)) FROM(SELECT COUNT(*) AS a FROM DB2INST1.IRIS_DISC GROUP BY "CLASS") > SELECT SUM(-a*LOG(a)) FROM(SELECT COUNT(*) AS a FROM DB2INST1.IRIS_DISC GROUP BY "SEPALLENGTH","CLASS") > SELECT SUM(-a*LOG(a)) FROM(SELECT COUNT(*) AS a FROM DB2INST1.IRIS_DISC GROUP BY "SEPALWIDTH","PETALLENGTH") > SELECT SUM(-a*LOG(a)) FROM(SELECT COUNT(*) AS a FROM DB2INST1.IRIS_DISC GROUP BY "SEPALWIDTH","PETALWIDTH") > SELECT SUM(-a*LOG(a)) FROM(SELECT COUNT(*) AS a FROM DB2INST1.IRIS_DISC GROUP BY "SEPALWIDTH","CLASS") > SELECT SUM(-a*LOG(a)) FROM(SELECT COUNT(*) AS a FROM DB2INST1.IRIS_DISC GROUP BY "PETALLENGTH","PETALWIDTH") > SELECT SUM(-a*LOG(a)) FROM(SELECT COUNT(*) AS a FROM DB2INST1.IRIS_DISC GROUP BY "PETALLENGTH","CLASS") > SELECT SUM(-a*LOG(a)) FROM(SELECT COUNT(*) AS a FROM DB2INST1.IRIS_DISC GROUP BY "PETALWIDTH","CLASS")
| SEPALLENGTH | SEPALWIDTH | PETALLENGTH | PETALWIDTH | CLASS | |
|---|---|---|---|---|---|
| SEPALLENGTH | NaN | 0.163915 | 0.708674 | 0.675541 | 0.652284 |
| SEPALWIDTH | 0.163915 | NaN | 0.381823 | 0.397607 | 0.376050 |
| PETALLENGTH | 0.708674 | 0.381823 | NaN | 1.346137 | 1.356545 |
| PETALWIDTH | 0.675541 | 0.397607 | 1.346137 | NaN | 1.378403 |
| CLASS | 0.652284 | 0.376050 | 1.356545 | 1.378403 | NaN |
pip install ibmdbpy
Any questions, suggestions ?