Data Management Quick Comparison 3

The following article provides a comparison between R/S-Plus and Python for basic data management and manipulation. 

Load Data

R/S-Plus

mydata <- read.csv("c:/customer.csv") 
mydata_lkup <- read.csv("c:/purchaseorder.csv") 

Python

 
import csv 
custDS=[] 
for row in csv.reader(open('c:/customer.csv'), delimiter=',', quotechar='"'): 
    custDS.append(row) 
poDS=[] 
for row in csv.reader(open('c:/purchaseorder.csv'), delimiter=',', quotechar='"'): 
    poDS.append(row)

Selecting- All Fields

R/S-Plus

 mydata 

Python

print(custDS)

One Field

R/S-Plus

  
 mydata$col1 

Python

  
for x in custDS:
    print x[1] 

Subset

R/S-Plus

 
subset(mydata, col2 == 18 )

Python

[x for x in custDS if x[2]=='18']

Sorting

R/S-Plus

 
mydata [order(mydata [,1]),] 

Python

 
sorted(custDS, key=lambda customer: customer[2])

Join

R/S-Plus

 
merge (mydata_2, mydata_lkup, by.x = col1 , by.y = col2 , all = TRUE ) 

Python

 
poDt={}
for row in csv.reader(open('c:/purchaseorder.csv'), delimiter=',', quotechar='"'):
    poDt[row[0]] = row[1:4]dsOut=[]
for x in custDS:
    if x[0] in poDt:
        x.extend(poDt[x[0]] )
        print(x)

 

Sample

R/S-Plus

head(mydata , 10)

Python

for i in [0,1,2]:
    print(poDS[i])

Aggregate Analysis

R/S-Plus

xtabs( ~ col2, mydata_lkup) 

Python

poCounts = {}
for row in poDS_sorted: 
    poCounts[row[1]] = poCounts.get(row[1],0) + 1
print(poCounts)

Unique

R/S-Plus

unique(mydata_lkup$col2)

Python

uniqpoList=[] [uniqpoList.append(i[1]) for i in poDS if not uniqpoList.count(i[1])] uniqpoList

The S-Language


The S programming language of statistical programming language was developed  Bell laboratories specifically for statistical modeling. There are two versions of  S.  One was developed by insightful under the name S-Plus.  The other is an open-source initiative called R.  S allows you to create objects and is very extendable and has power graphing capabilities.

Tips
Tip 1

Set Memory Size

memory.size(max = TRUE)
Tip 2

Today’s Date

Today <- format(Sys.Date(), %d %b %Y )
Tip 3

Set Working Directory

setwd( C:// )
Tip 4

Load In Data

ExampleData.path    <- file.path(getwd(), USDemographics.CSV ) 
ExampleData.FullSet  <- read.table( ExampleData.path, header=TRUE, sep= , , na.strings= NA , dec= . , strip.white=TRUE)
Tip 5

Split Data

ExampleData.Nrows <-  nrow(ExampleData.FullSet) ExampleData.NCol= ncol(ExampleData.FullSet) 
ExampleData.SampleSize <- ExampleData.Nrows /2
ExampleData.Sample <- sample(nrow(ExampleData.FullSet ),size = ExampleData.SampleSize ,
replace=FALSE, prob = NULL )
ExampleData.HoldBack  <- ExampleData.FullSet[ExampleData.Sample, c(5,1:ExampleData.NCol)]
ExampleData.Run   <- ExampleData.FullSet[-ExampleData.Sample, c(5,1:ExampleData.NCol)  ]
Tip 6

Create Function

Confusion <- function(a, b){
                  tbl <- table(a, b)
                  mis <- 1 - sum(diag(tbl))/sum(tbl)
                  list(table = tbl, misclass.prob = mis)
                   }
Tip 7

Recode Fields

ExampleData.FullSet$Savings 
ExampleData.FullSet$SavingsCat <- recode(ExampleData.FullSet$Savings, 
, -40000.00:-100.00 = HighNeg ; -100.00:-50.00  = MedNeg ; -50.00:10.00 = LowNeg ; 10.00:50.00 = Low ; 50.00:100.00 = Med ; 100.00:1000.00 = High ;;;  , as.factor.result=TRUE)
Tip 8

Summarize Data

Summary(ExampleData.FullSet)
Tip 9

Save output

save.image(file = c:/test.RData , version = NULL, ascii = FALSE,  compress = FALSE, safe = TRUE)
Tip 10

Subset

MyData.SubSample <- subset(MyData.Full, MyField ==0)
Tip 11

Remove Object From Memory

remove(list = c(‘MyObject’));
Tip  12

Create a Dataframe

TmpOuput <- data.frame ( Fields = c( Field1 , ‘Field2 , ‘Field3’),  Values   = c( 1 , 2 ,  2  ) )
Tip 13

Cut

data(swiss)
x <- swiss$Education  
swiss$Educated= cut(x, breaks=c(0, 11, 999), labels=c( 0 , 1 ))
Tip 14

Create Directories

dir.create( c:/MyProjects )

R/Splus

The S language’s power is not its data management capability nor is data management the intent of the S language.  However, often times when evaluating the output of a model you may need to perform basic data management with R/SPlus and you will find the S language acceptable in this role.  The commands will seem more similar to Unix/Linux than SQL. However, the S language has many of the benefits of Unix/Linux (a concise language for data management) while being more data centric (allowing meta data for dataframes which includes column names).
Example:
Customer
CustomerID Name Age
1 Joe 23
2 Mika 45
3 Lin 34
4 Sara 56
5 Susan 18
PurchaseOrder
POID CustomerID Purchase
1 3 Fiction
2 1 Biography
3 1 Fiction
4 2 Biography
5 3 Fiction
6 4 Fiction
 
     
 

SELECT

customer
CustomerID Name Age
1 Joe 23
2 Mika 45
3 Lin 34
4 Sara 56
5 Susan 18

ORDER BY

customer[order(customer[,2]),]
CustomerID Name Age
5 Susan 18
1 Joe 23
3 Lin 34
2 Mika 45
4 Sara 56

WHERE

subset(customer, custid == 5 )
CustomerID Name Age
5 Susan 18

INNER JOIN

merge (purchaseorder, customer, by.x = custid , by.y = custid , all = FALSE   )
CustomerID Name Age POID Purchase
1 Joe 23 2 Biography
1 Joe 23 3 Fiction
2 Mika 45 4 Biography
3 Lin 34 1 Fiction
3 Lin 34 5 Fiction
4 Sara 56 6 Fiction

LEFT OUTER JOIN

merge (purchaseorder, customer, by.x = custid , by.y = custid , all = TRUE   )
CustomerID Name Age POID Purchase
1 Joe 23 2 Biography
1 Joe 23 3 Fiction
2 Mika 45 4 Biography
3 Lin 34 1 Fiction
3 Lin 34 5 Fiction
4 Sara 56 6 Fiction
5 Susan 18 NULL NULL

GROUP BY

Cust_sum <- merge (purchaseorder, customer, by.x = custid , by.y = custid , all = FALSE   ) xtabs( ~ fname, cust_sum)
Joe Lin Mika Sara
2 2 1 1

UPDATE

customer[1,]$age <-23
 
CustomerID Name Age
1 Joe 26
2 Mika 45
3 Lin 34
4 Sara 56
5 Susan 18

INSERT

newcust <- data.frame(custid = 6, fname = Terry , age =50) rbind(newcust,customer)
 
CustomerID Name Age
1 Joe 23
2 Mika 45
3 Lin 34
4 Sara 56
5 Susan 18
6 Terry 50

DELETE

subset(customer, custid != 1 )
 
CustomerID Name Age
2 Mika 45
3 Lin 34
4 Sara 56
5 Susan 18

 

Example R Function

You can extend R/S-Plus usability by writing functions. This is similar to macros in the SAS Language. These functions can be anything from a new statistical algorithm to file operation to data manipulation. Below I give an example of a custom R macro. This function takes output from an rpart tree and converts it to SAS code suitable to use in a data a step. This is useful when coding nodes into a model. This dirty little secret: I developed the code by looking at the default print method for the rpart package and adapting it to generate SAS code. This code can also be modified to generate SQL code as well. When attempting to write new code I suggest first looking at published package that do something similar then try to adapt them to your own use.

The S language (which both R and SPlus use) is similar to C. There are many good editor for S. this code was written using TinnR.

printSAS.rpart <- function(x, minlength=0, spaces=2, cp,
digits=getOption( digits ), …) {

tree.depth <- getFromNamespace( tree.depth , rpart )

if(!inherits(x, rpart )) stop( Not legitimate rpart object )
if (!is.null(x$frame$splits)) x <- rpconvert(x) #help for old objects
if (!missing(cp)) x <- prune.rpart(x, cp=cp)
frame <- x$frame

ylevel <- attr(x, ylevels )
node <- as.numeric(row.names(frame))
depth <- tree.depth(node)
indent <- paste(rep( , spaces * 32), collapse = )

#32 is the maximal depth
if(length(node) > 1) {
indent <- substring(indent, 1, spaces * seq(depth))
# indent <- TT
indent <- paste(c( , indent[depth]), format(node), ) , sep = )
}
else indent <- paste(format(node), ) , sep = )

tfun <- (x$functions)$print
if (!is.null(tfun)) {
if (is.null(frame$yval2))
yval <- tfun(frame$yval, ylevel, digits)
else yval <- tfun(frame$yval2, ylevel, digits)
}
else yval <- format(signif(frame$yval, digits = digits))

 

z <- labels(x, digits=digits, minlength=minlength, …)

term <- rep( , length(depth))
final <- rep( , length(depth))
temp1 <- rep( , length(depth))
tempnode <- rep(10000, length(depth))
term[frame$var == <leaf> ] <- Terminal

for(i in 1:length(depth)) #print(1:i)
{
if(term[i] != Terminal )
{
final[i] <-
}

if(term[i] == Terminal )
{
for(j in 1:length(depth))
{
if (node[i – j] == 1) break
if(term[i – j] != Terminal )
{
if (node[i – j] != tempnode[i] -1)
{
if (node[i – j ] < tempnode[i])
{
temp1[i] <- paste ( z[i – j], And , temp1[i] )
tempnode[i] = node[i – j]
}
}
}

} # end for

final[i] <- paste ( If , temp1[i] , z[i], then NodeVal = , yval[i] , ; )
}# end if *
} # end Main loop
cat(final, sep = \n ) ## Print results
}