R语言实战 - 基本数据管理,3

8. 数据排序

> leadership$age
[1] 32 45 25 39 NA
> newdata <- leadership[order(leadership$age),]
> newdata
  manager   testDate country gender age item1 item2 item3 item4 item5
3       3 2008-10-01      UK      F  25     3     5     5     5     2
1       1 2008-10-24      US      M  32     5     4     5     5     5
4       4 2008-10-12      UK      M  39     3     3     4    NA    NA
2       2 2008-10-28      US      F  45     3     5     2     5     5
5       5 2009-05-01      UK      F  NA     2     2     1     2     1
  stringAsFactors agecat
3           FALSE  Young
1           FALSE  Young
4           FALSE  Young
2           FALSE  Young
5           FALSE   <NA>
> 
> 
> attach(leadership)
The following objects are masked _by_ .GlobalEnv:

    age, country, gender, manager

> newdata <- leadership[order(gender, age),]
> detach(leadership)
> newdata
  manager   testDate country gender age item1 item2 item3 item4 item5
3       3 2008-10-01      UK      F  25     3     5     5     5     2
2       2 2008-10-28      US      F  45     3     5     2     5     5
5       5 2009-05-01      UK      F  NA     2     2     1     2     1
1       1 2008-10-24      US      M  32     5     4     5     5     5
4       4 2008-10-12      UK      M  39     3     3     4    NA    NA
  stringAsFactors agecat
3           FALSE  Young
2           FALSE  Young
5           FALSE   <NA>
1           FALSE  Young
4           FALSE  Young
> 
> attach(leadership)
The following objects are masked _by_ .GlobalEnv:

    age, country, gender, manager

> newdata <- leadership[order(gender, -age),]
> detach(leadership)
> newdata
  manager   testDate country gender age item1 item2 item3 item4 item5
5       5 2009-05-01      UK      F  NA     2     2     1     2     1
2       2 2008-10-28      US      F  45     3     5     2     5     5
3       3 2008-10-01      UK      F  25     3     5     5     5     2
4       4 2008-10-12      UK      M  39     3     3     4    NA    NA
1       1 2008-10-24      US      M  32     5     4     5     5     5
  stringAsFactors agecat
5           FALSE   <NA>
2           FALSE  Young
3           FALSE  Young
4           FALSE  Young
1           FALSE  Young
> 

9. 数据集的合并

9.1 添加列

> patientID <- c(1, 2, 3, 4)
> age <- c(25, 34, 28, 52)
> status <- c("poor", "improved", "excellent", "poor")
> gender <- c("F", "M", "M", "F")
> dataframeA <- data.frame(patientID, gender)
> dataframeA
  patientID gender
1         1      F
2         2      M
3         3      M
4         4      F
> dataframeB <- data.frame(patientID, age, status)
> dataframeB
  patientID age    status
1         1  25      poor
2         2  34  improved
3         3  28 excellent
4         4  52      poor
> total <- merge(dataframeA, dataframeB, by="ID")
Error in fix.by(by.x, x) : 'by' must specify a uniquely valid column
> total <- merge(dataframeA, dataframeB, by="patientID")
> total
  patientID gender age    status
1         1      F  25      poor
2         2      M  34  improved
3         3      M  28 excellent
4         4      F  52      poor
> total <- merge(dataframeA, dataframeB, by=c("gender", "age"))
Error in fix.by(by.x, x) : 'by' must specify a uniquely valid column
> total <- merge(dataframeA, dataframeB, by=c("patientID", "age"))
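Error in fix.by(by.x, x) : 'by' must specify a uniquely valid column
# 注:by 中的每个变量都必须同时存在于两个数据框中;gender 只在 dataframeA 中,age 只在 dataframeB 中,二者唯一的公共变量是 patientID,因此上面两次合并都报错。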
> 
> total <- cbind(dataframeA, dataframeB)
> total
  patientID gender patientID age    status
1         1      F         1  25      poor
2         2      M         2  34  improved
3         3      M         3  28 excellent
4         4      F         4  52      poor
> 

9.2 添加行

> total <- rbind(dataframeA, dataframeB)
Error in rbind(deparse.level, ...) : 
  numbers of columns of arguments do not match
# 注:rbind() 纵向合并要求两个数据框拥有相同的变量(顺序可以不同);dataframeA 有 2 列而 dataframeB 有 3 列,所以报错。

10. 数据集取子集

10.1 选入(保留)变量

> newdata <- leadership[, c(6:10)]
> newdata
  item1 item2 item3 item4 item5
1     5     4     5     5     5
2     3     5     2     5     5
3     3     5     5     5     2
4     3     3     4    NA    NA
5     2     2     1     2     1
> 
> 
> myvars <- c("item1","item2","item3","item4","item5")
> newdata <- leadership[myvars]
> newdata
  item1 item2 item3 item4 item5
1     5     4     5     5     5
2     3     5     2     5     5
3     3     5     5     5     2
4     3     3     4    NA    NA
5     2     2     1     2     1
> 
> 
> myvar <- paste("item", 1:5, seq="")
> myvar
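[1] "item 1 " "item 2 " "item 3 " "item 4 " "item 5 "
# 注:上面把参数 sep 误写成了 seq;paste() 没有 seq 参数,seq="" 被当作又一个待拼接的空字符串,并使用默认分隔符 " ",所以得到 "item 1 "。下面改用 sep="" 后结果正确。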
> myvar <- paste("item", 1:5, sep="")
> myvar
[1] "item1" "item2" "item3" "item4" "item5"
# 注:下一行本应使用刚生成的 myvar;此处误写为之前定义的 myvars,因两者内容相同,结果一致。
> newdata <- leadership[myvars]
> newdata
  item1 item2 item3 item4 item5
1     5     4     5     5     5
2     3     5     2     5     5
3     3     5     5     5     2
4     3     3     4    NA    NA
5     2     2     1     2     1
> 

10.2 剔除(丢弃)变量

> myvars <- names(leadership) %in% c("item3", "item4")
> myvars
 [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE
> newdata <- leadership[!myvars]
> newdata
  manager   testDate country gender age item1 item2 item5 stringAsFactors
1       1 2008-10-24      US      M  32     5     4     5           FALSE
2       2 2008-10-28      US      F  45     3     5     5           FALSE
3       3 2008-10-01      UK      F  25     3     5     2           FALSE
4       4 2008-10-12      UK      M  39     3     3    NA           FALSE
5       5 2009-05-01      UK      F  NA     2     2     1           FALSE
  agecat
1  Young
2  Young
3  Young
4  Young
5   <NA>
> 
> 
> names(leadership)
 [1] "manager"         "testDate"        "country"         "gender"         
 [5] "age"             "item1"           "item2"           "item3"          
 [9] "item4"           "item5"           "stringAsFactors" "agecat"         
> 
> newdata <- leadership[c(-8,-9)]
> newdata
  manager   testDate country gender age item1 item2 item5 stringAsFactors
1       1 2008-10-24      US      M  32     5     4     5           FALSE
2       2 2008-10-28      US      F  45     3     5     5           FALSE
3       3 2008-10-01      UK      F  25     3     5     2           FALSE
4       4 2008-10-12      UK      M  39     3     3    NA           FALSE
5       5 2009-05-01      UK      F  NA     2     2     1           FALSE
  agecat
1  Young
2  Young
3  Young
4  Young
5   <NA>
> leadership$item3 <- leadership$item4 <- NULL
> leadership
  manager   testDate country gender age item1 item2 item5 stringAsFactors
1       1 2008-10-24      US      M  32     5     4     5           FALSE
2       2 2008-10-28      US      F  45     3     5     5           FALSE
3       3 2008-10-01      UK      F  25     3     5     2           FALSE
4       4 2008-10-12      UK      M  39     3     3    NA           FALSE
5       5 2009-05-01      UK      F  NA     2     2     1           FALSE
  agecat
1  Young
2  Young
3  Young
4  Young
5   <NA>
> 

10.3 选入观测

> newdata <- leadership[1:3,]
> newdata
  manager   testDate country gender age item1 item2 item5 stringAsFactors
1       1 2008-10-24      US      M  32     5     4     5           FALSE
2       2 2008-10-28      US      F  45     3     5     5           FALSE
3       3 2008-10-01      UK      F  25     3     5     2           FALSE
  agecat
1  Young
2  Young
3  Young
> newdata <- leadership[which(leadership$gender=="M" & leadership$age > 30),]
> newdata
  manager   testDate country gender age item1 item2 item5 stringAsFactors
1       1 2008-10-24      US      M  32     5     4     5           FALSE
4       4 2008-10-12      UK      M  39     3     3    NA           FALSE
  agecat
1  Young
4  Young
> attach(leadership)
The following objects are masked _by_ .GlobalEnv:

    age, country, gender, manager

> newdata1 <- leadership[which(gender=='M' & age > 30),]
> detach(leadership)
> newdata1
  manager   testDate country gender age item1 item2 item5 stringAsFactors
2       2 2008-10-28      US      F  45     3     5     5           FALSE
  agecat
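2  Young
# 注:此结果与上面的 newdata(第 1、4 行)不一致。attach() 已提示 age、gender 被全局环境中的同名对象屏蔽,而 9.1 节把全局的 age、gender 重新赋值为 4 个病人的数据,所以这里的筛选条件实际使用的是全局变量而不是 leadership 的列。应改用 leadership$gender、leadership$age 或 with(leadership, ...)。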
> 
> leadership$date <- as.Date(leadership$date, "%m/%d/%y")
Error in as.Date.default(leadership$date, "%m/%d/%y") : 
  do not know how to convert 'leadership$date' to class “Date”
> leadership$testDate <- as.Date(leadership$testDate, "%m/%d/%y")
> startdate <- as.Date("2009-01-01")
> enddate <- as.Date("2009-10-31")
> newdate <- leadership[which(leadership$testDate >= startdate & leadership$testDate <= enddate),]
> newdate
  manager   testDate country gender age item1 item2 item5 stringAsFactors
5       5 2009-05-01      UK      F  NA     2     2     1           FALSE
  agecat
5   <NA>
> 

10.4 subset() 函数

> leadership
  manager   testDate country gender age item1 item2 item5 stringAsFactors
1       1 2008-10-24      US      M  32     5     4     5           FALSE
2       2 2008-10-28      US      F  45     3     5     5           FALSE
3       3 2008-10-01      UK      F  25     3     5     2           FALSE
4       4 2008-10-12      UK      M  39     3     3    NA           FALSE
5       5 2009-05-01      UK      F  NA     2     2     1           FALSE
  agecat
1  Young
2  Young
3  Young
4  Young
5   <NA>
> newdata <- subset(leadership, age >= 35 | age < 24, select=c(item1, item2, item5))
> newdata
  item1 item2 item5
2     3     5     5
4     3     3    NA
> 
> newdata <- subset(leadership, gender=="M" & age > 25, select=gender:item5)
> newdata
  gender age item1 item2 item5
1      M  32     5     4     5
4      M  39     3     3    NA
> 

10.5 随机抽样

> leadership
  manager   testDate country gender age item1 item2 item5 stringAsFactors
1       1 2008-10-24      US      M  32     5     4     5           FALSE
2       2 2008-10-28      US      F  45     3     5     5           FALSE
3       3 2008-10-01      UK      F  25     3     5     2           FALSE
4       4 2008-10-12      UK      M  39     3     3    NA           FALSE
5       5 2009-05-01      UK      F  NA     2     2     1           FALSE
  agecat
1  Young
2  Young
3  Young
4  Young
5   <NA>
> 
> mysample <- leadership[sample(1:nrow(leadership), 3, replace=FALSE),]
> mysample
  manager   testDate country gender age item1 item2 item5 stringAsFactors
4       4 2008-10-12      UK      M  39     3     3    NA           FALSE
2       2 2008-10-28      US      F  45     3     5     5           FALSE
1       1 2008-10-24      US      M  32     5     4     5           FALSE
  agecat
4  Young
2  Young
1  Young
>