查看: 4298|回复: 0

【R】《R语言与数据挖掘》第三章上机记录

字体大小: 正常放大

1178 主题	15 听众	1万积分

TA的每日心情

	开心 2023-7-31 10:17

签到天数: 198 天

[LV.7]常住居民III

自我介绍: 数学中国浅夏

电梯直达

1^#

发表于 2021-11-24 16:50 |只看该作者 |倒序浏览

|招呼Ta 关注Ta

                                                        【R】《R语言与数据挖掘》第三章上机记录
书籍：《R语言与数据挖掘》
#（1）查看数据集中CO2的变量名称，并将Treatment的名称更改为Treat
library(reshape)
CO2
CO2 <- rename(CO2,c(Treatment = "Treat"))

#（2）检验CO2中是否存在缺失值，若有，检测缺失值的位置并删除含有缺失值的行
> anyNA(CO2)
[1] FALSE
#检测缺失值所在行：which(!complete.cases(CO2)) 删除含缺失值的行：CO2[complete.cases(CO2),]

#(3)对变量uptake按从小到大和从大到小排序，并对数据集CO2按照uptake排序（从大到小和从小到大）
#篇幅问题删除部分输出数据
> sort(CO2$uptake,decreasing = TRUE) #从大到小
[1] 45.5 44.3 43.9 42.9 42.4 42.1 41.8 41.4 41.4 40.6 40.3 39.7
[13] 39.6 39.2 38.9 38.8 38.7 38.6 38.1 37.5 37.2 37.1 35.5 35.4
[25] 35.3 35.0 34.8 34.6 34.0 32.5 32.4 32.4 32.4 31.8 31.5 31.1
[37] 30.9 30.6 30.4 30.3 30.0 28.5 28.1 27.9 27.8 27.3 27.3 26.2
[49] 25.8 24.1 22.2 22.0 21.9 21.0 19.9 19.5 19.4 19.2 18.9 18.9
> sort(CO2$uptake,decreasing = FALSE)
[1]  7.7  9.3 10.5 10.6 10.6 11.3 11.4 12.0 12.3 12.5 13.0 13.6
[13] 13.7 14.2 14.4 14.9 15.1 16.0 16.2 17.9 17.9 17.9 18.0 18.1
[25] 18.9 18.9 19.2 19.4 19.5 19.9 21.0 21.9 22.0 22.2 24.1 25.8
[37] 26.2 27.3 27.3 27.8 27.9 28.1 28.5 30.0 30.3 30.4 30.6 30.9
[49] 31.1 31.5 31.8 32.4 32.4 32.4 32.5 34.0 34.6 34.8 35.0 35.3

> CO2[order(CO2$uptake),]
Plant       Type    Treat conc uptake
71 Mc2 Mississippi chilled 95 7.7
29 Qc2    Quebec chilled 95 9.3
64 Mc1 Mississippi chilled 95 10.5
43 Mn1 Mississippi nonchilled 95 10.6
78 Mc3 Mississippi chilled 95 10.6
57 Mn3 Mississippi nonchilled 95 11.3

> CO2[order(-CO2$uptake),]
Plant       Type    Treat conc uptake
21 Qn3    Quebec nonchilled 1000 45.5
14 Qn2    Quebec nonchilled 1000 44.3
20 Qn3    Quebec nonchilled  675 43.9
19 Qn3    Quebec nonchilled  500 42.9
35 Qc2    Quebec chilled 1000 42.4

# (4) Randomly split CO2 into two groups with an expected ratio of 6:4.
# Every row is labelled 1 or 2 independently with probability 0.6 / 0.4, so the
# realised group sizes vary between runs (call set.seed() first for a
# reproducible split).
# nrow(CO2) is used instead of a hard-coded 84 so that the label vector always
# matches the data, even if rows were removed earlier.
n <- sample(2, nrow(CO2), replace = TRUE, prob = c(0.6, 0.4))
(sample1 <- CO2[n == 1, ])
(sample2 <- CO2[n == 2, ])

#（5）应用tapply()函数，计算不同植物（Plant）对应的uptake的平均值
tapply(CO2$uptake,CO2$Plant,mean)

#（6）应用aggregate()函数，计算不同植物（Plant）、不同类型（Type）对应的uptake的平均值
aggregate(CO2$uptake,by = list(CO2$Plant,CO2$Type),FUN = mean)

# (7) Use lapply() to compute the means of conc and uptake in one call.
# lapply() must iterate over the two columns; concatenating them with c() would
# produce a single 168-element vector and apply mean() to every scalar.
# The result is a list with one mean per column, named "conc" and "uptake".
lapply(CO2[, c("conc", "uptake")], mean)

# (8) Use grep() to locate the rows whose plant name (Plant) contains "Qn".
# Plant_Qn holds the integer positions of the matching rows; the pattern is
# treated as a regular expression (grep's default).
Plant_Qn <- grep(pattern = "Qn", x = CO2$Plant)
Plant_Qn

#（9）使用gsub()函数，将CO2中植物名称（Plant）中的字符串”Qn“改为”QN“

#编写函数stat，函数同时计算均值、最大值、最小值、标准差、峰度、偏度
#生成自由度为2的t分布的一百个随机数t，并通过stat函数计算……
# Task (9): replace "Qn" with "QN" in the plant names. The previous pattern
# "[t]" -> "t" replaced a letter with itself and changed nothing.
# The result is a character vector; CO2 itself is not modified.
gsub("Qn", "QN", CO2$Plant)

library(fBasics)
# Summary statistics of a numeric vector.
#
# Args:
#   x: a numeric vector with at least two elements.
#
# Returns:
#   An unnamed numeric vector of length 6, in this order:
#   maximum, minimum, mean, skewness, kurtosis, standard deviation.
#   The standard deviation is appended last so that the first five positions
#   keep their previous meaning.
#
# Errors:
#   Stops if x is not numeric, or if x has fewer than two elements (the
#   standard deviation is undefined in that case).
#
# skewness() and kurtosis() are not base R; they are expected to be on the
# search path (the script loads fBasics before defining this function).
stat <- function(x) {
  if (!is.numeric(x)) {
    stop("the input data must be numeric!\n")
  }
  # `< 2` rather than `== 1` so that empty input is rejected as well.
  if (length(x) < 2) {
    stop("can not compute sd for one number!\n")
  }
  c(
    max(x),
    min(x),
    mean(x),
    skewness(x),
    kurtosis(x),
    stats::sd(x)
  )
}

# Draw 100 random values from a t distribution with 2 degrees of freedom
# and pass them to stat().
t <- rt(n = 100, df = 2)
stat(t)

zan