在R语言中使用Stringr进行字符串操作

今天来学习下R中字符串处理操作,主要是stringr包中的字符串处理函数的用法。

先导入stringr包:可以用library(stringr)或require(stringr)加载,也可以不加载,直接以stringr::函数名的形式调用;这几种方式都行(脚本中推荐library(),包未安装时会直接报错,而require()只会返回FALSE并给出警告)。

一、字符串检测与定位

我们先定义一个字符向量和两个字符串变量,在此基础上演示各个函数基本用法。

  1 library(stringr)
  2 animal<-c("cow","dog","sheep","goat","pig","monkey","cat","cat")
  3 str1<-"I love cat, cat cat !"
  4 str2<-"lovelovelove"
  5 
  6 str_detect(animal,"cow") #匹配到指定字符串返回TRUE,否则返回FALSE
  7 [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
  8 
  9 str_detect(str1,"love")
 10 [1] TRUE
 11 
 12 str_which(animal,"dog") #返回匹配到的元素在向量中的下标
 13 [1] 2
 14 
 15 str_which(animal,"cat")
 16 [1] 7 8
 17 
 18 str_which(str2,"love") #str2只有一个元素,匹配成功即返回其下标1,与匹配次数无关
 19 [1] 1
 20 
 21 str_count(animal,"cat") #返回匹配次数
 22 [1] 0 0 0 0 0 0 1 1
 23 
 24 str_count(str1,"cat")
 25 [1] 3
 26 
 27 str_locate(animal,"cat") #返回第一个匹配的起止位置,匹配不到返回NA
 28      start end
 29 [1,]    NA  NA
 30 [2,]    NA  NA
 31 [3,]    NA  NA
 32 [4,]    NA  NA
 33 [5,]    NA  NA
 34 [6,]    NA  NA
 35 [7,]     1   3
 36 [8,]     1   3
 37 
 38 str_locate(str1,"cat")
 39     start end
 40 [1,]     8  10
 41 
 42 str_locate(str2,"love") #有多处匹配时只返回第一处匹配的起止位置
 43      start end
 44 [1,]     1   4
 45 

二、子串提取

  1 str_sub(str1,1,3) # 后面两个参数为起始,结束位置
  2 [1] "I l"
  3 
  4 str_sub(str1,1) # 可以只跟起始位置,默认到结束位置
  5 [1] "I love cat, cat cat !"
  6 
  7 str_sub(str1,3)
  8 [1] "love cat, cat cat !"
  9 
 10 str_sub(str1,-5) #位置还可以为负数
 11 [1] "cat !"
 12 
 13 str_sub(str1,-5,-1)
 14 [1] "cat !"
 15 
 16 str_subset(str1,"a") #匹配到指定字符就返回整个字符串
 17 [1] "I love cat, cat cat !"
 18 
 19 str_subset(str1,"x") #匹配不到则返回空
 20 character(0)
 21 
 22 str_extract(str1,"cat") #返回第一个匹配到字符串
 23 [1] "cat"
 24 str_extract(str1,"ca")
 25 [1] "ca"
 26 
 27 str_extract_all(str1,"cat") #返回所有匹配到字符串  列表形式返回
 28 [[1]]
 29 [1] "cat" "cat" "cat"
 30 
 31 str_extract_all(str1,"[aoe]")   #返回所有匹配到字符串  列表形式返回
 32 [[1]]
 33 [1] "o" "e" "a" "a" "a"
 34 
 35 str_match(str1,"cat")  #返回第一个匹配到字符串  矩阵形式返回
 36      [,1]
 37 [1,] "cat"
 38 
 39 str_match_all(str1,"cat") #返回所有匹配到字符串  矩阵形式返回
 40 [[1]]
 41      [,1]
 42 [1,] "cat"
 43 [2,] "cat"
 44 [3,] "cat"
 45 
 46 str_match_all(str2,"love")
 47 [[1]]
 48      [,1]
 49 [1,] "love"
 50 [2,] "love"
 51 [3,] "love"
 52 
 53 str_match(str2,"love")
 54      [,1]
 55 [1,] "love"
 56 
 57 str_match_all(str1,"(I|cat)") #可以用|匹配多个模式;第1列是完整匹配,第2列是括号捕获组的内容(这里整个模式都在括号内,所以两列相同)
 58 [[1]]
 59      [,1]  [,2]
 60 [1,] "I"   "I"
 61 [2,] "cat" "cat"
 62 [3,] "cat" "cat"
 63 [4,] "cat" "cat"

三、字符串长度处理

  1 str_length(str2) # 返回字符串长度
  2 [1] 12
  3 
  4 str_length("good job !") # 空格也算一个字符长度
  5 [1] 10
  6 
  7 str_trunc(str2,4) #将字符串截断到指定宽度(宽度包含省略符),被截掉的部分用省略符"..."代替
  8 [1] "l..."
  9 
 10 str_trunc(str2,4,ellipsis = "*") #ellipsis 指定替换符
 11 [1] "lov*"
 12 
 13 str_trunc(str2,width = 8,ellipsis = "#") #width指定截断后的总长度(含省略符),此处保留前7个字符加1个省略符
 14 [1] "lovelov#"
 15 
 16 str_trunc(str2,width = 8,side = c("left"),ellipsis = "#") # side指定方向(right,center,left)
 17 [1] "#ovelove"
 18 
 19 str_trim("sssss\n") # 去掉字符串首尾空字符,换行,空格等;字符串内部空字符无法去除
 20 [1] "sssss"
 21 str_trim(" sssss\n")
 22 [1] "sssss"

四、字符串替换

  1 str1
  2 [1] "I love cat, cat cat !"
  3 
  4 str_sub(str1,1,6) #提取子串
  5 [1] "I love"
  6 
  7 str_sub(str1,1,6)<-"she love" #子串替换
  8 str1
  9 [1] "she love cat, cat cat !"
 10 
 11 str_sub(animal,1,1)<-"F" #向量替换也可以
 12 animal
 13 [1] "Fow"    "Fog"    "Fheep"  "Foat"   "Fig"    "Fonkey" "Fat"
 14 [8] "Fat"
 15 
 16 str1<-"I love cat, cat cat !"
 17 
 18 str_replace(str1,"cat","dog") #替换第一个匹配项
 19 [1] "I love dog, cat cat !"
 20 
 21 str_replace_all(str1,"cat","dog") # 替换所有匹配项
 22 [1] "I love dog, dog dog !"
 23 
 24 str_to_lower(str1) # 全部转为小写字母
 25 [1] "i love cat, cat cat !"
 26 
 27 str_to_upper(str1) # 全部转为大写字母
 28 [1] "I LOVE CAT, CAT CAT !"
 29 
 30 str_to_title(str1) # 单词首字母转为大写
 31 [1] "I Love Cat, Cat Cat !"
 32 
 33 str_to_title(str2)
 34 [1] "Lovelovelove"
 35 

五、字符串分割和连接

  1 str_c(str1,str2,sep="+") # 字符串连接
  2 [1] "I love cat, cat cat !+lovelovelove"
  3 
  4 str_c(animal,str2,sep="+") #向量中的每个元素依次与字符串连接
  5 [1] "Fow+lovelovelove"    "Fog+lovelovelove"    "Fheep+lovelovelove"
  6 [4] "Foat+lovelovelove"   "Fig+lovelovelove"    "Fonkey+lovelovelove"
  7 [7] "Fat+lovelovelove"    "Fat+lovelovelove"
  8 
  9 str_c(animal,sep="",collapse = "+") # 向量字符串连接
 10 [1] "Fow+Fog+Fheep+Foat+Fig+Fonkey+Fat+Fat"
 11 
 12 str_dup(str1,2) #字符串重复,数字代表次数
 13 [1] "I love cat, cat cat !I love cat, cat cat !"
 14 str_dup(str2,3)
 15 [1] "lovelovelovelovelovelovelovelovelove"
 16 
 17 str_split_fixed(animal,"",n=2) #分割字符串,分隔符,n=分割份数,返回矩阵
 18      [,1] [,2]
 19 [1,] "F"  "ow"
 20 [2,] "F"  "og"
 21 [3,] "F"  "heep"
 22 [4,] "F"  "oat"
 23 [5,] "F"  "ig"
 24 [6,] "F"  "onkey"
 25 [7,] "F"  "at"
 26 [8,] "F"  "at"
 27 
 28 str_split_fixed(str2,"",n=4)
 29      [,1] [,2] [,3] [,4]
 30 [1,] "l"  "o"  "v"  "elovelove"
 31 
 32 str_split(str2,"",4) #分割字符串,分隔符,n=分割份数,返回列表
 33 [[1]]
 34 [1] "l"         "o"         "v"         "elovelove"
 35 
 36 str_glue("pi is {str1}") # 字符串插值,{}花括号内可以是任意R表达式(变量、函数调用等),求值后拼入字符串
 37 pi is I love cat, cat cat !
 38 
 39 str_glue("pi is {pi}")
 40 pi is 3.14159265358979
 41 
 42 str_glue("log2(8) is {log2(8)}")
 43 log2(8) is 3
 44 
 45 str_glue_data(mtcars, "{rownames(mtcars)} has {hp} hp") #数据框或列表对应行连接字符串
 46 Mazda RX4 has 110 hp
 47 Mazda RX4 Wag has 110 hp
 48 Datsun 710 has 93 hp
 49 Hornet 4 Drive has 110 hp
 50 Hornet Sportabout has 175 hp
 51 Valiant has 105 hp
 52 
 53  str_glue_data(mtcars, "{rownames(mtcars)} has {hp*1000} hp") # 还可以做相应计算
 54 Mazda RX4 has 110000 hp
 55 Mazda RX4 Wag has 110000 hp
 56 Datsun 710 has 93000 hp
 57 Hornet 4 Drive has 110000 hp
 58 
 59 str_glue_data(mtcars, "{rownames(mtcars)} has {substr(wt,1,2)} wt") # 子串提取
 60 Mazda RX4 has 2. wt
 61 Mazda RX4 Wag has 2. wt
 62 Datsun 710 has 2. wt
 63 Hornet 4 Drive has 3. wt

六、字符串排序

  1 str2
  2 [1] "lovelovelove"
  3 str_order(str2,decreasing = T) # 返回排序后各元素在原向量中的下标
  4 [1] 1
  5 
  6 animal
  7 [1] "Fow"    "Fog"    "Fheep"  "Foat"   "Fig"    "Fonkey" "Fat"
  8 [8] "Fat"
  9 animal[str_order(animal,decreasing = T)]
 10 [1] "Fow"    "Fonkey" "Fog"    "Foat"   "Fig"    "Fheep"  "Fat"
 11 [8] "Fat"
 12 
 13 animal
 14 [1] "Fow"    "Fog"    "Fheep"  "Foat"   "Fig"    "Fonkey" "Fat"
 15 [8] "Fat"
 16 str_sort(animal) #直接对向量字符串排序
 17 [1] "Fat"    "Fat"    "Fheep"  "Fig"    "Foat"   "Fog"    "Fonkey"
 18 [8] "Fow"
 19