Skip to Main Content
Florida Tech Evans Library Logo

Introduction to Text Mining

An overview of text mining tools and techniques.

str_c(string, sep, collapse = NULL)

str_c(string, sep, collapse = NULL) will combine multiple strings into a single string. 

Arguments: 

string: the string objects you want to combine into a single string

sep: a string to insert between the strings being combined. We recommend using sep = " " to ensure there is a space between the first string and the second. 

collapse = NULL: if set to a string instead of NULL, the elements of the resulting character vector are joined into a single string, with that string inserted between them. 

# This will get executed each time the exercise gets initialized library(stringr) play = c("Hamlet", "Romeo & Juliette", "Romeo & Julliette", "The Merchant of Venice", "King Henry IV", "Julius Ceasar", "MacBeth", "King Lear", "Richard III", "Julius Ceasar", "Othello", "Twelfth Night", "Hamlet", "Timon of Athens", "King Lear", "As You Like It", "Measure for Measure", "Twelfth Night") quote = c("What a piece of work is man! how noble in reason! how infinite in faculty! in form and moving how express and admirable! in action how like an angel! in apprehension how like a god! the beauty of the world, the paragon of animals", "What's in a name? That which we call a rose by any other name would smell as sweet", "Tempt not a desperate man", "The devil can cite Scripture for his purpose", "A man can die but once", "But, for my own part, it was Greek to me", "Double, double toil and trouble; Fire burn, and cauldron bubble", "Nothing will come of nothing", "Now is the winter of our discontent", "Cowards die many times before their deaths; the valiant never taste of death but once", "I am one who loved not wisely but too well", "If music be the food of love play on", "We know what we are, but know not what we may be", "We have seen better days", "I am a man more sinned against than sinning", "All the world's a stage, And all the men and women merely players: They have their exits and their entrances; And one man in his time plays many parts", "Some rise by sin, and some by virtue fall", "Some are born great, some achieve greatness, and some have greatness thrust upon them" ) shakespeare = data.frame(play, quote) y = "the quick" z = "brown fox" # objects y and z have been preloaded into the workspace. # The code below will show you the contents of y and z. print(y) print(z) # Use str_c(var1, var2, sep = "", collapse = NULL) to join the strings. Note, there is no space between the quotes in the 'sep' argument. # Now use a space between the quotes in the 'sep' argument. 
# objects y and z have been preloaded into the workspace. # The code below will show you the contents of y and z. print(y) print(z) # Use str_c(var1, var2, sep = "", collapse = NULL) to join the strings. Note, there is no space between the quotes in the 'sep' argument. str_c(y, z, sep = "", collapse = NULL) # Now use a space between the quotes in the 'sep' argument. str_c(y, z, sep = " ", collapse = NULL) test_function("str_c") success_msg("Great job! Make sure you're careful with the argument when joining strings." )
Use str_c(y, z, sep = "", collapse = NULL), then str_c(y, z, sep = " ", collapse = NULL).

str_dup(string, times)

str_dup(string, times) will repeat a string the given number of times, where 'times' (written as 'n' in the exercise below) is a whole number specified by the user. 

# This will get executed each time the exercise gets initialized library(stringr) play = c("Hamlet", "Romeo & Juliette", "Romeo & Julliette", "The Merchant of Venice", "King Henry IV", "Julius Ceasar", "MacBeth", "King Lear", "Richard III", "Julius Ceasar", "Othello", "Twelfth Night", "Hamlet", "Timon of Athens", "King Lear", "As You Like It", "Measure for Measure", "Twelfth Night") quote = c("What a piece of work is man! how noble in reason! how infinite in faculty! in form and moving how express and admirable! in action how like an angel! in apprehension how like a god! the beauty of the world, the paragon of animals", "What's in a name? That which we call a rose by any other name would smell as sweet", "Tempt not a desperate man", "The devil can cite Scripture for his purpose", "A man can die but once", "But, for my own part, it was Greek to me", "Double, double toil and trouble; Fire burn, and cauldron bubble", "Nothing will come of nothing", "Now is the winter of our discontent", "Cowards die many times before their deaths; the valiant never taste of death but once", "I am one who loved not wisely but too well", "If music be the food of love play on", "We know what we are, but know not what we may be", "We have seen better days", "I am a man more sinned against than sinning", "All the world's a stage, And all the men and women merely players: They have their exits and their entrances; And one man in his time plays many parts", "Some rise by sin, and some by virtue fall", "Some are born great, some achieve greatness, and some have greatness thrust upon them" ) shakespeare = data.frame(play, quote) apples = "Red delicious, Honeycrisp, Gala" # str_dup(string, n) will duplicate a string 'n' times). # The 'apples' variable has been preloaded into the workspace. Try repeating apples 3 times. apples = "Red delicious, Honeycrisp, Gala" # str_dup(string, n) will duplicate a string 'n' times). # The 'apples' variable has been preloaded into the workspace. 
Try repeating apples 3 times. apples = "Red delicious, Honeycrisp, Gala" str_dup(apples, 3) test_function("str_dup") success_msg("Awesome! Notice how str_dup formats the strings to run into each other. Avoid this when you need to duplicate by placing a space at the end of the string. ")
Use str_dup(apples, 3).

str_split_fixed(string, pattern, n)

str_split(string, pattern, n) will split a string into at most 'n' substrings based on a pattern specified by the user. The output will be a list object. Note: You can use str_split_fixed() instead to return the output in the form of a matrix object rather than a list. 

# This will get executed each time the exercise gets initialized library(stringr) play = c("Hamlet", "Romeo & Juliette", "Romeo & Julliette", "The Merchant of Venice", "King Henry IV", "Julius Ceasar", "MacBeth", "King Lear", "Richard III", "Julius Ceasar", "Othello", "Twelfth Night", "Hamlet", "Timon of Athens", "King Lear", "As You Like It", "Measure for Measure", "Twelfth Night") quote = c("What a piece of work is man! how noble in reason! how infinite in faculty! in form and moving how express and admirable! in action how like an angel! in apprehension how like a god! the beauty of the world, the paragon of animals", "What's in a name? That which we call a rose by any other name would smell as sweet", "Tempt not a desperate man", "The devil can cite Scripture for his purpose", "A man can die but once", "But, for my own part, it was Greek to me", "Double, double toil and trouble; Fire burn, and cauldron bubble", "Nothing will come of nothing", "Now is the winter of our discontent", "Cowards die many times before their deaths; the valiant never taste of death but once", "I am one who loved not wisely but too well", "If music be the food of love play on", "We know what we are, but know not what we may be", "We have seen better days", "I am a man more sinned against than sinning", "All the world's a stage, And all the men and women merely players: They have their exits and their entrances; And one man in his time plays many parts", "Some rise by sin, and some by virtue fall", "Some are born great, some achieve greatness, and some have greatness thrust upon them" ) shakespeare = data.frame(play, quote) y = "the quick brown fox" # str_split(string, pattern, n) will split a string n times based on a pattern. # Try using str_split() on the 'y' variable to split the string at the spaces 2 times. y = "the quick brown fox" # str_split(string, pattern, n) will split a string n times based on a pattern. 
# Try using str_split() on the 'y' variable to split the string at the spaces 2 times. y = "the quick brown fox" str_split(y, " ", 3) test_function("str_split") success_msg("Yes! Notice how specifying 3 splits leaves the words 'brown fox' together in the final string. Changing the final argument to 'n=4' would separate those words.")
Use str_split(y, " ", 3).