This tutorial shows how to use part-of-speech (POS) tagging with the openNLP package. The openNLP package relies on the rJava
package. For this to work properly, you need a Java installation (e.g. OpenJDK) whose architecture (32- or 64-bit) matches that of your R installation. The JAVA_HOME
environment variable also needs to be set, pointing to your Java installation directory.
We extract proper nouns (tag NNP for singular and tag NNPS for plural proper nouns) from paragraphs of presidents’ speeches.
# Packages used for corpus handling and annotation
library(quanteda)
library(NLP)

# Keep character columns as plain strings when reading data
options(stringsAsFactors = FALSE)

# English stopword list, one entry per line
english_stopwords <- readLines(
  con = "resources/stopwords_en.txt",
  encoding = "UTF-8"
)

# Read the SOTU paragraphs (semicolon-separated CSV)
textdata <- read.csv(
  file = "data/sotu_paragraphs.csv",
  sep = ";",
  encoding = "UTF-8"
)

# Build the corpus: one document per row, named by its doc_id
sotu_corpus <- corpus(x = textdata$text, docnames = textdata$doc_id)
# Load the annotation packages with library() rather than require():
# library() stops with an error right here if a package is missing, whereas
# require() only warns and lets the script fail later at the constructors.
library(openNLP)
library(openNLPdata)

# openNLP annotator objects. They are combined into one pipeline that is
# applied in the order given: sentence tokens, then word tokens, then POS tags.
sent_token_annotator <- openNLP::Maxent_Sent_Token_Annotator()
word_token_annotator <- openNLP::Maxent_Word_Token_Annotator()
pos_tag_annotator <- openNLP::Maxent_POS_Tag_Annotator()
annotator_pipeline <- NLP::Annotator_Pipeline(
  sent_token_annotator,
  word_token_annotator,
  pos_tag_annotator
)
# Annotate one document with part-of-speech (POS) tags.
#
# Arguments:
#   doc         Text of a single document; coerced with as.String().
#   pos_filter  Optional character vector of POS tags. If NULL (default),
#               every token is returned in the form "token_TAG". Otherwise
#               only the tokens whose tag is contained in pos_filter are
#               returned, without the tag suffix.
#
# Returns a single character string with the selected tokens separated by
# spaces ("" if the document yields no word tokens).
#
# Uses the global `annotator_pipeline` object.
annotateDocuments <- function(doc, pos_filter = NULL) {
  doc <- as.String(doc)

  # Nothing to annotate in a blank document, so skip the pipeline.
  # NOTE(review): the word token annotator is expected to stop with an error
  # when no sentence is found -- confirm against the openNLP version in use.
  if (!nzchar(trimws(doc))) {
    return("")
  }

  doc_with_annotations <- NLP::annotate(doc, annotator_pipeline)

  # Select the word-level annotations once and reuse them for tags and tokens
  word_annotations <- subset(doc_with_annotations, type == "word")
  if (length(word_annotations$features) == 0L) {
    # Without this guard the unfiltered branch below would return "_"
    return("")
  }

  # vapply() guarantees a character vector with exactly one tag per token
  tags <- vapply(
    word_annotations$features, `[[`, character(1), "POS",
    USE.NAMES = FALSE
  )
  tokens <- doc[word_annotations]

  if (!is.null(pos_filter)) {
    res <- tokens[tags %in% pos_filter]
  } else {
    res <- paste0(tokens, "_", tags)
  }
  paste(res, collapse = " ")
}
# Run the annotation on a sample of the corpus.
# as.character() extracts the (named) document texts; it replaces texts(),
# which is deprecated in quanteda 3 and not available in later releases.
# head() takes at most 10 documents and, unlike [1:10], does not produce NA
# entries when the corpus is smaller than that.
annotated_corpus <- lapply(
  head(as.character(sotu_corpus), 10),
  annotateDocuments
)
# Have a look into the first annotated documents
annotated_corpus[1]
## $`1`
## [1] "Fellow-Citizens_NNS of_IN the_DT Senate_NNP and_CC House_NNP of_IN Representatives_NNPS :_:"
annotated_corpus[2]
## $`2`
## [1] "I_PRP embrace_VBP with_IN great_JJ satisfaction_NN the_DT opportunity_NN which_WDT now_RB presents_VBZ itself_PRP of_IN congratulating_VBG you_PRP on_IN the_DT present_JJ favorable_JJ prospects_NNS of_IN our_PRP$ public_JJ affairs_NNS ._. The_DT recent_JJ accession_NN of_IN the_DT important_JJ state_NN of_IN North_NNP Carolina_NNP to_TO the_DT Constitution_NNP of_IN the_DT United_NNP States_NNP (_-LRB- of_IN which_WDT official_JJ information_NN has_VBZ been_VBN received_VBN )_-RRB- ,_, the_DT rising_VBG credit_NN and_CC respectability_NN of_IN our_PRP$ country_NN ,_, the_DT general_JJ and_CC increasing_VBG good_JJ will_NN toward_IN the_DT government_NN of_IN the_DT Union_NNP ,_, and_CC the_DT concord_NN ,_, peace_NN ,_, and_CC plenty_NN with_IN which_WDT we_PRP are_VBP blessed_VBN are_VBP circumstances_NNS auspicious_JJ in_IN an_DT eminent_JJ degree_NN to_TO our_PRP$ national_JJ prosperity_NN ._."
We annotate the first 1,000 paragraphs of the corpus, extract the proper nouns, also referred to as Named Entities (NEs), such as person names, locations etc., and compute the significance of their co-occurrence.
# Annotate the first 1000 paragraphs and keep only proper nouns (NNP / NNPS).
# as.character() replaces the deprecated texts(); head() avoids NA entries if
# the corpus has fewer than 1000 documents; vapply() guarantees one string per
# document. Document names are carried over as names of the result.
sample_corpus <- vapply(
  head(as.character(sotu_corpus), 1000),
  annotateDocuments,
  character(1),
  pos_filter = c("NNP", "NNPS")
)

# Binary term matrix
library(Matrix)
# NOTE(review): minimumFrequency is not used anywhere in the visible code;
# kept in case code outside this section relies on it.
minimumFrequency <- 2
filtered_corpus <- corpus(sample_corpus)
# The annotated strings are already space-separated tokens, so split on
# whitespace only, lowercase, and record presence/absence per document.
binDTM <- filtered_corpus %>%
  tokens(what = "fastestword") %>%
  tokens_tolower() %>%
  dfm() %>%
  dfm_weight(scheme = "boolean")

# Matrix multiplication for co-occurrence counts (term x term)
coocCounts <- t(binDTM) %*% binDTM

# Provides calculateCoocStatistics()
source("calculateCoocStatistics.R")

# Target term whose co-occurring terms are to be measured
coocTerm <- "spain"
coocs <- calculateCoocStatistics(coocTerm, binDTM, measure = "LOGLIK")
# Show up to 20 entries; head() does not pad with NA if there are fewer
print(head(coocs, 20))
## united states catholic spanish florida government majesty buenos
## 51.88 47.78 35.82 35.61 32.14 30.02 28.27 26.62
## ayres madrid south america february east pensacola state
## 26.62 21.52 21.52 12.23 11.63 9.54 8.10 7.69
## floridas france amelia st
## 6.96 5.78 5.37 4.54
For German language support, run:
# install.packages("openNLPmodels.de", repos = "http://datacube.wu.ac.at")
# require("openNLPmodels.de")
Optional exercise: modify the function annotateDocuments
in such a way that consecutive tokens with the same POS tag get merged into a single token (e.g. “United_NNP States_NNP” becomes “United_States_NNP”).

2020, Andreas Niekler and Gregor Wiedemann. GPLv3. tm4ss.github.io