This is an experiment in scraping and processing a bibliography as well as the corresponding articles. I have used my own BibTeX files, parsed with RefManageR, and my own publications. Feel free to adapt this code to your needs.

BibTeX parsing

Using RefManageR, we can easily parse our BibTeX files.

library(RefManageR)

articlesBibtex <- '/home/lepennec/Latex/CV/Biblio/LePennec.bib'
talksBibtex <- '/home/lepennec/Latex/CV/Biblio/LePennecTalks.bib'

articlesBib <- ReadBib(articlesBibtex, check = FALSE)
talksBib <- ReadBib(talksBibtex, check = FALSE)
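
ReadBib returns a BibEntry object that behaves like a list of entries. As a quick sanity check (a minimal sketch, assuming the files parsed without error), we can count the entries and display one of them:

length(articlesBib)   # number of entries parsed
articlesBib[[1]]      # display the first entry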

We can then parse those outputs into data tables according to our needs.

library("data.table")
library("dplyr")
currentyear <- 2015

parseArticle <- function(article) {
  # Authors as a list of "Given Family" strings
  author <- list(as.list(format(article$author, include = c("given", "family"))))
  # Remove line breaks and redundant spaces in the title
  title <- gsub("^ *|(?<= ) | *$", "", gsub("[\n]", "", article$title), perl = TRUE)
  year <- ordered(x = article$year, levels = 2000:currentyear)
  # Path to the pdf file: prefer the reprint, fall back to the preprint
  pdf <- NA
  if (!is.null(article$reprint)) {
    pdf <- paste("/home/lepennec/DropboxInria/Public/Reprint/", article$reprint, sep = "")
  } else if (!is.null(article$preprint)) {
    pdf <- paste("/home/lepennec/DropboxInria/Public/Preprint/", article$preprint, sep = "")
  }
  # Optional custom bibtex fields
  subject <- NA
  if (!is.null(article$subject)) {
    subject <- article$subject
  }
  pubtype <- NA
  if (!is.null(article$pubtype)) {
    pubtype <- article$pubtype
  }
  lang <- NA
  if (!is.null(article$lang)) {
    lang <- article$lang
  }

  out <- data.table("author" = author,
                    "title" = title,
                    "year" = year,
                    "pdf" = pdf,
                    "subject" = subject,
                    "pubtype" = pubtype,
                    "lang" = lang)
  out
}

parseTalk <- function(article) {
  # Same fields as parseArticle, plus month, note and location;
  # the checks are more defensive as talk entries are less standardized
  author <- NA
  if (!is.null(article$author)) {
    author <- list(as.list(format(article$author, include = c("given", "family"))))
  }
  title <- NA
  if (!is.null(article$title)) {
    title <- gsub("^ *|(?<= ) | *$", "", gsub("[\n]", "", article$title), perl = TRUE)
  }
  year <- ordered(x = article$year, levels = 2000:currentyear)
  month <- ordered(x = article$month)
  subject <- NA
  if (!is.null(article$subject)) {
    subject <- article$subject
  }
  pubtype <- NA
  if (!is.null(article$pubtype)) {
    pubtype <- article$pubtype
  }
  lang <- NA
  if (!is.null(article$lang)) {
    lang <- article$lang
  }
  note <- NA
  if (!is.null(article$note)) {
    note <- article$note
  }
  location <- NA
  if (!is.null(article$fulllocation)) {
    location <- article$fulllocation
  }
  out <- data.table("author" = author,
                    "title" = title,
                    "year" = year,
                    "month" = month,
                    "subject" = subject,
                    "pubtype" = pubtype,
                    "lang" = lang,
                    "note" = note,
                    "location" = location)
  out
}


articles <- rbindlist(lapply(articlesBib, parseArticle))
talks  <- rbindlist(lapply(talksBib, parseTalk))
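
Before going further, a quick peek at the resulting tables (a minimal sketch) helps to check that the parsing behaved as expected:

dim(articles)
articles[1:3, .(title, year, subject, pubtype)]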

Some statistics on the articles

Here are a few experiments, including some ugly ones and a hack to avoid text clipping!

library("ggplot2")
ggplot(data = articles, aes(x = year)) + geom_bar(aes(fill = subject)) +
  scale_x_discrete(drop = FALSE)

ggplot(data = articles, aes(x = year)) + geom_bar(aes(fill = pubtype)) +
  scale_x_discrete(drop = FALSE) +
  theme(axis.text.x = element_text(angle = 70, vjust = 1))

ggplot(data = articles, aes(x = subject)) + geom_bar(aes(fill = pubtype)) +
  scale_x_discrete(drop = FALSE) +
  theme(axis.text.x = element_text(angle = 70, vjust = 1))

q <- ggplot(data = articles, aes(x = subject)) + geom_bar(aes(fill = pubtype)) +
  scale_x_discrete(drop = FALSE) + coord_polar() +
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_blank()) +
  xlab("") + ylab("")
q
# Hack: disable panel clipping so that labels outside the panel are still drawn
gt <- ggplot_gtable(ggplot_build(q))
gt$layout$clip[gt$layout$name == "panel"] <- "off"
grid::grid.draw(gt)

ggplot(data = articles, aes(x = factor(1), fill = subject)) + coord_polar(theta = "y") +
  geom_bar(color = "black", width = 1) + guides(fill = guide_legend(override.aes = list(colour = NA))) +
  theme(axis.title = element_blank(), axis.ticks = element_blank(), axis.text.y = element_blank())

# Counts per subject, used to position the labels around the pie chart
stats <- articles[order(tolower(subject)), .N, by = subject]
q <- ggplot(data = articles, aes(x = factor(1), fill = subject)) +
  coord_polar(theta = "y") + geom_bar(color = "black", width = 1) +
  guides(fill = guide_legend(override.aes = list(colour = NA))) +
  theme(axis.title = element_blank(), axis.ticks = element_blank(), axis.text.y = element_blank(),
        legend.position = "none", axis.text = element_text(size = 16, colour = "black")) +
  scale_y_continuous(breaks = cumsum(stats[, N]) - stats[, N]/2, labels = stats[, subject]) +
  scale_x_discrete(expand = -.1)
q
# Same clipping hack as above
gt <- ggplot_gtable(ggplot_build(q))
gt$layout$clip[gt$layout$name == "panel"] <- "off"
grid::grid.draw(gt)

Word clouds

An interesting way to visualize a corpus of documents is to display a word cloud. Except for the removal of a few unwanted words, this can be automated thanks to the tm package.

library("tm")
library("wordcloud")
# Repair the accented characters that PDF text extraction splits into
# a standalone diacritic followed by the letter
FixAccents <- function(x) {
  x <- gsub("’", " ", x)
  x <- gsub("´e", "é", x)
  x <- gsub("`e", "è", x)
  x <- gsub("ˆe", "ê", x)
  x <- gsub("`u", "ù", x)
  x <- gsub("ˆu", "û", x)
  x <- gsub("ˆi", "î", x)
  x <- gsub("`a", "à", x)
  x <- gsub("\f", " ", x)  # also drop form feeds
  x
}
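
These patterns target the stray diacritics produced by PDF text extraction, where an accent and its letter come out as two separate characters. A quick check on a made-up input:

FixAccents("th´eor`eme")   # should return "théorème"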

CountWordsPdf <- function(pdfSources) {
  # Read the pdf files into a corpus and normalize the text
  articlesPdf <- Corpus(pdfSources, readerControl = list(reader = readPDF))
  articlesPdf <- tm_map(articlesPdf, content_transformer(FixAccents))
  articlesPdf <- tm_map(articlesPdf, removePunctuation)
  articlesPdf <- tm_map(articlesPdf, content_transformer(tolower))
  # Remove English and French stopwords as well as a manual list of unwanted words
  articlesPdf <- tm_map(articlesPdf, removeWords, stopwords("english"))
  articlesPdf <- tm_map(articlesPdf, removeWords, stopwords("french"))
  wordsToberemoved <- c("le", "pennec", "can", "one", "for", "will", "two", "now", "yields", "the", "also", "first", "this",
                        "use", "consider", "let", "for", "using", "thus", "section", "since", "obtain", "along",
                        "used", "following", "given", "assume", "log", "corresponding", "comme", "montre", "ainsi",
                        "chaque", "donc", "peut", "plus", "long", "obtenir", "être", "entre", "alors", "permet",
                        "figure", "tout", "partir", "deux", "non", "lemma", "assumption", "obtained", "result", "case",
                        "proof", "type", "exemple", "cas", "bien", "défini", "résultat", "βjξ", "jξγ", "sd1", "ξξj",
                        "\fψjξ", "ψjξ", "\fβjξ", "kli", "tjξγ", "kerkyacharian", "7647", "βbjξ", "see")
  articlesPdf <- tm_map(articlesPdf, removeWords, wordsToberemoved)
  # Count the remaining words, keeping only those that appear in enough documents
  articlesTDM <- TermDocumentMatrix(articlesPdf)
  articlesTDM <- removeSparseTerms(articlesTDM, .98)
  wordCounts <- sort(rowSums(as.matrix(articlesTDM)), decreasing = TRUE)
  wordCounts <- data.table(word = names(wordCounts), count = wordCounts)
  wordCounts
}

We can now apply our function to the full collection of texts.

pdfSources <- URISource(sprintf("file://%s", articles[!is.na(pdf), pdf]))
wordCounts <- CountWordsPdf(pdfSources)
pal2 <- brewer.pal(8, "Dark2")
minCount <- wordCounts[50, count]  # threshold at the 50th most frequent word
wordcloud(wordCounts[count >= minCount , word], wordCounts[count >= minCount, count],
          scale = c(4,.5), colors = pal2, random.order = FALSE)
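
To spot leftover noise words that should be added to wordsToberemoved, it is convenient to inspect the most frequent terms (a minimal sketch):

wordCounts[1:10]   # the ten most frequent words and their counts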

We can also restrict our analysis to a subcollection, here the papers written in French and then those on needlets.

pdfSources <- URISource(sprintf("file://%s", articles[!is.na(pdf) & lang == "Francais", pdf]))
wordCounts <- CountWordsPdf(pdfSources)
pal2 <- brewer.pal(8, "Dark2")
minCount <- wordCounts[50, count]
wordcloud(wordCounts[count >= minCount, word], wordCounts[count >= minCount, count], 
          scale = c(4,.5), colors = pal2, random.order = FALSE)

pdfSources <- URISource(sprintf("file://%s", articles[!is.na(pdf) & subject == "Needlet", pdf]))
wordCounts <- CountWordsPdf(pdfSources)
pal2 <- brewer.pal(8, "Dark2")
minCount <- wordCounts[25, count]
wordcloud(wordCounts[count >= minCount, word], wordCounts[count >= minCount, count],
          scale = c(4, .5), colors = pal2, random.order = FALSE)

Talk Map

For the talks, we want to add spatial information about where each talk was given. We can use the geocode function of ggmap to retrieve the coordinates of a talk's location from its textual address.

library(ggmap)
coords <- geocode(talks[, location])
talks <- cbind(talks, coords)
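
geocode occasionally fails on ambiguous addresses, so it is prudent to check for missing coordinates before mapping (a minimal sketch):

talks[is.na(lon), location]   # locations that could not be geocoded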

It is then easy to plot those talks on a world or French map.

mapWorld <- borders("world", fill = "gray50")
ggplot(data = talks) + mapWorld +
  geom_jitter(aes(x = lon, y = lat, color = subject),
              size = 4, position = position_jitter(width = 4, height = 4))

mapFrance <- borders("france", fill = "gray50")
ggplot(data = talks[location %like% "France", ]) + mapFrance +
  geom_jitter(aes(x = lon, y = lat, color = subject),
              size = 4, position = position_jitter(width = .15, height = .15))

Interactive map

We can go much further and provide an interactive map of those talks thanks to the leaflet package.

library(data.table)
library(leaflet)

# Build an html popup from the talk metadata
# (sanitizeLatexS, defined below, strips some LaTeX markup)
makePopup <- Vectorize(function(title, author, note, month, year) {
  popup <- "<p>"
  if (!is.na(title)) {
    popup <- paste(popup, sprintf("<b>%s</b>", sanitizeLatexS(title)), sep = "")
    authors <- unlist(author)
    if (length(authors) > 0) {
      popup <- paste(popup, "<br>", sep = "")
      for (k in 1:length(authors)) {
        # Abbreviate my own name
        popup <- paste(popup, gsub(x = authors[k], "Erwan Le Pennec", "ELP"), sep = "")
        if (k < length(authors)) popup <- paste(popup, " - ", sep = "")
      }
    }
    popup <- paste(popup, "</p><p>", sep = "")
  }

  if (!is.na(note)) {
    popup <- paste(popup, sprintf("%s<br>", sanitizeLatexS(note)), sep = "")
  }
  popup <- paste(popup, sprintf("%s/%s</p>", month, year), sep = "")

  popup
})

# Replace a few LaTeX constructs by their plain text counterparts
sanitizeLatexS <- function(x) {
  x <- gsub(x = x, '$\\ell_1$', 'l1', fixed = TRUE)
  x <- gsub(x = x, '{"}', '', fixed = TRUE)
  x <- gsub(x = x, "[\n]", '')
  x
}
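
As an illustration on a made-up entry (the title, names and date here are purely hypothetical), makePopup returns a small html snippet with the sanitized title, the abbreviated author list, the note and the date:

makePopup(title = "Adaptive estimation with $\\ell_1$ penalties",
          author = list(list("Erwan Le Pennec", "Jane Doe")),
          note = "Invited talk", month = "6", year = "2014")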

talksleaflet <- talks[, .(lat, lon, popup = makePopup(title, author, note, month, year), subject)]

tmp <- unique(talksleaflet[["subject"]])
tmp <- tmp[!is.na(tmp)]

# Assign one color per subject
col <- RColorBrewer::brewer.pal(length(tmp), "Spectral")

for (i in seq_along(tmp)) {
  talksleaflet[subject == tmp[i], color := col[i]]
}

# Drop the talks that could not be geocoded
talksleaflet <- talksleaflet[!is.na(lat)]

m <- leaflet(data = talksleaflet) %>%
  addTiles(urlTemplate = 'http://{s}.tile.stamen.com/watercolor/{z}/{x}/{y}.png',
           attribution = 'Map tiles by <a href="http://stamen.com">Stamen Design</a>, <a href="http://creativecommons.org/licenses/by/3.0">CC BY 3.0</a> &mdash; Map data &copy; <a href="http://www.openstreetmap.org/copyright">OpenStreetMap</a>')

m <- addCircleMarkers(m, lng = ~lon, lat = ~lat,
                      popup = ~popup, color = ~color,
                      clusterOptions = markerClusterOptions())
  
m <- m %>% addLegend(colors = col, labels = tmp)
m
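
The resulting widget can be saved as a standalone html page, for instance to embed it in a website (a sketch using htmlwidgets, on which leaflet is built; the file name is arbitrary):

library(htmlwidgets)
saveWidget(m, "talksmap.html", selfcontained = TRUE)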