Agenda

  • RNeo4j
  • igraph
  • d3Network
  • visNetwork
  • Shiny

RNeo4j

library(RNeo4j)

# Open a handle to the local Neo4j REST endpoint and print its summary.
graph <- startGraph("http://localhost:7474/db/data/")
summary(graph)
##      This       To  That
## 1 Hashtag     TAGS Tweet
## 2    User    POSTS Tweet
## 3   Tweet MENTIONS  User
## 4   Tweet RETWEETS Tweet
## 5   Tweet REPLY_TO Tweet

RNeo4j

Retrieve Cypher query results into a data.frame with cypher.

# Five most-mentioned users, ordered by mention count (highest first).
query <- "
MATCH (:Tweet)-[:MENTIONS]->(u:User)
RETURN u.username, COUNT(*) AS mentions
ORDER BY mentions DESC
LIMIT 5
"

# cypher() returns the result rows as a data.frame.
cypher(graph, query)
##        u.username mentions
## 1   hadleywickham       43
## 2        RLangTip        6
## 3 _nicolemargaret        6
## 4        hrbrmstr        6
## 5       megcevans        5

igraph

library(igraph)

igraph

Extract an edgelist with a Cypher query.

# Pairs of hashtags that tag the same tweet, excluding 'rstats' on either
# side; weight is the number of matched co-occurrences.
query <- "
MATCH (h1:Hashtag)-[:TAGS]->(:Tweet)<-[:TAGS]-(h2:Hashtag)
WHERE h1.name <> 'rstats' AND h2.name <> 'rstats'
RETURN h1.name, h2.name, COUNT(*) AS weight
"

edges <- cypher(graph, query)
head(edges)
##   h1.name     h2.name weight
## 1  python      hadoop      1
## 2  hadoop     bigdata      2
## 3  python datascience      1
## 4   spark     chicago      2
## 5 bigdata    earl2105      1
## 6 bigdata datascience      7

igraph

Read the edgelist into igraph with graph_from_data_frame and plot with plot.

# Build an undirected igraph object from the edge list and draw it.
# TRUE/FALSE are reserved words; T/F are ordinary variables that can be
# reassigned, so the full spelling is used here.
ig <- graph_from_data_frame(edges, directed=FALSE)
plot(ig)

igraph

Let's set some options.

# Same graph, with labels suppressed and every vertex drawn in cyan.
plot(ig, vertex.label = NA, vertex.color = "cyan")

igraph

Let's make the size of the nodes a function of some graph metric, e.g. betweenness.

# Five vertices with the highest betweenness, largest first.
# decreasing = TRUE (not T): T is a plain variable and may be rebound.
sort(betweenness(ig), decreasing = TRUE)[1:5]
## datascience      python     bigdata     dataviz      hadoop 
##    170.3333    104.3333     67.5000     45.0000     37.5000

igraph

# Size each vertex by its betweenness, scaled so the largest vertex gets
# size 10. Betweenness is computed once instead of twice.
btw <- betweenness(ig)
peak <- max(btw)
# If every betweenness is zero (or the graph is empty) dividing by the peak
# would produce NaN sizes, so fall back to zero-sized vertices.
node_size <- if (peak > 0) btw / (peak * .1) else rep(0, length(btw))
plot(ig, vertex.color="cyan", vertex.label=NA,
     vertex.size=node_size)

igraph

igraph has several community-detection algorithms.

# Detect communities with the edge-betweenness algorithm and show the
# members of the first three.
clusters <- cluster_edge_betweenness(ig)
clusters[seq_len(3)]
## $`1`
##  [1] "python"     "hadoop"     "spark"      "bigdata"    "yarn"      
##  [6] "techcon"    "chicago"    "earl2105"   "spss"       "statistics"
## [11] "boston"     "opensource"
## 
## $`2`
## [1] "digitalhumanities" "dh"               
## 
## $`3`
## [1] "measure"         "datascience"     "earl2015"        "machinelearning"
## [5] "r"               "london"          "adobeanalytics"  "omniture"       
## [9] "microsoft"

igraph

We can incorporate these into visualizations easily.

# Color each vertex by its community id; labels stay hidden.
plot(ig, vertex.label = NA, vertex.color = clusters$membership)

d3Network

library(d3Network)

d3Network

We can use the same edgelist with d3SimpleNetwork.

# Render the edge list directly; iframe=TRUE (not the reassignable T).
d3SimpleNetwork(edges, iframe=TRUE)

d3Network

d3ForceNetwork requires both a nodes data.frame and an edges data.frame. We already have the edges data.frame, and we can extract a nodes data.frame out of the edgelist.

# One row per distinct hashtag appearing on either end of an edge.
nodes <- data.frame(
  name = unique(c(edges[["h1.name"]], edges[["h2.name"]]))
)

head(nodes)
##                name
## 1            python
## 2            hadoop
## 3             spark
## 4           bigdata
## 5 digitalhumanities
## 6              yarn

d3Network

We also need the edges data.frame to reference each node by its row position in the nodes data.frame.

# Look up the row position in `nodes` of each edge endpoint.
# match() is vectorized, so no per-row loop is needed, and it handles an
# empty edge list correctly (1:nrow(edges) would iterate over c(1, 0)).
# Node names are unique, so the first match is the only match.
edges$source <- match(edges$h1.name, nodes$name)
edges$target <- match(edges$h2.name, nodes$name)

d3Network

Despite indexes starting at 1 in R, d3ForceNetwork expects indexing to start at 0.

# Shift both endpoint columns from 1-based to 0-based positions.
index_cols <- c("source", "target")
edges[index_cols] <- edges[index_cols] - 1

head(edges)
##   h1.name     h2.name weight source target
## 1  python      hadoop      1      0      1
## 2  hadoop     bigdata      2      1      3
## 3  python datascience      1      0      9
## 4   spark     chicago      2      2     14
## 5 bigdata    earl2105      1      3     18
## 6 bigdata datascience      7      3      9

d3Network

We're finally ready to plot!

# Force-directed plot using the 0-based source/target columns.
# TRUE is spelled out because T is an ordinary, reassignable variable.
d3ForceNetwork(edges, nodes, Source="source", Target="target",
               iframe=TRUE, zoom=TRUE)

d3Network

Let's add the cluster membership that was determined earlier in igraph.

# Attach the community id computed by igraph to each node.
# NOTE(review): this is a positional assignment; it assumes the rows of
# `nodes` are in the same order as the vertices of `ig` — confirm.
nodes[["group"]] <- clusters$membership

head(nodes)
##                name group
## 1            python     1
## 2            hadoop     1
## 3             spark     1
## 4           bigdata     1
## 5 digitalhumanities     2
## 6              yarn     1

d3Network

# Same force-directed plot, now labelled by name and colored by group.
# TRUE is spelled out because T is an ordinary, reassignable variable.
d3ForceNetwork(edges, nodes, Source="source", Target="target",
               Group="group", NodeID="name", iframe=TRUE, zoom=TRUE)

visNetwork

library(visNetwork)

visNetwork

For the sake of variety, let's get a different edgelist out of Neo4j. Let's say we're interested in users mentioning other users.

# User-to-user mention edges: u1 posted a tweet that mentions u2; weight is
# the number of matched mentions for the pair.
query <- "
MATCH (u1:User)-[:POSTS]->(:Tweet)-[:MENTIONS]->(u2:User)
RETURN u1.username AS from, u2.username AS to, COUNT(*) AS weight
"

edges <- cypher(graph, query)

head(edges)
##          from             to weight
## 1   adam_slez  hadleywickham      1
## 2   edXOnline      Microsoft      1
## 3    AmStatMN  hadleywickham      1
## 4    sjackman  hadleywickham      3
## 5 LeahAWasser        NEONInc      1
## 6   djpappano nyuprimatology      1

visNetwork

visNetwork requires a nodes data.frame separate from the edges data.frame.

# One row per distinct username appearing as either sender or target.
nodes <- data.frame(
  id = unique(c(edges[["from"]], edges[["to"]]))
)

head(nodes)
##            id
## 1   adam_slez
## 2   edXOnline
## 3    AmStatMN
## 4    sjackman
## 5 LeahAWasser
## 6   djpappano

visNetwork

We need a label column in the nodes data.frame for the node labels in the visualization.

# Use each node's id as its display label.
nodes[["label"]] <- nodes[["id"]]

head(nodes)
##            id       label
## 1   adam_slez   adam_slez
## 2   edXOnline   edXOnline
## 3    AmStatMN    AmStatMN
## 4    sjackman    sjackman
## 5 LeahAWasser LeahAWasser
## 6   djpappano   djpappano

visNetwork

# Draw the mention network from the nodes and edges data.frames.
visNetwork(nodes = nodes, edges = edges)

visNetwork

Let's use igraph to gather some insights and supplement the visualization.

# Directed igraph object for the mention network (from -> to).
# TRUE is spelled out because T is an ordinary, reassignable variable.
ig <- graph_from_data_frame(edges, directed=TRUE)

visNetwork

Again, we'll use cluster_edge_betweenness to determine communities.

# Edge-betweenness communities for the mention graph, then how many exist.
clusters <- cluster_edge_betweenness(ig)

length(clusters)
## [1] 38

visNetwork

To color nodes by their cluster assignment, we'll use the group column that visNetwork expects.

# Store each node's community id in the `group` column.
# NOTE(review): this is a positional assignment; it assumes the rows of
# `nodes` are in the same order as the vertices of `ig` — confirm.
nodes[["group"]] <- clusters$membership

head(nodes)
##            id       label group
## 1   adam_slez   adam_slez     1
## 2   edXOnline   edXOnline     1
## 3    AmStatMN    AmStatMN     1
## 4    sjackman    sjackman     1
## 5 LeahAWasser LeahAWasser     2
## 6   djpappano   djpappano     3

visNetwork

# Redraw; `nodes` now carries a group column.
visNetwork(nodes = nodes, edges = edges)

visNetwork

Let's make the node sizes a function of their betweenness.

# Store each node's betweenness in the `value` column.
# NOTE(review): this is a positional assignment; it assumes the rows of
# `nodes` are in the same order as the vertices of `ig` — confirm.
nodes[["value"]] <- betweenness(ig)

head(nodes)
##            id       label group value
## 1   adam_slez   adam_slez     1     0
## 2   edXOnline   edXOnline     1     8
## 3    AmStatMN    AmStatMN     1     0
## 4    sjackman    sjackman     1     0
## 5 LeahAWasser LeahAWasser     2     0
## 6   djpappano   djpappano     3     0

visNetwork

We can always determine who has the highest betweenness by sorting…

# Five users with the highest betweenness, largest first.
# decreasing=TRUE (not T): T is a plain variable and may be rebound.
sort(betweenness(ig), decreasing=TRUE)[1:5]
##     hrbrmstr     quominus     bearloga     abresler RTalkPodcast 
##           50           43           32           25           19

visNetwork

But it's more fun and insightful to determine them visually, as a visualization provides context.

# Redraw; `nodes` now carries both group and value columns.
visNetwork(nodes = nodes, edges = edges)

Shiny

Let's build an interactive tool for visualizing hashtags.

# Hashtags that share a tweet with the hashtag supplied through the
# {hashtag} query parameter; `value` is the number of matched co-occurrences.
query <- "
MATCH (h1:Hashtag)-[:TAGS]->(:Tweet)<-[:TAGS]-(h2:Hashtag)
WHERE h1.name = {hashtag}
RETURN h1.name AS from, h2.name AS to, COUNT(*) AS value
"

Shiny

# Text box for the hashtag to explore; starts out as "python".
inputPanel(
  textInput(inputId = "hashtag", label = "Hashtag:", value = "python")
)

# Re-query Neo4j whenever the hashtag input changes and draw the resulting
# co-occurrence network.
renderVisNetwork({
  edges <- cypher(graph, query, hashtag=input$hashtag)

  # The text input changes as the user types, so partial or unknown hashtags
  # reach the database. Treat a NULL or zero-row result as "nothing to draw"
  # and show a message instead of failing while building the nodes.
  shiny::validate(
    shiny::need(!is.null(edges) && nrow(edges) > 0,
                "No co-occurring hashtags found.")
  )

  nodes <- data.frame(id=unique(c(edges$from, edges$to)))
  nodes$label <- nodes$id
  visNetwork(nodes, edges)
})

Shiny

RNeo4j

More Resources

  • github.com/nicolewhite/RNeo4j
  • nicolewhite.github.io
  • markneedham.com/blog/r