Graphs with R and Neo4j

Agenda

Install
Connect to Neo4j
Cypher
Visualization
Graph Algorithms
Graph Visualization
Association Rules

Install

install.packages("RNeo4j")

library(RNeo4j)

Connect to Neo4j

graph = startGraph("http://localhost:7474/db/data/")

graph

< Graph > 
$version
[1] "2.3.0"

Cypher

query = "
MATCH (t:Tweet)-[:MENTIONS]->(u:User)
RETURN u.username AS username, COUNT(t) AS mentions
ORDER BY mentions DESC
LIMIT 5
"

data = cypher(graph, query)
data

       username mentions
1 hadleywickham       52
2 genetics_blog       48
3     Rbloggers       31
4       rstudio       29
5 lincolnmullen       22

Visualization

library(ggplot2)

Visualization

ggplot(data, aes(reorder(username, -mentions), mentions)) + 
  geom_bar(stat="identity")

Visualization

query = "
MATCH (h:Hashtag)-[:TAGS]->(t:Tweet)
WITH h, COUNT(t) AS tweets
ORDER BY tweets DESC LIMIT 10

WITH COLLECT(h) AS hash
UNWIND hash AS h1
UNWIND hash AS h2

MATCH (h1)-[:TAGS]->(:Tweet)<-[:TAGS]-(h2)
RETURN h1.name, h2.name, COUNT(*) AS weight
ORDER BY weight DESC
"

Visualization

hashtags = cypher(graph, query)
head(hashtags)

      h1.name     h2.name weight
1      rstats datascience     48
2 datascience      rstats     48
3     bigdata      rstats     41
4      rstats     bigdata     41
5     bigdata datascience     26
6 datascience     bigdata     26

Visualization

ggplot(hashtags, aes(h1.name, h2.name)) +
  geom_tile(aes(fill = weight)) + 
  scale_fill_gradient(low = "white", high = "red") + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Graph Algorithms

library(igraph)

Graph Algorithms

query = "
MATCH (u1:User)-[:POSTS]->(:Tweet)-[:RETWEETS]->(t:Tweet),
      (u2:User)-[:POSTS]->(t)
RETURN u1.username AS from, u2.username AS to, COUNT(*) AS weight
"

edges = cypher(graph, query)
edges[1:5, ]

          from            to weight
1      STAT545      millerdl      1
2     tmickael hadleywickham      1
3   schmidtdav      RLangTip      1
4        rasbt hadleywickham      1
5 mike_scott88 hadleywickham      1

Graph Algorithms

ig = graph_from_data_frame(edges)
class(ig)

[1] "igraph"

Graph Algorithms

Who has the highest betweenness centrality in this subgraph?

\[betweenness(v) = \sum_{x \neq v \neq y \in V} \frac{\sigma_{xy}(v)}{\sigma_{xy}}\]

b = betweenness(ig)
b = sort(b, decreasing = TRUE)

b[1:5]

  hadleywickham      ArnoCandel         ucfagls groundwalkergmb 
             34               8               2               1 
        STAT545 
              0

Graph Algorithms

Who has the highest closeness centrality in this subgraph?

\[closeness(v) = \frac{1}{\sum_{y} d(v, y)}\]

c = closeness(ig)
c = sort(c, decreasing = TRUE)

c[1:5]

      recology_   DataScienceLA BigDataTweetBot           h2oai 
   3.585900e-05    3.585900e-05    3.564300e-05    3.543084e-05 
        STAT545 
   3.522119e-05

Graph Visualization

V(ig)$size = 4
V(ig)$color = "cyan"
V(ig)$label = NA

plot(ig, edge.arrow.size = 0.1)

Graph Visualization

clusters = cluster_edge_betweenness(ig)

V(ig)$color = clusters$membership

plot(ig, edge.arrow.size = 0.1)

Graph Visualization

library(visNetwork)

id = unique(c(edges$from, edges$to))
nodes = data.frame(id = id, label = id)

nodes[1:5, ]

            id        label
1      STAT545      STAT545
2     tmickael     tmickael
3   schmidtdav   schmidtdav
4        rasbt        rasbt
5 mike_scott88 mike_scott88

Graph Visualization

visNetwork(nodes, edges)

Graph Visualization

nodes$group = clusters$membership

visNetwork(nodes, edges)

Association Rules

library(arules)
library(tidyr)

Association Rules

\[ \{onions, potatoes\} \Rightarrow \{burgers\}\]

Association Rules

\[Lift(X \Rightarrow Y) = \frac{P(X \cap Y)}{P(X) \times P(Y)}\]

Association Rules

\[Lift(rstats \Rightarrow analytics) = \frac{P(rstats \cap analytics)}{P(rstats) \times P(analytics)}\]

query = "
MATCH (t:Tweet) WITH COUNT(*) AS total
MATCH (rstats:Hashtag {name:'rstats'}), 
      (ana:Hashtag {name:'analytics'})
WITH SIZE((rstats)-[:TAGS]->()) * 1.0 / total AS p_rstats,
     SIZE((ana)-[:TAGS]->()) * 1.0 / total AS p_ana,
     SIZE((rstats)-[:TAGS]->()<-[:TAGS]-(ana)) * 1.0 / total AS joint
RETURN joint / (p_rstats * p_ana) AS lift
"

cypher(graph, query)

       lift
1 0.7070568

Association Rules

query = "
MATCH (h:Hashtag)-[:TAGS]->(t:Tweet)
RETURN t.id, h.name AS hashtag
"

data = cypher(graph, query)
data[1:5, ]

                t.id hashtag
1 664963000126808064  rstats
2 664962472357593088  rstats
3 664960069033816064  rstats
4 664959329112453120  rstats
5 664958634615435264  rstats

Association Rules

data$present = TRUE
data = spread(data, hashtag, present, fill = FALSE)
data$t.id = NULL

data[1:5, 1:5]

  abdsc amazon amazondhanterasday2 analysis analytics
1 FALSE  FALSE               FALSE    FALSE     FALSE
2 FALSE  FALSE               FALSE    FALSE     FALSE
3 FALSE  FALSE               FALSE    FALSE     FALSE
4 FALSE  FALSE               FALSE    FALSE     FALSE
5 FALSE  FALSE               FALSE    FALSE     FALSE

Association Rules

data = as(data, "transactions")
data

transactions in sparse format with
 424 transactions (rows) and
 83 items (columns)

Association Rules

rules = apriori(data, parameter=list(support=0.05))

Parameter specification:
 confidence minval smax arem  aval originalSupport support minlen maxlen
        0.8    0.1    1 none FALSE            TRUE    0.05      1     10
 target   ext
  rules FALSE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

apriori - find association rules with the apriori algorithm
version 4.21 (2004.05.09)        (c) 1996-2004   Christian Borgelt
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[83 item(s), 424 transaction(s)] done [0.00s].
sorting and recoding items ... [3 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [4 rule(s)] done [0.00s].
creating S4 object  ... done [0.00s].

Association Rules

d = inspect(sort(rules, by = "lift"))

d[1, c(1:3, 6)]

        lhs         rhs     lift
2 {bigdata} => {rstats} 1.021687

Agenda

Install

Connect to Neo4j

Cypher

Visualization

Visualization

Visualization

Visualization

Visualization

Graph Algorithms

Graph Algorithms

Graph Algorithms

Graph Algorithms

Graph Algorithms

Graph Visualization

Graph Visualization

Graph Visualization

Graph Visualization

Graph Visualization

Association Rules

Association Rules

Association Rules

Association Rules

Association Rules

Association Rules

Association Rules

Association Rules

Association Rules

Conclusion

Workflow

Questions?

@_nicolemargaret

github.com/nicolewhite/RNeo4j