Agenda

  • Install
  • Connect to Neo4j
  • Cypher
  • Visualization
  • Graph Algorithms
  • Graph Visualization
  • Association Rules

Install

install.packages("RNeo4j")
library(RNeo4j)

Connect to Neo4j

# Connect to the Neo4j server at the local /db/data/ URL.
graph <- startGraph("http://localhost:7474/db/data/")

graph
< Graph > 
$version
[1] "2.3.0"

Cypher

# Cypher: the five users mentioned by the most tweets.
query <- "
MATCH (t:Tweet)-[:MENTIONS]->(u:User)
RETURN u.username AS username, COUNT(t) AS mentions
ORDER BY mentions DESC
LIMIT 5
"

# Run the query on the open connection and print the result.
data <- cypher(graph, query)
data
       username mentions
1 hadleywickham       52
2 genetics_blog       48
3     Rbloggers       31
4       rstudio       29
5 lincolnmullen       22

Visualization

library(ggplot2)

Visualization

# Bar chart of mention counts, bars ordered from most to least mentioned.
ggplot(data, aes(x = reorder(username, -mentions), y = mentions)) +
  geom_bar(stat = "identity")

Visualization

# Cypher: take the 10 hashtags that tag the most tweets, then for every
# ordered pair (h1, h2) among them count the tweets tagged by both.
query <- "
MATCH (h:Hashtag)-[:TAGS]->(t:Tweet)
WITH h, COUNT(t) AS tweets
ORDER BY tweets DESC LIMIT 10

WITH COLLECT(h) AS hash
UNWIND hash AS h1
UNWIND hash AS h2

MATCH (h1)-[:TAGS]->(:Tweet)<-[:TAGS]-(h2)
RETURN h1.name, h2.name, COUNT(*) AS weight
ORDER BY weight DESC
"

Visualization

# Run the current query and preview the first rows of the result.
hashtags <- cypher(graph, query)
head(hashtags)
      h1.name     h2.name weight
1      rstats datascience     48
2 datascience      rstats     48
3     bigdata      rstats     41
4      rstats     bigdata     41
5     bigdata datascience     26
6 datascience     bigdata     26

Visualization

# Heatmap of hashtag pairs: tiles shade from white (low weight) to red (high
# weight), with the x-axis labels rotated 45 degrees.
ggplot(hashtags, aes(x = h1.name, y = h2.name)) +
  geom_tile(aes(fill = weight)) +
  scale_fill_gradient(low = "white", high = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Graph Algorithms

library(igraph)

Graph Algorithms

# Cypher: one row per (retweeting user, original author) pair; weight is the
# number of matching retweets.
query <- "
MATCH (u1:User)-[:POSTS]->(:Tweet)-[:RETWEETS]->(t:Tweet),
      (u2:User)-[:POSTS]->(t)
RETURN u1.username AS from, u2.username AS to, COUNT(*) AS weight
"

# Fetch the edge list and show its first five rows.
edges <- cypher(graph, query)
edges[1:5, ]
          from            to weight
1      STAT545      millerdl      1
2     tmickael hadleywickham      1
3   schmidtdav      RLangTip      1
4        rasbt hadleywickham      1
5 mike_scott88 hadleywickham      1

Graph Algorithms

# Build an igraph object from the edge table (columns: from, to, weight).
ig <- graph_from_data_frame(d = edges)
class(ig)
[1] "igraph"

Graph Algorithms

Who has the highest betweenness centrality in this subgraph?

\[betweenness(v) = \sum_{x \neq v \neq y \in V} \frac{\sigma_{xy}(v)}{\sigma_{xy}}\]

# Betweenness score of every vertex, highest first.
b <- sort(betweenness(ig), decreasing = TRUE)

b[1:5]
  hadleywickham      ArnoCandel         ucfagls groundwalkergmb 
             34               8               2               1 
        STAT545 
              0 

Graph Algorithms

Who has the highest closeness centrality in this subgraph?

\[closeness(v) = \frac{1}{\sum_{y} d(v, y)}\]

# Closeness score of every vertex, highest first. The result is stored in
# `closeness_scores` rather than `c` so the variable does not shadow the name
# of base::c().
closeness_scores <- sort(closeness(ig), decreasing = TRUE)

closeness_scores[1:5]
      recology_   DataScienceLA BigDataTweetBot           h2oai 
   3.585900e-05    3.585900e-05    3.564300e-05    3.543084e-05 
        STAT545 
   3.522119e-05 

Graph Visualization

# Uniform vertex styling: size 4, cyan, no labels.
V(ig)$size <- 4
V(ig)$color <- "cyan"
V(ig)$label <- NA

# Draw the graph with small arrowheads.
plot(ig, edge.arrow.size = 0.1)

Graph Visualization

# Partition the graph into communities with edge-betweenness clustering.
clusters <- cluster_edge_betweenness(ig)

# Color every vertex by its community id, then redraw.
V(ig)$color <- clusters$membership

plot(ig, edge.arrow.size = 0.1)

Graph Visualization

library(visNetwork)

# Node table for visNetwork: one row per distinct username that appears as
# either endpoint of an edge. The columns are coerced to character first so
# that factor-typed columns are combined by label, not by integer code.
id <- unique(c(as.character(edges$from), as.character(edges$to)))
nodes <- data.frame(id = id, label = id, stringsAsFactors = FALSE)

nodes[1:5, ]
            id        label
1      STAT545      STAT545
2     tmickael     tmickael
3   schmidtdav   schmidtdav
4        rasbt        rasbt
5 mike_scott88 mike_scott88

Graph Visualization

# Render the network from the node and edge tables.
visNetwork(nodes = nodes, edges = edges)

Graph Visualization

# Use each node's community id as its visNetwork group, then redraw.
# NOTE(review): assumes the rows of `nodes` are in the same order as the
# vertices of `ig` - confirm.
nodes$group <- clusters$membership

visNetwork(nodes = nodes, edges = edges)

Association Rules

library(arules)
library(tidyr)

Association Rules

\[ \{onions, potatoes\} \Rightarrow \{burgers\}\]

Association Rules

\[Lift(X \Rightarrow Y) = \frac{P(X \cap Y)}{P(X) \times P(Y)}\]

Association Rules

\[Lift(rstats \Rightarrow analytics) = \frac{P(rstats \cap analytics)}{P(rstats) \times P(analytics)}\]

# Cypher: lift of rstats => analytics. Each probability is a count of TAGS
# relationships divided by the total number of tweets.
query <- "
MATCH (t:Tweet) WITH COUNT(*) AS total
MATCH (rstats:Hashtag {name:'rstats'}), 
      (ana:Hashtag {name:'analytics'})
WITH SIZE((rstats)-[:TAGS]->()) * 1.0 / total AS p_rstats,
     SIZE((ana)-[:TAGS]->()) * 1.0 / total AS p_ana,
     SIZE((rstats)-[:TAGS]->()<-[:TAGS]-(ana)) * 1.0 / total AS joint
RETURN joint / (p_rstats * p_ana) AS lift
"

# Run the query and print the single lift value.
cypher(graph, query)
       lift
1 0.7070568

Association Rules

# Cypher: one row per (tweet id, hashtag name) pair.
query <- "
MATCH (h:Hashtag)-[:TAGS]->(t:Tweet)
RETURN t.id, h.name AS hashtag
"

# Fetch the pairs and show the first five rows.
data <- cypher(graph, query)
data[1:5, ]
                t.id hashtag
1 664963000126808064  rstats
2 664962472357593088  rstats
3 664960069033816064  rstats
4 664959329112453120  rstats
5 664958634615435264  rstats

Association Rules

# Pivot to one row per tweet with one logical column per hashtag; hashtags
# that do not tag a tweet are filled with FALSE.
data$present <- TRUE
data <- spread(data, key = hashtag, value = present, fill = FALSE)
# Drop the tweet id so only the hashtag indicator columns remain.
data$t.id <- NULL

data[1:5, 1:5]
  abdsc amazon amazondhanterasday2 analysis analytics
1 FALSE  FALSE               FALSE    FALSE     FALSE
2 FALSE  FALSE               FALSE    FALSE     FALSE
3 FALSE  FALSE               FALSE    FALSE     FALSE
4 FALSE  FALSE               FALSE    FALSE     FALSE
5 FALSE  FALSE               FALSE    FALSE     FALSE

Association Rules

# Coerce the logical indicator table to an arules "transactions" object.
data <- as(data, "transactions")
data
transactions in sparse format with
 424 transactions (rows) and
 83 items (columns)

Association Rules

# Mine association rules with a minimum support of 0.05.
rules <- apriori(data, parameter = list(support = 0.05))
Parameter specification:
 confidence minval smax arem  aval originalSupport support minlen maxlen
        0.8    0.1    1 none FALSE            TRUE    0.05      1     10
 target   ext
  rules FALSE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

apriori - find association rules with the apriori algorithm
version 4.21 (2004.05.09)        (c) 1996-2004   Christian Borgelt
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[83 item(s), 424 transaction(s)] done [0.00s].
sorting and recoding items ... [3 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [4 rule(s)] done [0.00s].
creating S4 object  ... done [0.00s].

Association Rules

# Order the rules by lift and keep what inspect() returns for indexing.
# NOTE(review): relies on inspect() returning a table - confirm for the
# installed arules version.
d <- inspect(sort(rules, by = "lift"))
# First rule only: columns 1:3 and 6 (printed below as lhs, =>, rhs, lift).
d[1, c(1:3, 6)]
        lhs         rhs     lift
2 {bigdata} => {rstats} 1.021687

Conclusion

Workflow

Questions?

@_nicolemargaret


github.com/nicolewhite/RNeo4j