- Install
- Connect to Neo4j
- Cypher
- Visualization
- Graph Algorithms
- Graph Visualization
- Association Rules
# Install RNeo4j only when it is missing, so re-running this script does not
# reinstall the package every time.
if (!requireNamespace("RNeo4j", quietly = TRUE)) {
  install.packages("RNeo4j")
}
library(RNeo4j)

# Open a connection to the REST endpoint of a Neo4j server on localhost.
graph <- startGraph("http://localhost:7474/db/data/")

# Print the connection object to confirm the connection.
graph
< Graph > $version [1] "2.3.0"
# Cypher: count the tweets that mention each user and keep the five users
# with the most mentions.
query <- " MATCH (t:Tweet)-[:MENTIONS]->(u:User) RETURN u.username AS username, COUNT(t) AS mentions ORDER BY mentions DESC LIMIT 5 "

# Run the query against the connected graph; the result has one row per user
# with columns `username` and `mentions`.
data <- cypher(graph, query)
data
username mentions 1 hadleywickham 52 2 genetics_blog 48 3 Rbloggers 31 4 rstudio 29 5 lincolnmullen 22
library(ggplot2)
# Bar chart of mentions per user. reorder(username, -mentions) orders the bars
# by descending mention count; stat = "identity" plots the counts as given
# instead of counting rows.
ggplot(data, aes(x = reorder(username, -mentions), y = mentions)) +
  geom_bar(stat = "identity")
# Cypher: take the ten hashtags that tag the most tweets, form every ordered
# pair (h1, h2) of them, and count the tweets tagged by both as `weight`.
query <- " MATCH (h:Hashtag)-[:TAGS]->(t:Tweet) WITH h, COUNT(t) AS tweets ORDER BY tweets DESC LIMIT 10 WITH COLLECT(h) AS hash UNWIND hash AS h1 UNWIND hash AS h2 MATCH (h1)-[:TAGS]->(:Tweet)<-[:TAGS]-(h2) RETURN h1.name, h2.name, COUNT(*) AS weight ORDER BY weight DESC "

# Columns of the result: h1.name, h2.name, weight.
hashtags <- cypher(graph, query)
head(hashtags)
h1.name h2.name weight 1 rstats datascience 48 2 datascience rstats 48 3 bigdata rstats 41 4 rstats bigdata 41 5 bigdata datascience 26 6 datascience bigdata 26
# Heat map of hashtag co-occurrence: one tile per (h1, h2) pair, shaded from
# white (low weight) to red (high weight). The x-axis labels are rotated 45
# degrees.
ggplot(hashtags, aes(x = h1.name, y = h2.name)) +
  geom_tile(aes(fill = weight)) +
  scale_fill_gradient(low = "white", high = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(igraph)
# Cypher: build a retweet edge list. u1 posted a tweet that retweets tweet t,
# and u2 posted t; each (u1, u2) pair is weighted by the number of such
# retweets.
query <- " MATCH (u1:User)-[:POSTS]->(:Tweet)-[:RETWEETS]->(t:Tweet), (u2:User)-[:POSTS]->(t) RETURN u1.username AS from, u2.username AS to, COUNT(*) AS weight "

# Columns of the result: from, to, weight.
edges <- cypher(graph, query)

# Preview the first rows; head() does not pad with NA rows when fewer than
# five edges come back.
head(edges, 5)
from to weight 1 STAT545 millerdl 1 2 tmickael hadleywickham 1 3 schmidtdav RLangTip 1 4 rasbt hadleywickham 1 5 mike_scott88 hadleywickham 1
# Convert the edge list into an igraph object; the first two columns of
# `edges` (from, to) name the endpoints of each edge.
ig <- graph_from_data_frame(edges)

# Confirm the type of the resulting object.
class(ig)
[1] "igraph"
Who has the highest betweenness centrality in this subgraph?
\[betweenness(v) = \sum_{x \neq v \neq y \in V} \frac{\sigma_{xy}(v)}{\sigma_{xy}}\]
# Betweenness centrality of every vertex, as a vector named by vertex.
b <- betweenness(ig)

# Highest scores first.
b <- sort(b, decreasing = TRUE)

# Top five; head() avoids NA padding when the graph has fewer than five
# vertices.
head(b, 5)
  hadleywickham      ArnoCandel         ucfagls groundwalkergmb
             34               8               2               1
        STAT545
              0
Who has the highest closeness centrality in this subgraph?
\[closeness(v) = \frac{1}{\sum_{y} d(v, y)}\]
# Closeness centrality of every vertex, as a vector named by vertex.
# NOTE: the variable name `c` shadows base::c as a value. Calls such as
# c(1, 2) still work because R skips non-function bindings when it looks up a
# function name.
c <- closeness(ig)

# Highest scores first.
c <- sort(c, decreasing = TRUE)

# Top five; head() avoids NA padding when the graph has fewer than five
# vertices.
head(c, 5)
      recology_   DataScienceLA BigDataTweetBot           h2oai
   3.585900e-05    3.585900e-05    3.564300e-05    3.543084e-05
        STAT545
   3.522119e-05
# Uniform vertex styling for the base plot: small cyan vertices, no labels.
V(ig)$size <- 4
V(ig)$color <- "cyan"
V(ig)$label <- NA

# Draw the retweet graph with small arrow heads.
plot(ig, edge.arrow.size = 0.1)
# Detect communities with the edge-betweenness algorithm.
clusters <- cluster_edge_betweenness(ig)

# Colour each vertex by the id of the community it was assigned to.
V(ig)$color <- clusters$membership

# Redraw the graph with the community colouring.
plot(ig, edge.arrow.size = 0.1)
library(visNetwork)

# Every distinct username that appears as either endpoint of an edge.
id <- unique(c(edges$from, edges$to))

# Node table for visNetwork: the username is both the node id and its label.
nodes <- data.frame(id = id, label = id)

# Preview the first rows; head() does not pad with NA rows when there are
# fewer than five nodes.
head(nodes, 5)
id label 1 STAT545 STAT545 2 tmickael tmickael 3 schmidtdav schmidtdav 4 rasbt rasbt 5 mike_scott88 mike_scott88
# Interactive network view built from the node and edge tables.
visNetwork(nodes = nodes, edges = edges)
# Attach each node's community id as its `group`.
# NOTE(review): this assigns by position, so it assumes clusters$membership is
# in the same vertex order as the rows of `nodes` -- confirm.
nodes$group <- clusters$membership

# Redraw the interactive network with the group column present.
visNetwork(nodes, edges)
# Packages for the association-rules section.
library(arules)
library(tidyr)
\[ \{onions, potatoes\} \Rightarrow \{burgers\}\]
\[Lift(X \Rightarrow Y) = \frac{P(X \cap Y)}{P(X) \times P(Y)}\]
\[Lift(rstats \Rightarrow analytics) = \frac{P(rstats \cap analytics)}{P(rstats) \times P(analytics)}\]
# Cypher: compute Lift(rstats => analytics) inside the database.
#   total    = number of tweets
#   p_rstats = share of tweets tagged 'rstats'
#   p_ana    = share of tweets tagged 'analytics'
#   joint    = share of tweets tagged with both
# The `* 1.0` factors force floating-point division.
query <- " MATCH (t:Tweet) WITH COUNT(*) AS total MATCH (rstats:Hashtag {name:'rstats'}), (ana:Hashtag {name:'analytics'}) WITH SIZE((rstats)-[:TAGS]->()) * 1.0 / total AS p_rstats, SIZE((ana)-[:TAGS]->()) * 1.0 / total AS p_ana, SIZE((rstats)-[:TAGS]->()<-[:TAGS]-(ana)) * 1.0 / total AS joint RETURN joint / (p_rstats * p_ana) AS lift "

# Run the query and print the single `lift` value.
cypher(graph, query)
lift 1 0.7070568
# Cypher: one row per (tweet, hashtag) pair.
query <- " MATCH (h:Hashtag)-[:TAGS]->(t:Tweet) RETURN t.id, h.name AS hashtag "

# Columns of the result: t.id, hashtag.
data <- cypher(graph, query)

# Preview the first rows; head() does not pad with NA rows on short results.
head(data, 5)
t.id hashtag 1 664963000126808064 rstats 2 664962472357593088 rstats 3 664960069033816064 rstats 4 664959329112453120 rstats 5 664958634615435264 rstats
# Mark every (tweet, hashtag) pair as present.
data$present <- TRUE

# Reshape to one row per tweet and one logical column per hashtag; pairs that
# do not occur are filled with FALSE.
# NOTE: spread() is superseded by pivot_wider() in current tidyr. It is kept
# here so the result stays a plain data frame with the same column order.
data <- spread(data, hashtag, present, fill = FALSE)

# Drop the tweet id so that only the hashtag indicator columns remain.
data$t.id <- NULL

# Preview the first five rows of the first five hashtag columns.
head(data[, 1:5], 5)
abdsc amazon amazondhanterasday2 analysis analytics 1 FALSE FALSE FALSE FALSE FALSE 2 FALSE FALSE FALSE FALSE FALSE 3 FALSE FALSE FALSE FALSE FALSE 4 FALSE FALSE FALSE FALSE FALSE 5 FALSE FALSE FALSE FALSE FALSE
# Coerce the logical tweet-by-hashtag table to an arules "transactions"
# object: each row becomes a transaction and each column an item.
data <- as(data, "transactions")

# Print a summary line with the number of transactions and items.
data
transactions in sparse format with 424 transactions (rows) and 83 items (columns)
# Mine association rules with the Apriori algorithm, keeping only rules whose
# support is at least 0.05. All other parameters are left at their defaults.
rules <- apriori(
  data,
  parameter = list(support = 0.05)
)
Parameter specification: confidence minval smax arem aval originalSupport support minlen maxlen 0.8 0.1 1 none FALSE TRUE 0.05 1 10 target ext rules FALSE Algorithmic control: filter tree heap memopt load sort verbose 0.1 TRUE TRUE FALSE TRUE 2 TRUE apriori - find association rules with the apriori algorithm version 4.21 (2004.05.09) (c) 1996-2004 Christian Borgelt set item appearances ...[0 item(s)] done [0.00s]. set transactions ...[83 item(s), 424 transaction(s)] done [0.00s]. sorting and recoding items ... [3 item(s)] done [0.00s]. creating transaction tree ... done [0.00s]. checking subsets of size 1 2 3 done [0.00s]. writing ... [4 rule(s)] done [0.00s]. creating S4 object ... done [0.00s].
# Print the mined rules sorted by lift and keep the printed table in `d`.
# NOTE(review): this relies on inspect() returning its table; confirm that the
# installed arules version does so rather than returning NULL.
d <- inspect(sort(rules, by = "lift"))

# First row of that table, restricted to columns 1-3 and 6.
d[1, c(1, 2, 3, 6)]
lhs rhs lift 2 {bigdata} => {rstats} 1.021687