We're going to import Twitter data into Neo4j and then run several analyses on top of this dataset, including plotting, graph algorithms, and graph visualizations.
The script for importing data from Twitter's API is in scripts/twitter.py. It was used to upload tweets containing either "ddtx16" or "datadaytexas" to Neo4j.
import os
import sys
import time
import requests
from py2neo import Graph, Node, Relationship
Neo4j supports uniqueness constraints on a given label, property pair. Each constraint is backed by an index, which also speeds up the MERGE operations used below.
graph = Graph()
graph.run("CREATE CONSTRAINT ON (u:User) ASSERT u.username IS UNIQUE")
graph.run("CREATE CONSTRAINT ON (t:Tweet) ASSERT t.id IS UNIQUE")
graph.run("CREATE CONSTRAINT ON (h:Hashtag) ASSERT h.name IS UNIQUE")
TWITTER_BEARER = os.environ["TWITTER_BEARER"]
headers = dict(accept="application/json", Authorization="Bearer " + TWITTER_BEARER)
payload = dict(
    count=100,
    result_type="recent",
    lang="en",
    q=sys.argv[1]
)
base_url = "https://api.twitter.com/1.1/search/tweets.json?"
def find_tweets(since_id):
    # Ask the v1.1 search endpoint for tweets newer than since_id.
    payload["since_id"] = since_id
    url = base_url + "q={q}&count={count}&result_type={result_type}&lang={lang}&since_id={since_id}".format(**payload)
    r = requests.get(url, headers=headers)
    tweets = r.json()["statuses"]
    return tweets
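As an aside, requests can encode the query string itself, which avoids the manual URL formatting; a minimal equivalent using the same payload dict:

r = requests.get("https://api.twitter.com/1.1/search/tweets.json", params=payload, headers=headers)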
def upload_tweets(tweets):
    for t in tweets:
        u = t["user"]
        e = t["entities"]

        # MERGE on the unique id so re-running the script doesn't duplicate tweets.
        tweet = Node("Tweet", id=t["id"])
        graph.merge(tweet)
        tweet["text"] = t["text"]
        tweet.push()

        user = Node("User", username=u["screen_name"])
        graph.merge(user)
        graph.merge(Relationship(user, "POSTS", tweet))

        for h in e.get("hashtags", []):
            hashtag = Node("Hashtag", name=h["text"].lower())
            graph.merge(hashtag)
            graph.merge(Relationship(hashtag, "TAGS", tweet))

        for m in e.get("user_mentions", []):
            mention = Node("User", username=m["screen_name"])
            graph.merge(mention)
            graph.merge(Relationship(tweet, "MENTIONS", mention))

        # Link replies and retweets to placeholder Tweet nodes by id.
        reply = t.get("in_reply_to_status_id")
        if reply:
            reply_tweet = Node("Tweet", id=reply)
            graph.merge(reply_tweet)
            graph.merge(Relationship(tweet, "REPLY_TO", reply_tweet))

        ret = t.get("retweeted_status", {}).get("id")
        if ret:
            retweet = Node("Tweet", id=ret)
            graph.merge(retweet)
            graph.merge(Relationship(tweet, "RETWEETS", retweet))
since_id = -1

while True:
    try:
        tweets = find_tweets(since_id=since_id)

        if not tweets:
            print("No tweets found.")
            time.sleep(60)
            continue

        # Results come back newest first, so remember the first id
        # and only fetch newer tweets on the next poll.
        since_id = tweets[0].get("id")
        upload_tweets(tweets)

        print("{} tweets uploaded!".format(len(tweets)))
        time.sleep(60)

    except Exception as e:
        print(e)
        time.sleep(60)
        continue
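Polling once a minute stays comfortably under the search endpoint's rate limit. If you do start seeing HTTP 429 responses, one option is to wrap the request in an exponential backoff; a rough sketch (the helper name and retry count are illustrative):

def get_with_backoff(url, headers, max_tries=5):
    # Retry on rate-limit responses, doubling the wait each time.
    delay = 60
    for _ in range(max_tries):
        r = requests.get(url, headers=headers)
        if r.status_code != 429:
            break
        time.sleep(delay)
        delay *= 2
    return r

With tweets flowing into Neo4j, we can draw a sample of the graph inline with the draw helper from scripts/vis.py; the options dict maps each node label to the property displayed as the node's caption.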
from scripts.vis import draw
options = {"User": "username", "Hashtag": "name"}
draw(graph, options, physics=True, limit=30)
Let's gather some basic insights by creating plotly charts.
%load_ext cypher
import plotly.plotly as py
from plotly.graph_objs import *
Find hashtags ordered by the number of tweets they've tagged. (The rstats hashtag is excluded here and in the co-occurrence query further down.)
result = %cypher MATCH (hashtag:Hashtag)-[:TAGS]->(tweet:Tweet) \
WHERE hashtag.name <> 'rstats' \
RETURN hashtag.name AS hashtag, count(tweet) AS tweets \
ORDER BY tweets DESC LIMIT 5
df = result.get_dataframe()
df.head()
data = Data([Bar(x=df["hashtag"], y=df["tweets"])])
py.image.ishow({'data': data})
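Note that plotly.plotly and py.image.ishow belong to the legacy pre-v4 plotly API (since split out into the chart-studio package). On a recent plotly, a rough offline equivalent of this bar chart is:

import plotly.express as px
px.bar(df, x="hashtag", y="tweets").show()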
result = %cypher MATCH (tweet:Tweet) \
RETURN tweet.id, \
size((:Hashtag)-[:TAGS]->(tweet)) AS hashtags, \
size((tweet)-[:MENTIONS]->(:User)) AS mentions
df = result.get_dataframe()
del df["tweet.id"]
df.head()
hashtags = Histogram(x=df["hashtags"], opacity=0.75, name="Hashtags")
mentions = Histogram(x=df["mentions"], opacity=0.75, name="Mentions")
data = Data([hashtags, mentions])
layout = Layout(barmode="overlay")
fig = Figure(data=data, layout=layout)
py.image.ishow(fig)
result = %cypher MATCH (h:Hashtag) \
WHERE h.name <> "rstats" \
WITH h, size((h)-[:TAGS]->(:Tweet)) AS tags \
ORDER BY tags DESC \
LIMIT 15 \
\
WITH collect(h) AS top_hash \
UNWIND top_hash AS h1 \
UNWIND top_hash AS h2 \
\
MATCH (h1)-[:TAGS]->(:Tweet)<-[:TAGS]-(h2) \
WHERE h1.name < h2.name \
RETURN h1.name, h2.name, count(*) AS weight
df = result.get_dataframe()
df.head()
names = list(set(list(df["h1.name"]) + list(df["h2.name"])))
heat = [[0 for i in range(len(names))] for j in range(len(names))]
for idx, row in df.iterrows():
    i = names.index(row["h1.name"])
    j = names.index(row["h2.name"])
    heat[i][j] = row["weight"]
import plotly.graph_objs as go
data = [go.Heatmap(z = heat, x = names, y = names)]
py.image.ishow({'data': data})
The typical workflow consists of retrieving a subgraph from Neo4j via Cypher and analyzing that graph in igraph.
from igraph import Graph as IGraph
query = """
MATCH (user1:User)-[:POSTS]->(retweet:Tweet)-[:RETWEETS]->(tweet:Tweet),
(user2:User)-[:POSTS]->(tweet)
RETURN user1.username, user2.username, count(*) AS weight
"""
data = graph.run(query)
# TupleList reads each row as (source, target, weight); weights=True
# stores the third column as the edges' "weight" attribute.
ig = IGraph.TupleList(data, weights=True)
ig
$betweenness(v) = \sum_{s \neq v \neq t} \frac{\sigma_{st}(v)}{\sigma_{st}}$

Here $\sigma_{st}$ is the number of shortest paths between nodes $s$ and $t$, and $\sigma_{st}(v)$ is the number of those paths that pass through $v$. The betweenness centrality of a node $v$ sums this fraction over all pairs of nodes, so nodes that lie on many shortest paths score highly.
between = [(node["name"], node.betweenness()) for node in ig.vs]
top = sorted(between, key=lambda x: x[1], reverse=True)
top[:5]
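Calling betweenness() vertex by vertex recomputes all shortest paths for every vertex; on larger graphs it's much faster to compute the whole vector in one call (the same applies to closeness below):

between = list(zip(ig.vs["name"], ig.betweenness()))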
$closeness(v) = \frac{1}{\sum_{x} d(v, x)}$
The closeness centrality of a node $v$ is the reciprocal of its farness, i.e. the sum of its shortest-path distances $d(v, x)$ to every other node $x$ in the graph.
close = [(node["name"], node.closeness()) for node in ig.vs]
top = sorted(close, key=lambda x: x[1], reverse=True)
top[:5]
# Walktrap community detection: short random walks tend to get
# trapped inside densely connected groups. as_clustering() cuts the
# resulting dendrogram into a flat set of communities.
clusters = IGraph.community_walktrap(ig, weights="weight")
clusters = clusters.as_clustering()
len(clusters)
nodes = [{"id": node.index, "label": node["name"]} for node in ig.vs]
for node in nodes:
    node["group"] = clusters.membership[node["id"]]
nodes[:5]
edges = [{"from": x[0], "to": x[1]} for x in ig.get_edgelist()]
edges[:5]
from scripts.vis import vis_network
vis_network(nodes, edges, physics=True)
Let's visualize (in 3D!) the structure of users retweeting and replying to other users.
import jgraph
query = """
MATCH (user1:User)-[:POSTS]->(:Tweet)-[:RETWEETS|REPLY_TO]->(:Tweet)<-[:POSTS]-(user2:User)
RETURN ID(user1), ID(user2)
"""
data = graph.run(query)
tup = [tuple(x) for x in data]
jgraph.draw(tup)