21 Recipes for Mining Twitter [Social Network Analysis] Hoon-Young Jung
1.5 Extracting a Retweet’s Origins Problem – You want to extract the originating source from a retweet. (리트윗된 글의 출처를 알고 싶다.) Solution – If the tweet’s retweet_count field is greater than 0, extract the name out of the tweet’s user field (트윗의 retweet_count 필드가 0보다 큰 경우, 트윗의 user 필드에서 사용자 이름을 추출); also parse the text of the tweet with a regular expression (또한 정규식으로 트윗의 텍스트를 분석).
1.5 Extracting a Retweet’s Origins Example Extracting retweet origins
# 1.5 Extracting a Retweet's Origins
# Example: Extracting retweet origins

import re

# Matches the "legacy" retweet conventions "RT @user ..." and "via @user ...",
# capturing the run of @mentions that directly follows the marker.
# NOTE(review): the pattern literal was lost when the slide text was
# extracted; this is the commonly used RT/via pattern -- confirm against the
# original recipe. The leading \b is deliberate: without it "rt" inside a
# word such as "smart" would be treated as a retweet marker.
_RT_PATTERN = re.compile(r"\b(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)


def get_rt_origins(tweet):
    """Return the names a tweet appears to have been retweeted from.

    Two signals are combined:

    * if ``tweet['retweet_count']`` is present and greater than 0, the
      lowercased ``tweet['user']['name']`` is taken as an origin;
    * the tweet text is scanned for the legacy ``RT @user`` / ``via @user``
      conventions. Only the first occurrence is used, but it may carry
      several consecutive @mentions.

    Args:
        tweet: dict with at least a ``'text'`` key; ``'retweet_count'`` and
            ``'user'`` are optional.

    Returns:
        A list of unique origin names, lowercased and without the leading
        "@", in first-seen order. Empty when no retweet signal is found.

    Raises:
        KeyError: if ``'text'`` is missing, or if ``'retweet_count'`` is
            positive but ``tweet['user']['name']`` is missing.
    """
    rt_origins = []

    # Inspect the tweet to see if it was produced with /statuses/retweet/:id.
    # A missing or None count is treated the same as zero.
    if (tweet.get('retweet_count') or 0) > 0:
        rt_origins.append(tweet['user']['name'].lower())

    # Also, inspect the tweet for the presence of "legacy" retweet
    # patterns such as "RT" and "via".
    match = _RT_PATTERN.search(tweet['text'])
    if match:
        rt_origins.extend(match.group(2).split())

    # Filter out any duplicates while keeping a deterministic order.
    unique_origins = []
    seen = set()
    for rto in rt_origins:
        name = rto.strip('@').lower()
        if name not in seen:
            seen.add(name)
            unique_origins.append(name)
    return unique_origins
# 1.5 Extracting a Retweet's Origins
# Example: Extracting retweet origins (usage demo)

if __name__ == '__main__':

    # A mocked up array of tweets for purposes of illustration.
    # Assume tweets have been fetched from the /search resource or elsewhere.
    # NOTE(review): parts of the 'text' literals (the leading mentions and a
    # link) were lost when the slide text was extracted; they are
    # reconstructed here so that the first tweet exercises the legacy "RT"
    # path and the second the retweet_count path -- confirm against the
    # original recipe.
    tweets = [
        {
            'text': 'RT @ptwobrussell Get @SocialWebMining example code at #w00t',
            # ... more tweet fields ...
        },
        {
            'text': 'Get @SocialWebMining example code at #w00t',
            'retweet_count': 1,
            'user': {
                'name': 'ptwobrussell',
                # ... more user fields ...
            },
            # ... more tweet fields ...
        },
        # ... more tweets ...
    ]

    for tweet in tweets:
        # Parenthesised form prints the same list on Python 2 and Python 3.
        print(get_rt_origins(tweet))
1.6 Looking Up the Trending Topics Problem – You want to construct and analyze a graph data structure of retweet relationships for a set of query results. (쿼리 결과 집합에 대한 리트윗 관계 그래프 데이터 구조를 구축하고 분석하고 싶다.) Solution – Query for the topic, extract the retweet origins, and then use the NetworkX package to construct a graph to analyze. (주제에 대해 쿼리하여 리트윗 출처를 추출한 다음, NetworkX 패키지를 사용하여 분석할 그래프를 생성한다.)
# 1.6 Looking Up the Trending Topics
# Example: Creating a graph of retweet relationships

# -*- coding: utf-8 -*-

import sys
import json
import twitter
import networkx as nx
from recipe__get_rt_origins import get_rt_origins


def create_rt_graph(tweets):
    """Build a directed graph of who was retweeted by whom.

    For every tweet, each origin reported by ``get_rt_origins`` becomes an
    edge ``origin -> tweet['from_user']`` carrying the tweet's id in the
    ``tweet_id`` edge attribute. Tweets without any retweet origin add
    nothing to the graph.

    Args:
        tweets: iterable of tweet dicts; each must provide ``'from_user'``
            and ``'id'`` in addition to whatever ``get_rt_origins`` reads.

    Returns:
        A ``networkx.DiGraph`` with one node per user name involved in a
        retweet relationship.
    """
    g = nx.DiGraph()
    for tweet in tweets:
        # An empty origin list simply skips the inner loop.
        for rt_origin in get_rt_origins(tweet):
            # Non-ASCII characters are dropped from node names.
            # NOTE(review): on Python 3 .encode() yields bytes node names --
            # confirm which interpreter this recipe targets.
            g.add_edge(
                rt_origin.encode('ascii', 'ignore'),
                tweet['from_user'].encode('ascii', 'ignore'),
                # Keyword form: a positional attribute dict is rejected by
                # newer NetworkX releases, keywords work on old and new.
                tweet_id=tweet['id']
            )
    return g
# 1.6 Looking Up the Trending Topics
# Example: Creating a graph of retweet relationships (script entry point)

if __name__ == '__main__':

    # Your query: all command-line arguments joined into one search string.
    # (Joining sys.argv[1] alone would insert a space between every
    # character of the first argument.)
    Q = ' '.join(sys.argv[1:])
    if not Q:
        sys.stderr.write("Usage: %s QUERY [MORE QUERY TERMS...]\n" % (sys.argv[0],))
        sys.exit(1)

    # How many pages of data to grab for the search results
    MAX_PAGES = 15

    # How many search results per page
    RESULTS_PER_PAGE = 100

    # Get some search results for a query
    twitter_search = twitter.Twitter(domain='search.twitter.com')
    search_results = []
    for page in range(1, MAX_PAGES + 1):
        search_results.append(
            twitter_search.search(q=Q, rpp=RESULTS_PER_PAGE, page=page)
        )
    # Alternative noted on the slide, using tweepy instead of the twitter
    # package (kept for reference only):
    #   result_list = tweepy.api.search(q=Q, rpp=RESULTS_PER_PAGE, page=page)
    #   search_result.extend(result_list)

    # Flatten the pages into a single list of tweets.
    all_tweets = [
        tweet
        for results_page in search_results
        for tweet in results_page['results']
    ]

    # Build up a graph data structure
    g = create_rt_graph(all_tweets)

    # Print out some stats (to stderr, as in the original recipe)
    sys.stderr.write("Number nodes: %s\n" % (g.number_of_nodes(),))
    sys.stderr.write("Num edges: %s\n" % (g.number_of_edges(),))
    # number_connected_components works whether connected_components returns
    # a list or a generator, unlike len(connected_components(...)).
    sys.stderr.write(
        "Num connected components: %s\n"
        % (nx.number_connected_components(g.to_undirected()),)
    )
    # dict(...) accepts both the dict and the view that degree() can return;
    # sorting the values reports degrees rather than node names.
    sys.stderr.write(
        "Node degrees: %s\n" % (sorted(dict(g.degree()).values()),)
    )