Social circles form a huge part of our lives. They encompass the individuals we interact with, how often we interact with them, and the modes of communication we use. With the rise of digital communication and networking, these interactions are now meticulously recorded by modern tools and algorithms. A brief look at social networking sites such as Facebook and LinkedIn allows us to easily gather information that characterizes our social circles, such as our network of friends and frequency of communication.
Unfortunately, such information has not always been so readily available. People who lived before the 21st century had neither the data-rich records of digital communication nor the tools to analyze large quantities of daily interactions. However, by applying modern concepts to historical records, we can potentially unearth some information about these individuals' social circles. In this project, we will use Natural Language Processing (NLP) to construct a social network for the bible.
To skip the methodology and proceed straight into the network, please click here.
First, we load the necessary modules for this exercise.
import sys
sys.path.append('shared/')
import defaults as _d
import helper as _h

# Load All Main Modules
_d.load({"pd": "pandas",
         "math": "math",
         "cl": "collections",
         "np": "numpy",
         "sp": "scipy",
         "re": "re",
         "mpl": "matplotlib",
         "plotly": "plotly",
         "nltk": "nltk",
         "wordcloud": "wordcloud",
         "PIL": "PIL",
         "operator": "operator",
         "nx": "networkx",
         "sklearn": "sklearn",
         "random": "random"},
        globals())

# Load All Submodules
from collections import OrderedDict
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
import plotly.offline as py
import plotly.graph_objs as py_go
from sklearn.cluster import MeanShift
# If you can't find a module, run nltk.download() in python
from nltk import sent_tokenize, word_tokenize

_d.stylize()
We will also define some helper functions to be used later on.
# -------------------------------------
# Genre-Related Functions
# -------------------------------------
def __get_genre_groups():
    global _genre_group
    if "_genre_group" not in globals():
        _genre_group = bible.groupby("Genre", sort=False)
    return _genre_group

def __get_genre_colors():
    global _genre_colors
    if "_genre_colors" not in globals():
        color_pal = _d.get_color("palette")(len(__get_genre_groups()))
        color_dict = dict()
        ind = 0
        for name, _ in __get_genre_groups():
            color_dict[name] = color_pal[ind]
            ind += 1
        _genre_colors = color_dict
    return _genre_colors

def __get_genre_legends(rev=True):
    global _genre_legends
    global _genre_legends_rev
    if "_genre_legends" not in globals():
        _genre_legends = [mpatches.Patch(color=_d.bg_color, label="Genre")]
        for name, group in __get_genre_groups():
            legend_text = name + " (" + group.index[0]
            if (len(group.index) > 1):
                legend_text += " - " + group.index[-1]
            legend_text += ")"
            _genre_legends.append(mpatches.Patch(color=__get_genre_colors()[name], label=legend_text))
        _genre_legends_rev = _genre_legends[:0:-1]
        _genre_legends_rev.insert(0, _genre_legends[0])
    if rev:
        return _genre_legends_rev
    else:
        return _genre_legends

# -------------------------------------
# Word-Cloud Related Functions
# -------------------------------------
def __word_cloud(input, fig_size=(20,10), image=None, colors=None):
    # Step 1: If there is an image specified, we need to create a mask
    mask = None
    if image is not None:
        mask = np.array(PIL.Image.open(image))
        if colors == "image_colors":
            colors = wordcloud.ImageColorGenerator(mask)
    # Step 2: Set up default colors
    def_colors = mpl.colors.ListedColormap(_d.get_color())
    # Step 3: Generate Word Cloud
    # https://stackoverflow.com/questions/43043437/wordcloud-python-with-generate-from-frequencies
    wc = wordcloud.WordCloud(height=fig_size[1]*100,
                             width=fig_size[0]*100,
                             font_path="fonts/{}.ttf".format(_d.def_font),
                             background_color=_d.bg_color,
                             mask=mask,
                             colormap=def_colors,
                             color_func=colors).generate_from_frequencies(input)
    # Step 4: Plot Word Cloud
    plt.figure(figsize=fig_size)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")

def __wc_color_func(character_freq_by_genre):
    # Create a color function that tints each word by the genre most associated with it
    def color_func(word, font_size, position, orientation, **kwargs):
        most_common_genre = character_freq_by_genre[word].most_common(1)[0][0]
        intensity = 1. * character_freq_by_genre[word][most_common_genre] / sum(character_freq_by_genre[word].values())
        return _d.pollute_color(__min_color, __get_genre_colors()[most_common_genre], intensity)
    return color_func

__get_legend_separator = mpatches.Patch(color=_d.bg_color, label="")

def __get_minmax_legends(input, title, key_format="{:.2f}"):
    output = []
    output.append(mpatches.Patch(color=_d.bg_color, label=title))
    max_item = max(input.items(), key=operator.itemgetter(1))
    output.append(mlines.Line2D([0], [0], marker='o', color=_d.bg_color,
                                label="Max: " + key_format.format(max_item[1]) + " - " + max_item[0],
                                markerfacecolor=_d.ltxt_color, markersize=20))
    min_item = min(input.items(), key=operator.itemgetter(1))
    output.append(mlines.Line2D([0], [0], marker='o', color=_d.bg_color,
                                label="Min: " + key_format.format(min_item[1]) + " - " + min_item[0],
                                markerfacecolor=_d.ltxt_color, markersize=10))
    return output

__min_color = _d.pollute_color(_d.bg_color, _d.txt_color, 0.4)

def __get_saturate_legends(title):
    output = []
    output.append(mpatches.Patch(color=_d.bg_color, label=title))
    output.append(mpatches.Patch(color=_d.get_color(0), label="Concentrated In 1 Genre"))
    output.append(mpatches.Patch(color=_d.pollute_color(__min_color, _d.get_color(0), 0.3),
                                 label="Spread Out Across\nMultiple Genres"))
    return output
In this exercise, we will be using the bible corpus from Kaggle. The data is indexed by abbreviated book keys, with each book carrying the following attributes: name, testament, genre, chapters, verses and the full text.
# Get all book statistics
abb = pd.read_csv("data/key_abbreviations_english.csv")\
        .query('p == 1')[["a","b"]]\
        .rename(columns={"a": "Key"})
ot_nt = pd.read_csv("data/key_english.csv")\
          .rename(columns={"n": "Name", "t": "Testament"})
genres = pd.read_csv("data/key_genre_english.csv")\
           .rename(columns={"n": "Genre"})

# Load the main biblical text
bible = pd.read_csv("data/t_asv.csv")\
          .groupby("b", as_index=False)\
          .agg({"c": pd.Series.nunique, "v": "size", "t": " ".join})\
          .rename(columns={"c": "Chapters", "v": "Verses", "t": "Text"})

# Perform some cleaning
bible['Text'] = bible['Text'].apply(lambda t: re.sub("[`]|['][^s]", "", t))

# Join the remaining book statistics
bible = bible.join(abb.set_index('b'), on='b')\
             .join(ot_nt.set_index('b'), on='b')\
             .join(genres.set_index('g'), on='g')\
             .drop(['b', 'g'], axis=1)\
             .set_index('Key')\
             [["Name","Testament","Genre","Chapters","Verses","Text"]]

# Show the first few lines
bible.head(5)
We will also derive some language statistics from each book, mainly the number of sentences and words:
# Add Sentences and Words columns
bible["Sentences"] = pd.Series(0, index=bible.index)
bible["Words"] = pd.Series(0, index=bible.index)

# Save Tokens
def get_tokens():
    sent_tokens = OrderedDict()
    word_tokens = OrderedDict()
    for i, r in bible[["Text"]].iterrows():
        txt = r.str.cat()
        sent_tokens[i] = sent_tokenize(txt)
        word_tokens[i] = word_tokenize(txt)
    return (sent_tokens, word_tokens)

sent_tokens, word_tokens = _h.cache(get_tokens, "bible_tokens")

for i in bible.index:
    bible.at[i,'Sentences'] = len(sent_tokens[i])
    # Exclude punctuation tokens when counting words
    bible.at[i,'Words'] = len([w for w in word_tokens[i] if re.match(r'\w+', w)])

# Show
bible[["Name","Testament","Genre","Chapters","Verses","Sentences","Words"]].head(5)
One of the most intuitive ways to appreciate the books' uneven lengths is to imagine doing devotions at one chapter a day. Under such a scenario, we would have the following timeline:
plt.figure(figsize=(20,5))

# Create Plots
yticks = []
ylabels = []
x_progress = 0
x_length = sum(bible["Chapters"])
y_progress = 0
y_length = len(bible["Chapters"])
for name, group in __get_genre_groups():
    row_ids = [bible.index.get_loc(i) for i in group.index]
    # Part 1: Bars When Genre Is Still Being Read
    length = 0
    # For each book in the genre
    for idx in row_ids:
        # If we cross a year boundary while reading this book, add a y-tick
        if (math.floor((x_progress + length)/365) < math.floor((x_progress + length + bible["Chapters"][idx])/365)):
            yticks.append(idx + 1)
            ylabels.append("{} ({}%)".format(bible.index[idx], round(idx/y_length * 100)))
        plt.broken_barh([(x_progress + length, bible["Chapters"][idx])],
                        (y_progress, (idx + 1) - y_progress),
                        facecolors=__get_genre_colors()[name])
        length += bible["Chapters"][idx]
    # Part 2: Bars When Genre Has Been Read
    plt.broken_barh([(x_progress + length, x_length - x_progress - length)],
                    (y_progress, max(row_ids) + 1 - y_progress),
                    facecolors=__get_genre_colors()[name])
    x_progress += length
    y_progress = max(row_ids) + 1

# Add Titles and Grid
plt.title("Chapter Distribution by Book")
plt.grid(color=_d.fade_color(_d.ltxt_color, 0.5), linestyle='dashed')

# Add X-Axis Details
plt.xlabel("Time Since Start")
xticks = [365, 2 * 365, 3 * 365, sum(bible["Chapters"])]
xlabels = ["Year 1", "Year 2", "Year 3", "Year 3\nMonth 3"]
plt.xticks(xticks, xlabels)
plt.xlim(0, x_length)

# Add Y-Axis Details
yticks.append(y_length)
ylabels.append("{} ({}%)".format(bible.index[-1], round(1 * 100)))
plt.ylabel("% of Books Completed")
plt.yticks(yticks, ylabels)
plt.ylim(0, y_length)

# Add Legends
plt.legend(handles=__get_genre_legends(), bbox_to_anchor=[1.27, 1.0])
plt.show()
By the end of the 1st year, we would have completed only 18% of the bible. If this is not discouraging enough, after a further year we would still not have finished the Old Testament (Law to Prophets). However, upon reaching the New Testament (Gospels to Apocalyptic), we could complete the whole set of books within 9 months. The Old Testament is, deceptively, at least 3 times longer than the New Testament!
Assuming that the average human reads 200 words per minute, we can also estimate how long it will take to read 1 chapter a day:
bible["Minutes_p_Chapter"] = bible["Words"] / bible["Chapters"] / 200.
inputs = []
deg_incr = 360. / len(bible.index)
for name, group in __get_genre_groups():
# Insert Legend Item
inputs.append(
py_go.Scatterpolar(
r = [0, 0, 0, 0],
theta = [0, 0, 0, 0],
name = name,
legendgroup = name,
mode = 'none',
fill = 'toself',
fillcolor = __get_genre_colors()[name],
showlegend = True
)
)
# Insert Each Book
for key, val in group["Minutes_p_Chapter"].items():
inputs.append(
py_go.Scatterpolar(
r = [0, val, val, 0],
theta = [0,bible.index.get_loc(key)*deg_incr,(bible.index.get_loc(key)+1)*deg_incr,0],
name = bible["Name"][key],
legendgroup = name,
mode = 'none',
hoverinfo ='text',
text=bible["Name"][key] + ": " + "{:.1f}".format(val) + " min",
fill = 'toself',
fillcolor = __get_genre_colors()[name],
showlegend = False
)
)
layout = py_go.Layout(_d.py_layout)
layout["autosize"] = False
layout["width"] = 450
layout["height"] = 350
layout["margin"] = dict(t=80,l=0,r=0,b=20)
layout["title"] = "Minutes Required to Read a Chapter"
layout["polar"]["angularaxis"]["visible"]=False
fig = py_go.Figure(data=inputs, layout=layout)
py.iplot(fig, config=_d.py_config)
From the chart above, we see that chapter lengths also vary widely across books. For example, a chapter in 1 Kings takes around 5.5 minutes to read, while a chapter in Psalms takes around 1.5 minutes.
After obtaining an overview of the bible, we move on to investigate the occurrences of various characters in the book.
The first point of interest is how often God appears in different books of the bible:
def find_occurrence(regex):
    output = OrderedDict()
    for name, group in __get_genre_groups():
        l = [len(re.findall(regex, wt.str.cat())) for _, wt in group[["Text"]].iterrows()]
        output[name] = (len(l), sum(l)/len(l))
    return output

entityToSearch = OrderedDict([('God', 'God|Lord|GOD|LORD'),
                              ('Father', 'Jehovah|Father'),
                              ('Son', 'Jesus|Christ|Emmanuel'),
                              ('Spirit', 'Spirit')])

ind = 0
# Construct Plots for Each Entity
f, splt = plt.subplots(1, len(entityToSearch.items()), figsize=(20,5))
for title, regex in entityToSearch.items():
    occurrences = find_occurrence(regex)
    splt[ind].set_title(title)
    splt[ind].set_xticks([])
    splt[ind].set_yticks([])
    x = 0
    for n, v in occurrences.items():
        splt[ind].bar([x + v[0]/2],
                      [v[1]],
                      color=__get_genre_colors()[n],
                      width=v[0])
        x += v[0]
    ind += 1

# Insert Legends
plt.legend(handles=__get_genre_legends(False), bbox_to_anchor=[2.2, 1.05])
plt.show()
Unsurprisingly, words associated with God the Father (Jehovah/Father) appear prominently in the Old Testament, while words associated with God the Son (Jesus/Christ) reach high frequencies in the Gospel narratives. Mentions of the Spirit peak in Acts. This sequence is in line with the story of the Gospel, where the events first recorded were between God the Father and His people, followed by Jesus Christ and his believers, and finally the Holy Spirit and the church.
(Note: One limitation of this approach is that it fails to capture symbols pointing to God. For example, the word "Lamb" in Revelation corresponds to Christ, but such symbols were excluded as they would introduce false positives.)
Using external sources, we can also obtain a list of major characters in the bible. This list can then be used as a reference for detecting names:
# Characters obtained from http://bibleblender.com/2014/biblical-lessons/biblical-history/complete-list-of-major-minor-characters-in-bible
characters_regex = 'Adam|Seth|Enos|Kenan|Mahalalel|Jared|Enoch|Methuselah|Lamech|Noah|Shem|Adam|Cain|Enoch|Irad|Mehujael|Methusael|Lamech|Tubal-cain|Arpachshad|Shelah|Eber|Peleg|Reu|Serug|Nahor|Terah|Abraham|Isaac|Jacob|Judah|Perez|Hezron|Ram|Amminadab|Nahshon|Salmon|Boaz|Obed|Jesse|David|Abel|Kenan|Enoch|Noah |Abraham|Isaac|Jacob|Joseph|Sarah|Rebecca|Rachel|Leah|Moses|Aaron|Miriam|Eldad|Medad|Phinehas|Joshua|Deborah|Gideon|Eli|Elkanah|Hannah|Abigail|Samuel|Gad|Nathan|David|Solomon|Jeduthun|Ahijah|Elijah|Elisha|Shemaiah|Iddo|Hanani|Jehu|Micaiah|Jahaziel|Eliezer|Zechariah|Huldah|Isaiah|Jeremiah|Ezekiel|Daniel|Hosea|Joel|Amos|Obadiah|Jonah|Micah|Nahum|Habakkuk|Zephaniah|Haggai|Zechariah|Malachi|Beor|Balaam|Job|Amoz|Beeri|Baruch|Agur|Uriah|Buzi|Mordecai|Esther|Oded|Azariah|Abimelech|Saul|Ish-boseth|David|Solomon|Jeroboam|Nadab|Baasha|Elah|Zimri|Tibni|Omri|Ahab|Ahaziah|Jehoram|Jehu|Jehoahaz|Jehoash|Jeroboam|Zechariah|Shallum|Menahem|Pekahiah|Pekah|Hoshea|Rehoboam|Abijam|Asa|Jehoshaphat|Jehoram|Ahaziah|Athaliah|Jehoash|Amaziah|Uzziah|Jotham|Ahaz|Hezekiah|Manasseh|Amon|Josiah|Jehoahaz|Jehoiakim|Jeconiah|Zedekiah|Simon|John|Aristobulus|Alexander|Hyrcanus|Aristobulus|Antigonus|Herod|Herod|Herod|Philip|Salome|Agrippa|Agrippa|Simon|Aaron|Eleazar|Eli|Phinehas|Asher|Benjamin|Dan|Gad|Issachar|Joseph|Ephraim|Manasseh|Judah|Levi|Naphtali|Reuben|Simeon|Zebulun|Jesus|Mary|Joseph|James|Jude|Joses|Simon|Peter|Andrew|James|John|Philip|Bartholomew|Thomas|Matthew|James|Judas|Simon|Judas|Matthias|Paul|Barnabas|James|Jude|Caiaphas|Annas|Zechariah|Agabus|Anna|Simeon|John|Apollos|Aquila|Dionysius|Epaphras|Joseph|Lazarus|Luke|Mark|Martha|Mary|Mary|Nicodemus|Onesimus|Philemon'
character_freq = []
for name, group in __get_genre_groups():
    names = [re.findall(characters_regex, wt.str.cat()) for _, wt in group[["Text"]].iterrows()]
    character_freq.extend((w, name) for lst in names for w in lst)

# The frequency of each character occurrence by genre
character_freq = nltk.ConditionalFreqDist(character_freq)

# Plot word cloud for each name
inputs = {}
for n, fd in character_freq.items():
    inputs[n] = sum(fd.values())
__word_cloud(inputs, colors=__wc_color_func(character_freq))

# Titles
plt.title("Major Character Occurrences")

# Legends
legend_cloud = list(__get_genre_legends(False))
legend_cloud.append(__get_legend_separator)
legend_cloud.extend(__get_saturate_legends("Concentration"))
legend_cloud.append(__get_legend_separator)
legend_cloud.extend(__get_minmax_legends(inputs, "Word Count", "{:d}"))
plt.legend(handles=legend_cloud, bbox_to_anchor=[1.31, 1.])
plt.show()
Based on the word cloud, David appears the most frequently in the bible. In addition, his appearances are concentrated within the History genre. This is in stark contrast to Jesus, whose name appears across multiple genres.
In order to construct a social network, we first need to identify the relevant characters in the bible. One approach is to take a list of names from external sources and use it to spot those names in the text. However, this method does not scale. To illustrate, suppose we would like to construct a similar network for "Oliver Twist". We would then need to find a list of names associated with that book. But what happens if no such list can be found?
Therefore, to reduce our reliance on external sources, we need to develop a more robust approach to name identification.
Fortunately, the nature of English grammar helps us here: names fall under the category of proper nouns, which we can detect using Part-of-Speech (POS) tagging.
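As a quick sanity check before tagging the entire corpus, consider what nltk's default tagger returns on a short made-up sentence (not a verse from the corpus); names surface with the NNP tag:
# Hypothetical sentence for illustration only
sample = "Jesus answered Peter near the Sea of Galilee"
print(nltk.tag.pos_tag(word_tokenize(sample)))
# e.g. [('Jesus', 'NNP'), ('answered', 'VBD'), ('Peter', 'NNP'), ..., ('Galilee', 'NNP')]
We can now apply the same tagging to every book in the corpus: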
def get_proper_noun_tokens():
    tagged_word_tokens = OrderedDict((n, nltk.tag.pos_tag(wt)) for n, wt in word_tokens.items())
    # Extract Only Proper Nouns and Add Index
    proper_noun_tokens = OrderedDict((n, [(i, w[0]) for i, w in enumerate(wt) if w[1] == "NNP"]) for n, wt in tagged_word_tokens.items())
    return proper_noun_tokens

proper_noun_tokens = _h.cache(get_proper_noun_tokens, "ppn_tokens")

# Print the 50 most common proper nouns
noun_freq = nltk.FreqDist(w for n, wt in proper_noun_tokens.items() for i, w in wt)
", ".join([n for n, v in noun_freq.most_common(50)])
Based on the output above, we have captured the majority of names in the bible. However, there are also some false positives, such as O, Go, Thy and Ye, that need to be removed. It is also interesting to see entities other than people being detected (e.g. Jerusalem, Babylon).
The first case to handle is words that are wrongly tagged as proper nouns (O, Go, Thy, Ye). To solve this, we simply exclude them from consideration:
false_npp = ['O','Thou','Behold','Go','Thy','Ye','My','A','Yea','Thus','Come',
             'Therefore','Wherefore','Be','So','Hear','ye','Psalm','Selah','Arise','Woe','King','Speak',
             'Almighty','Who','How','Chief','thy','Fear','Musician','Which','High','Take','Most',
             'Shall','Lo','Let','Praise','Make','Nay','Say','River','Art','Amen','South','Lest',
             'Bring','Oh','Remember','Did','Teacher','Sea','Whosoever','Do','Every','Unto','Know',
             'Are','Mine','See','Tell','Whoso','Gods','Wilt','Red','Holy','[',']','Mount','TR','Please',
             'Tent','Man','Passover','Meeting','Will','Again','Whoever','Savior','Ai','No','May','Heaven',
             'Whose','unto','Ah','Bless','Ascribe','Return','Seek','Day','Night','journeyed','Sit','Feed','Sirs','Prepare',
             'Good','Follow','Psalmof','Render']

# Filter out the false proper nouns, keeping each word's index
proper_noun_tokens = OrderedDict((n, [(i, w) for i, w in wt if w not in false_npp]) for n, wt in proper_noun_tokens.items())

# Print the 50 most common proper nouns after excluding false positives
noun_freq = nltk.FreqDist(w for n, wt in proper_noun_tokens.items() for i, w in wt)
", ".join([n for n, v in noun_freq.most_common(50)])
The second case to consider is non-human entities. Some examples are nations (Jerusalem, Babylon), locations (Galilee), symbols (Lord, Father, Son) and false idols (Baal). Since the relationships between non-human entities can yield useful insights, we will not exclude such words; instead, we expand our scope from humans to entities in general.
Using the proper-noun approach, we can then plot these entities in a word cloud:
# The frequency of each entity occurrence by genre
character_freq = nltk.ConditionalFreqDist((w[1], bible["Genre"][n]) for n, wt in proper_noun_tokens.items() for w in wt)

# Plot word cloud for each name
inputs = {}
for n, fd in character_freq.items():
    inputs[n] = sum(fd.values())
__word_cloud(inputs, colors=__wc_color_func(character_freq))

# Titles
plt.title("Entities in the Bible")

# Legends
legend_cloud = list(__get_genre_legends(False))
legend_cloud.append(__get_legend_separator)
legend_cloud.extend(__get_saturate_legends("Concentration"))
legend_cloud.append(__get_legend_separator)
legend_cloud.extend(__get_minmax_legends(inputs, "Word Count", "{:d}"))
plt.legend(handles=legend_cloud, bbox_to_anchor=[1.31, 1.])
plt.show()
As can be seen, we have now expanded the set of major characters (David, Jesus) into a larger collection of names, nations and symbols, among others. Some interesting patterns also emerge. For one, the word Jesus is dispersed across multiple genres, while the word Christ is concentrated within the Epistles!
After obtaining the list of entities to analyze, we can begin constructing the network.
The first step is to determine the building blocks. Vertices are defined as the entities obtained in the previous section. For simplicity, an edge between two vertices A and B exists if the two appear within a certain number of words of each other.
Since connections can range from acquaintances to best friends, it is also important to quantify the degree of connectivity between two vertices. To account for this, we define two measures: intimacy and proximity.
Intimacy is the degree of closeness between two entities. A higher intimacy indicates that two entities are close, while a lower intimacy suggests that the two are mere acquaintances.
Calculating intimacy involves the interplay of two factors: the number of repetitions and the word distance. If two entities appear near each other multiple times, it is reasonable to suggest that they are closer than a pair whose relationship is stated only once.
In addition, for each co-occurrence, we also account for the number of words between the two mentions. A distance of 0 may imply that both words refer to the same person (Jesus Christ), while a longer distance may imply that the two entities are several generations apart (as in the genealogy of Matthew 1).
Intimacy also determines the position of each node when we later lay out the graph.
Proximity is similar to intimacy, except that a smaller proximity implies closeness, while a larger proximity suggests distance. It is calculated by taking the inverse of intimacy, and is used to determine how near one entity is to another.
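To make the two measures concrete, here is a minimal sketch (using the same decay rate of 0.25 adopted below) of how co-occurrence distances translate into intimacy:
# Each co-occurrence contributes exp(-decay * word_distance) to intimacy;
# proximity is then max_intimacy / intimacy, so closer pairs get smaller values
def toy_intimacy(distances, decay=0.25):
    return sum(np.exp(-decay * d) for d in distances)

# Two entities 2 words apart on 3 occasions beat a pair mentioned once, 1 word apart
print(toy_intimacy([2, 2, 2]))  # ~1.82
print(toy_intimacy([1]))        # ~0.78
The actual computation over the proper-noun tokens follows: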
vertices = inputs

# Construction parameters
intimacies = {}
dist_l = 50
decay = 0.25

# Calculate the Degree of Intimacy
# The closer the names are together, the higher the degree of intimacy
# The more repetitions of names being close together, the higher the degree of intimacy
# Loop Through Each Book
for book, l in proper_noun_tokens.items():
    # Loop Through Each Entity
    for i in range(len(l)):
        ind, ent = l[i]
        # For a fixed entity, compare with neighboring entities
        for i_hat in range(i+1, min(i+1+dist_l, len(l))):
            ind_hat, ent_hat = l[i_hat]
            # If the two entities are close enough, create an edge connecting them
            if (abs(ind - ind_hat) <= dist_l and ent != ent_hat):
                pair = tuple(sorted([ent, ent_hat]))
                if (pair not in intimacies):
                    intimacies[pair] = np.exp(-1. * decay * abs(ind - ind_hat))
                else:
                    intimacies[pair] += np.exp(-1. * decay * abs(ind - ind_hat))

# Calculate proximity, which is the inverse of intimacy
max_intimacy = max(intimacies.values())
proximities = dict((k, 1. * max_intimacy / v) for k, v in intimacies.items())
Having defined the vertices and edges, we can now construct the social network.
# Construct the Graph
G = nx.Graph()
G.add_nodes_from(w for w, _ in vertices.items())
sorted_vertices = sorted(vertices.items(), key=operator.itemgetter(1), reverse=True)
G.add_weighted_edges_from((k[0], k[1], v) for k, v in intimacies.items())
nx.set_edge_attributes(G, values=proximities, name="proximity")

############################### Plotly Construction ###########################
# For illustrative purposes, let us show only the top 50 entities
n_show = 50
sub_G = G.subgraph(n for n, _ in sorted_vertices[:n_show])
pos = nx.spring_layout(sub_G, k=3.0)

############################### Prepare Vertices ##############################
# Construct Vertices Information for Plotly
v_plotly = {}
for node in sub_G.nodes():
    # Create Hover Text
    top_edges = sorted(G[node].items(), key=lambda v: v[1]["proximity"])[:5]
    appearances = vertices[node]
    connections = G.degree(node)
    hovertext = "<b>Name: </b>{}<br>".format(node) + \
                "<b>Word Appearances:</b> {:d}<br>".format(appearances) + \
                "<b># Connections:</b> {:d}<br>".format(connections) + \
                "<b>Closest Neighbors (and Proximity):</b><br>" + \
                "<br>".join("{} ({:.0f})".format(e[0], e[1]["proximity"]) for e in top_edges)
    v_plotly[node] = {
        "x": pos[node][0],
        "y": pos[node][1],
        "size": min(vertices[node] / 500. * 30., 30),
        "color": _d.fade_color(_d.txt_color, 0.3),
        "hovertext": hovertext,
        "rank": 9999  # Used later to determine which color to plot
    }

############################### Prepare Edges #################################
e_plotly = {}

# Define Edge Groups for Plotly
edge_groups = OrderedDict([
    ("Very Close (1-5)", {
        "range": [0, 5],
        "color": _d.get_color(0),
        "linew": 3.
    }),
    ("Close (6-25)", {
        "range": [6, 25],
        "color": _d.fade_color(_d.get_color(0), 0.4),
        "linew": 2.
    }),
    ("Normal (26-75)", {
        "range": [26, 75],
        "color": _d.fade_color(_d.get_color(0), 0.2),
        "linew": 1.
    }),
    ("Far (76-100)", {
        "range": [76, 100],
        "color": _d.fade_color(_d.get_color(0), 0.1),
        "linew": 0.75
    }),
    ("Very Far (>100)", {
        "range": [101, math.inf],
        "color": _d.fade_color(_d.get_color(0), 0.1),
        "linew": 0.25
    })
])

def find_edge_group_item(proximity):
    for k, v in edge_groups.items():
        l_b, u_b = v["range"]
        if l_b <= proximity <= u_b:
            return (k, v)

# Construct Edges for Plotly
for edge in sub_G.edges():
    # Get Distance and Group
    proximity = int(G[edge[0]][edge[1]]["proximity"])
    g_name, g_attrs = find_edge_group_item(proximity)
    # Positions
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    e_plotly[edge[0] + "-" + edge[1]] = {
        "nodes": [edge[0], edge[1]],
        "x": [x0, x1],
        "y": [y0, y1],
        "group": g_name,
        "color": g_attrs["color"],
        "linewidth": g_attrs["linew"],
        "proximity": proximity
    }
    # Update Information on Both End Vertices
    v_plotly[edge[0]]["rank"] = min(proximity, v_plotly[edge[0]]["rank"])
    v_plotly[edge[1]]["rank"] = min(proximity, v_plotly[edge[1]]["rank"])

# Update Vertices Color Based on Ranking
for k, v in v_plotly.items():
    _, g_attr = find_edge_group_item(v["rank"])
    v["color"] = g_attr["color"]

############################### Draw Elements ##################################
# Create Elements For Plotly
data = []

# Edges First So That Vertices Are Drawn In Front of Edges
for k, v in sorted(e_plotly.items(), key=lambda e: e[1]["proximity"], reverse=True):
    data.append(py_go.Scatter(
        mode="lines",
        line=py_go.scatter.Line(
            width=v["linewidth"],
            color=v["color"]
        ),
        x=v["x"],
        y=v["y"],
        legendgroup=v["group"],
        showlegend=False,
        hoverinfo="none"
    ))

# Vertices Next
data.append(py_go.Scatter(
    mode='markers+text',
    marker=py_go.scatter.Marker(
        color=[v["color"] for _, v in v_plotly.items()],
        size=[v["size"] for _, v in v_plotly.items()]),
    textposition='bottom center',
    x=[v["x"] for _, v in v_plotly.items()],
    y=[v["y"] for _, v in v_plotly.items()],
    text=[k for k in v_plotly.keys()],
    showlegend=False,
    hoverinfo='text',
    hovertext=[v["hovertext"] for _, v in v_plotly.items()]
))

# Append Legends
data.append(py_go.Scatter(
    name="Proximity",
    mode="none",
    x=[0, 0], y=[0, 0],
    showlegend=True,
    hoverinfo="none"
))
data.extend([py_go.Scatter(
    name=k,
    mode="lines",
    line=py_go.scatter.Line(
        color=v["color"]
    ),
    x=[0, 0], y=[0, 0],
    legendgroup=k,
    showlegend=True,
    hoverinfo="none"
) for k, v in edge_groups.items()])

# Layouts
layout = py_go.Layout(_d.py_layout)
layout["title"] = "Social Network of Top " + str(n_show) + " Entities in The Bible"
layout["title"]["x"] = 0.5
layout["width"] = 700
layout["height"] = 475
layout["hovermode"] = 'closest'
layout["xaxis"] = py_go.layout.XAxis(showgrid=False, zeroline=False, showticklabels=False)
layout["yaxis"] = py_go.layout.YAxis(showgrid=False, zeroline=False, showticklabels=False)
fig = py_go.Figure(data=data, layout=layout)
py.iplot(fig, config=_d.py_config)
The chart above provides a bird's-eye view of the social network. Some of these relationships are expected, such as how God, Jehovah, Lord, Jesus and Christ are closely interconnected. Others are surprising in comparison: for example, Moses has a closer proximity to God than either Abraham or David. It is also interesting to see some of the lesser-known relationships surface in the network (e.g. Joseph's children, Ephraim and Manasseh).
It can be extremely difficult to digest information just by eyeballing the chart above. In the next section, we will study the network by compressing the information into more palatable formats.
One important application of social network analysis is understanding how "close" you are to strangers. For example, suppose you wanted to pass a letter to the President through a friend-of-a-friend approach. How many intermediaries must the letter pass through before reaching the President? This number is what psychologist Stanley Milgram defined as the degree of separation between you and the President. His small-world experiment suggested that any two people are connected by around six degrees of separation.
In this section, we will study how close each entity is to the most important person in the bible - God. This is done by calculating the shortest path between God and the entity in focus.
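Concretely, we use Dijkstra's algorithm with the proximity edge attribute as the weight. On a hypothetical toy triangle, the cheapest route may pass through an intermediary rather than along the direct edge:
# Toy graph for illustration only (names and weights are made up)
toy = nx.Graph()
toy.add_edge("God", "B", proximity=5)
toy.add_edge("B", "C", proximity=5)
toy.add_edge("God", "C", proximity=20)
print(nx.dijkstra_path(toy, "God", "C", weight="proximity"))         # ['God', 'B', 'C']
print(nx.dijkstra_path_length(toy, "God", "C", weight="proximity"))  # 10
Applying the same computation to every pair of entities in our network (cached, since it is expensive):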
# Calculate Entities' Proximity To Each Other
def get_proximities():
    return dict(nx.all_pairs_dijkstra_path_length(G, weight='proximity'))

proximity_to = _h.cache(get_proximities, "all_dijkstra_paths")

# Parameters:
root_node = "God"
max_distance = 150
horizontal_nodes = ['God','Jehovah','Israel','Lord','Jesus','Moses']
max_size_highlight = 30
exclude_highlight_nodes = ['Nazareth','Saviour','Redeemer','Haggai']
to_root = proximity_to[root_node]

# Find Entities that are Within max_distance distances away from root node
to_root_sorted = sorted(to_root.items(), key=operator.itemgetter(1))
to_root_sorted = [(k, v) for k, v in to_root_sorted if v <= max_distance]

# Create Graph
G_to_root = nx.DiGraph()
G_to_root.add_node(root_node)
for k, v in to_root_sorted:
    nx.add_path(G_to_root, nx.dijkstra_path(G, root_node, k, "proximity"))

# Determine Positions For Each Node
def get_width(n):
    # Base Case: Leaf of Tree
    if (not G_to_root[n]):
        return 1
    else:
        width = 0
        for neighbor in G_to_root[n].keys():
            width += get_width(neighbor)
        return width

pos = {}
def get_position(root=root_node, start=0., width=100.):
    pos[root] = (start + width / 2., -1. * to_root[root])
    neighbors = sorted(G_to_root[root].keys(), key=lambda k: to_root[k])
    widths = dict((n, get_width(n)) for n in neighbors)
    total_width = sum(widths.values())
    s = start
    for n in neighbors:
        get_position(n, s, widths[n] / total_width * width)
        s += widths[n] / total_width * width
get_position()

# Plot on matplotlib
plt.figure(figsize=(30,12))
ax = plt.subplot(111)
plt.title("Proximity to " + root_node)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
plt.xticks([])
plt.ylabel("Proximity to " + root_node)
plt.yticks(range(0, -1 * max_distance, -20), range(0, max_distance, 20))

# Plot edges
highlighted_nodes = [k for k in G_to_root.nodes() if vertices[k] <= max_size_highlight and k not in exclude_highlight_nodes]

def plot_edges(root=root_node):
    for n in G_to_root[root].keys():
        plot_edges(n)
        is_highlighted = any(nx.has_path(G_to_root, n, hn) for hn in highlighted_nodes)
        plt.plot([pos[root][0], pos[n][0], pos[n][0]],
                 [pos[root][1], pos[root][1], pos[n][1]],
                 color=_d.get_color(0) if is_highlighted else _d.fade_color(_d.txt_color, 0.3),
                 linewidth=2. if is_highlighted else 1.)
plot_edges()

# Plot vertices
def plot_vertices():
    for n, p in pos.items():
        if (n in highlighted_nodes):
            bg_color = _d.get_color(0)
            color = _d.bg_color
            prefix = " "
            pad = 2
        else:
            if (any(nx.has_path(G_to_root, n, hn) for hn in highlighted_nodes)):
                color = _d.get_color(0)
            else:
                color = _d.fade_color(_d.txt_color, 0.5)
            bg_color = _d.bg_color + "dd"
            prefix = ""
            pad = 1
        plt.text(p[0], p[1], prefix + n,
                 size=25,
                 rotation=0 if n in horizontal_nodes else 90,
                 ha='center', va='center',
                 bbox={'facecolor': bg_color, 'edgecolor': 'none', 'pad': pad},
                 color=color)
plot_vertices()
plt.show()
The tree above shows the entities within a proximity of 150 from God. Unsurprisingly, many of the major characters in the bible (Jesus, Moses, David, Abraham) have close relationships with God.
However, we also notice some lesser-known entities that are closely connected to God (highlighted in the tree above).
Another important application of social network analysis is identifying the individuals who wield the most influence. For example, suppose you would like someone to post an endorsement video on YouTube. Which individual should you approach to garner the highest number of views? Typically, these individuals are closely connected to a large group of people and have many ardent followers.
Similarly, we would like to investigate which biblical entities hold a strong influence. To calculate influence, suppose we pass a message to A: how many other entities would eventually receive it? This depends on the proximity of A to the others in the network. To translate this into code, we first calculate the shortest paths from A to every other entity. Each shortest path length is then transformed into the probability of receiving the message via an exponential decay. These probabilities, one for each node, are summed to obtain the expected number of entities receiving the message, which we define as the Influence Level.
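As a minimal sketch, assuming a half-life of 50 (the same value adopted below), an entity at distance 50 receives the message with probability 0.5, at distance 100 with probability 0.25, and so on:
# Hypothetical distances for illustration only
toy_decay = math.log(2) / 50

def toy_influence(shortest_path_lengths):
    # Sum of per-entity probabilities = expected number of entities reached
    return sum(math.exp(-toy_decay * d) for d in shortest_path_lengths.values())

print(toy_influence({"A": 0, "B": 50, "C": 100}))  # 1 + 0.5 + 0.25 = 1.75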
After calculating the influence level of each node, we then cluster the levels using the Mean Shift algorithm to produce the chart below:
# We assume that for every 50 units of distance, the chance of information
# being passed on is reduced by 50%
half_life = 50
inf_decay = math.log(2) / half_life
influence = {}
for k, v in proximity_to.items():
    influence[k] = sum(math.exp(-1. * inf_decay * v1) for _, v1 in v.items())

# Cluster influences using MeanShift (with flat kernel and adjusted bandwidth)
# https://spin.atomicobject.com/2015/05/26/mean-shift-clustering/
# https://stackoverflow.com/questions/35094454/how-would-one-use-kernel-density-estimation-as-a-1d-clustering-method-in-scikit
sorted_influence = sorted(influence.items(), key=operator.itemgetter(1), reverse=True)
ms = MeanShift(bandwidth=2.5)
ms.fit(np.array([v for _, v in sorted_influence]).reshape(-1,1))
cluster_ind = ms.labels_

# Reorder clusters based on influence
cluster_set = []
for c_i in cluster_ind:
    if c_i not in cluster_set:
        cluster_set.insert(0, c_i)
cluster_ind = [cluster_set.index(c_i) for c_i in cluster_ind]

clusters = {}
# Add Groupings
for c_i in np.unique(cluster_ind):
    clusters[c_i] = [sorted_influence[i] for i in range(len(sorted_influence)) if cluster_ind[i] == c_i]

# Create Color Palette for Clusters
color_pal = _d.get_color("palette")(len(cluster_set)-1)
def get_cluster_color(group):
    if (group == 0):
        return _d.fade_color(_d.ltxt_color, 0.1)
    else:
        return _d.fade_color(_d.get_color(0), 0.2 + (1.0-0.2) * group / max(cluster_set))

# Plot
plt.figure(figsize=(30,6))
ax = plt.subplot(111)
plt.title("Influential Entities")
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
plt.yticks([])
plt.ylim([-1.2, 1.2])
plt.xlim([45, 0])
plt.xlabel("Influence Level")

entities_to_highlight = ["Hebron","Joab","Ahab","Zedekiah","Asa","Jehoshaphat"]
for c_i in np.unique(cluster_ind):
    cluster_values = [v for _, v in clusters[c_i]]
    # Plot Lines
    if (c_i == 0):
        min_x = min(cluster_values)
        plt.arrow(min_x, 0, -0.05, 0,
                  head_width=0.05, head_length=0.5,
                  edgecolor=get_cluster_color(c_i),
                  facecolor=get_cluster_color(c_i))
    else:
        min_x = min(cluster_values) - (min(cluster_values) - max(v for _, v in clusters[c_i-1]))/2. + 0.1
        plt.plot([min_x, min_x], [-0.1, 0.1], color=get_cluster_color(c_i))
    if (c_i == max(cluster_ind)):
        max_x = max(cluster_values)
        plt.arrow(max_x-0.5, 0, 0.05, 0,
                  head_width=0.05, head_length=0.5,
                  edgecolor=get_cluster_color(c_i),
                  facecolor=get_cluster_color(c_i))
    else:
        max_x = max(cluster_values) + (min(v for _, v in clusters[c_i+1]) - max(cluster_values))/2. - 0.1
        plt.plot([max_x, max_x], [-0.1, 0.1], color=get_cluster_color(c_i))
    plt.plot([min_x, max_x], [0, 0], color=get_cluster_color(c_i))
    # Plot Text
    cluster_texts = [k for k, _ in clusters[c_i]][:10]
    cutoff = math.ceil(len(cluster_texts)/2)
    for i in range(len(cluster_texts)):
        if cluster_texts[i] in entities_to_highlight:
            bg_color = get_cluster_color(c_i)
            fg_color = _d.bg_color
        else:
            bg_color = _d.bg_color
            fg_color = get_cluster_color(c_i)
        plt.text((min_x + max_x)/2.,
                 (cutoff - i - (1 if i >= cutoff else 0))*0.2,
                 cluster_texts[i],
                 size=25,
                 ha='center', va='center',
                 bbox={'facecolor': bg_color, 'edgecolor': 'none', 'pad': 2},
                 color=fg_color)
plt.show()
Unsurprisingly, Jehovah, God and Jesus belong to the two most influential groups in the bible. However, as in the previous section, we also notice some minor entities that are somewhat influential (highlighted above).
A community is a group of individuals who interact frequently with one another. In real life, this could be your family, colleagues or friends. Similarly, we would like to discover such communities in the bible. We suspect that there will be at least one large community, due to God's connections, as well as some isolated ones. For the purposes of this study, such extremely small and large communities will be excluded from the analysis.
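Community detection here relies on label propagation, where each node repeatedly adopts the label most common among its neighbors until the labels stabilize. A toy sketch on a graph with two obvious cliques (hypothetical names) illustrates the idea:
# Toy graph for illustration only: two triangles joined by a single bridge
toy = nx.Graph()
toy.add_edges_from([("A","B"),("B","C"),("A","C"),
                    ("X","Y"),("Y","Z"),("X","Z"),
                    ("C","X")])
random.seed(1)
toy_communities = nx.algorithms.community.asyn_lpa_communities(toy)
print([sorted(c) for c in toy_communities])  # typically [['A','B','C'], ['X','Y','Z']]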
Listed below are some communities of interest we detected through label propagation techniques:
# Detect Community Using Label Propagation
def get_communities():
    random.seed(241)
    return nx.algorithms.community.asyn_lpa_communities(G.subgraph(n for n, v in vertices.items() if v >= 5), weight="weight")

communities = _h.cache(get_communities, "all_communities")

output = []
for d in communities:
    if 5 <= len(d) < 10:
        output.append(d)
output = sorted(output, key=lambda l: len(l))
for d in output:
    print("Community " + str(output.index(d) + 1) + ": " + ", ".join(sorted(d)))

ind = 3
plt.figure(figsize=(10,5))
nx.draw_circular(G.subgraph(output[ind-1]),
                 with_labels=True,
                 node_color=_d.bg_color,
                 node_size=1000,
                 node_shape='s',
                 edge_color=_d.fade_color(_d.txt_color, 0.2),
                 font_family=_d.def_font,
                 font_size=15,
                 font_color=_d.txt_color)
plt.show()
By detecting entities using Part-of-Speech tagging and determining connections through word proximities, we have successfully constructed a social network for the bible. A quick dive into the network also provides insights into minor characters that are often overlooked.
The approach described in this project is relatively simple and by no means perfect. To construct the network more accurately, we would need a more robust entity-detection algorithm, as well as a more intricate edge definition. These are among the directions for future work on this project.