Commit 4c49b267 authored by Jim Hoekstra's avatar Jim Hoekstra 👋🏻
Browse files

Merge branch 'develop' into 'master'

Merge develop into master

See merge request !13
parents beeb826a c921e7ad
# -*- coding: utf-8 -*-
Created on Wed Mar 31 12:09:38 2021
@author: Julian Bianco-Martinez
Graph Expansion by Intersection of Concepts (GEIC)
import pandas as pd
import numpy as np
def concepts_extraction_list(lst1):
    """Extract the concepts from word2vec ``most_similar`` output.

    Parameters
    ----------
    lst1 : iterable of sequences
        Entries whose first element is the concept, e.g. the
        ``(word, score)`` pairs passed in by ``GEIC``.

    Returns
    -------
    list
        The first element of every entry, in the original order.
    """
    return [entry[0] for entry in lst1]
def GEIC(concepts, words_to_exclude=None, topX_intersect_concepts=5, topXsimilarConcepts=100, threshold=0.5):
    """Graph Expansion by Intersection of Concepts (GEIC).

    From a set of original concepts C0, collect and weight the similar
    concepts shared by pairs of concepts in C0.

    Parameters
    ----------
    concepts : list of str
        Original concepts (C0).
    words_to_exclude : list of str, optional
        New concepts that appear in this list are removed from the result.
        When omitted or empty, ``concepts`` itself is used.
    topX_intersect_concepts : int
        Only keep the first X concepts shared by each pair of original
        concepts, i.e. ``IC(i, j)[:topX_intersect_concepts]``.
    topXsimilarConcepts : int
        Number of similar concepts retrieved per concept in C0.
    threshold : float
        Cut-off: only intersected concepts whose weight is strictly greater
        than ``threshold`` are kept.

    Returns
    -------
    pandas.DataFrame
        Four columns:
        'Concept 1'    : C0(i)
        'Concept 2'    : C0(j)
        'Intersection' : a concept from IC(i, j)[:topX_intersect_concepts]
        'weight'       : number of pairs in which the intersected concept
                         appears, divided by the number of unordered pairs
                         of C0 concepts.
        An empty frame with these columns is returned when fewer than two
        concepts are given or when no pair shares a similar concept.

    Notes
    -----
    Relies on the module-level ``model`` (queried through ``most_similar``)
    and on ``concepts_extraction_list``.
    """
    from itertools import combinations

    pair_columns = ['Concept 1', 'Concept 2', 'Intersection']
    result_columns = pair_columns + ['weight']

    # A fresh list per call replaces the shared mutable default argument.
    if not words_to_exclude:
        words_to_exclude = list(concepts)
    excluded = set(words_to_exclude)

    n_concepts = len(concepts)
    if n_concepts < 2:
        # No pair can be formed, so there is nothing to intersect.
        return pd.DataFrame(columns=result_columns)

    # Query the model once per concept instead of once per pair.
    similar = {
        concept: concepts_extraction_list(
            model.most_similar(concept, topn=topXsimilarConcepts))
        for concept in concepts
    }

    # Triangular iteration (i < j) because the intersection is symmetric.
    rows = []
    for first, second in combinations(concepts, 2):
        in_second = set(similar[second])
        shared = [word for word in similar[first] if word in in_second]
        rows.extend(
            {'Concept 1': first, 'Concept 2': second, 'Intersection': word}
            for word in shared[:topX_intersect_concepts]
        )
    if not rows:
        return pd.DataFrame(columns=result_columns)
    df_pairs = pd.DataFrame(rows, columns=pair_columns)

    # Weight creation: occurrences normalised by the number of unordered
    # pairs, n * (n - 1) / 2 (same value as 0.5 * n! / (n - 2)!).
    n_pairs = n_concepts * (n_concepts - 1) / 2
    weights = (df_pairs['Intersection']
               .value_counts()
               .rename_axis('Intersection')
               .reset_index(name='weight'))
    weights['weight'] = weights['weight'] / n_pairs

    # Remove words with fewer than 4 characters and excluded words.
    keep = [len(word) > 3 and word not in excluded
            for word in df_pairs['Intersection']]
    df_extension = df_pairs[keep].merge(weights, on="Intersection")
    return df_extension.loc[df_extension['weight'] > threshold]
# Example run: expand three seed concepts, keeping up to 15 shared
# similar concepts per pair (the default is 5).
concepts = ['king', 'queen', 'prince']
GEIC(concepts, topX_intersect_concepts = 15)
# -*- coding: utf-8 -*-
Starting date: 09-02-21
Code for MSX: Extension of candidate words.
words_to_sets: Query an existing word2vec model for similar words, then
prune the similar words by string distance.
Input: set of words
Output: dictionary with original + similar words + similarity scores
sets_to_network: Converts part of the output of words_to_sets function into 2 objects
to be loaded in dash-cytoscape.
Input: dictionary with original + similar words
Output: two arrays to be used in dash-cytoscape
The code uses an existing pretrained word2vec model and
string/substring distances.
Author: Julian Bianco Martinez
### Loading model
import gensim.downloader as api
# Different Pre-trained Embeddings
#model = api.load('word2vec-google-news-300')
model = api.load('glove-twitter-200')
model.most_similar('people', topn=100)
terms = """Jargon
Interactive tool
words_set = terms
#%% Functions
def words_to_sets(words_set):
    """Look up similar words for every term and prune near-duplicates.

    Parameters
    ----------
    words_set : str
        Newline-separated terms. They are lower-cased and spaces are
        replaced by underscores before the lookup.

    Returns
    -------
    similar_terms : dict
        Maps each term found by the model to at most 10 tuples
        ``(similar_word, score, string_dist, substring)``, where
        ``string_dist`` is ``stringdist.rdlevenshtein_norm(term, similar_word)``
        and ``substring`` is ``similar_word.find(term)``.
    words_not_vocabulary : list
        Terms for which the model lookup raised ``KeyError``.

    Notes
    -----
    Relies on the module-level ``model`` (queried through ``most_similar``).
    """
    from tqdm import tqdm
    import stringdist as sdi

    terms = [line.replace(' ', '_') for line in words_set.lower().split("\n")]
    similar_terms = {}
    words_not_vocabulary = []
    for term in tqdm(terms):
        try:
            neighbours = model.most_similar(term, topn=20)
        except KeyError:
            # NOTE(review): the handler body is reconstructed from the name
            # of the returned list - confirm this is the intended handling.
            words_not_vocabulary.append(term)
            continue

        kept = []
        for candidate, score in neighbours:
            string_dist = sdi.rdlevenshtein_norm(term, candidate)
            substring = candidate.find(term)
            # Keep lower-case, non-URL candidates that are neither close in
            # edit distance to the term nor contain it as a substring.
            if (candidate == candidate.lower()
                    and "www" not in candidate
                    and string_dist > 0.5
                    and substring == -1):
                kept.append((candidate, score, string_dist, substring))
        similar_terms[term] = kept[:10]
    return similar_terms, words_not_vocabulary
def sets_to_network(similar_terms):
    """Build the Cytoscape node and edge lists from the similar-terms mapping.

    Parameters
    ----------
    similar_terms : dict
        Maps an original term to a sequence of entries whose first element
        is a similar word (e.g. the tuples produced by ``words_to_sets``).

    Returns
    -------
    nodes : list of dict
        One element per distinct word. A word first seen as a key gets the
        class 'followerNode'; a word first seen as a similar word gets
        'followingNode'.
    edges : list of dict
        One ``{'data': {'source': term, 'target': similar_word}}`` element
        per (term, similar word) pair.
    """
    nodes = []
    edges = []
    # Track ids separately: ``nodes`` holds dicts, so testing a word against
    # it can never match and would let duplicate ids through.
    seen_ids = set()
    for item, neighbours in similar_terms.items():
        if item not in seen_ids:
            seen_ids.add(item)
            nodes.append({"data": {"id": item, "label": item}, "classes": 'followerNode'})
        for neighbour in neighbours:
            target = neighbour[0]
            if target not in seen_ids:
                seen_ids.add(target)
                nodes.append({"data": {"id": target, "label": target}, "classes": 'followingNode'})
            edges.append({'data': {'source': item, 'target': target}})
    return nodes, edges
similar_terms, words_not_vocabulary = words_to_sets(terms)
nodes, edges = sets_to_network(similar_terms)
### END Functions
#%% Dash
import dash
import dash_cytoscape as cyto
import dash_html_components as html
#####Cyto Code
app = dash.Dash(__name__)
# define layout
default_stylesheet = [
"selector": 'node',
"label": "data(label)"
'selector': '.followerNode',
'style': {
'background-color': '#00AAD9'
app.layout = html.Div([
#style={'width': '100%', 'height': '400px'},
layout = {'name': 'cose'},
elements = edges + nodes,
if __name__ == '__main__':
\ No newline at end of file
import dash
import os
from dash_app.layout import external_stylesheets
app = dash.Dash(name=__name__, external_stylesheets=external_stylesheets, url_base_pathname='/msx/',
URL_PREFIX = '/msx/'
if os.getenv('MSX_URL_PREFIX', False):
app = dash.Dash(name=__name__, external_stylesheets=external_stylesheets, url_base_pathname=URL_PREFIX,
......@@ -34,16 +34,17 @@ def update_base_word(submit_word_button, base_word_input):
Output(component_id='graph-elements-div', component_property='children'),
Input(component_id='submit-word-button', component_property='n_clicks'),
Input(component_id='add-word-button', component_property='n_clicks'),
Input(component_id='extend-graph-button', component_property='n_clicks'),
Input(component_id='remove-word-button', component_property='n_clicks'),
State(component_id='base-word-input', component_property='value'),
State(component_id='add-word-input', component_property='value'),
State(component_id='graph-elements-div', component_property='children'),
State(component_id='msx-graph', component_property='tapNodeData'),
State(component_id='msx-graph', component_property='selectedNodeData'),
State(component_id='base-word-div', component_property='children'),
def update_graph_elements(submit_word_button, add_word_button, remove_word_button, base_word_input, add_word_input,
nodes_and_edges, selected_nodes, base_word_state):
def update_graph_elements(submit_word_button, add_word_button, extend_graph_button, remove_word_button, base_word_input,
add_word_input, nodes_and_edges, selected_nodes, base_word_state):
callback_context = dash.callback_context
button_id = callback_context.triggered[0]['prop_id'].split('.')[0]
......@@ -65,16 +66,24 @@ def update_graph_elements(submit_word_button, add_word_button, remove_word_butto
raise PreventUpdate
if button_id == 'extend-graph-button':
graph.extend_graph(word2vec_model, base_word_state)
new_nodes_and_edges = graph.get_nodes_and_edges()
return json.dumps(new_nodes_and_edges)
if button_id == 'remove-word-button':
if selected_nodes is not None:
selected_word = selected_nodes['label']
if selected_word in graph.get_all_words() and selected_word != base_word_state:
new_nodes_and_edges = json.dumps(graph.get_nodes_and_edges())
return new_nodes_and_edges
raise PreventUpdate
if len(selected_nodes) > 0:
for selected_node in selected_nodes:
selected_word = selected_node['id']
if selected_word in graph.get_all_words() and selected_word != base_word_state:
raise PreventUpdate
new_nodes_and_edges = json.dumps(graph.get_nodes_and_edges())
return new_nodes_and_edges
raise PreventUpdate
......@@ -93,8 +102,14 @@ def update_graph(nodes_and_edges):
Output(component_id='msx-graph', component_property='autoRefreshLayout'),
Input(component_id='add-word-button', component_property='n_clicks'),
Input(component_id='extend-graph-button', component_property='n_clicks'),
Input(component_id='remove-word-button', component_property='n_clicks'),
def set_auto_refresh_layout(add_word_button, remove_word_button):
return False
def set_auto_refresh_layout(add_word_button, extend_graph_button, remove_word_button):
    """Return the 'autoRefreshLayout' value based on which button fired.

    The id of the triggering component is read from
    ``dash.callback_context``: False for the add-word and remove-word
    buttons, True for the extend-graph button. Any other trigger falls
    through and returns None.
    """
    triggered_prop = dash.callback_context.triggered[0]['prop_id']
    source_id = triggered_prop.split('.')[0]
    if source_id in ('add-word-button', 'remove-word-button'):
        return False
    if source_id == 'extend-graph-button':
        return True
......@@ -6,6 +6,8 @@ class Graph:
def __init__(self):
self.nodes = []
self.edges = []
self.MAX_NUM_WORDS = 10
def get_all_words(self):
all_words = [node_dict['data']['label'] for node_dict in self.nodes]
......@@ -99,3 +101,22 @@ class Graph:
def extend_graph(self, word2vec_model, base_node):
    """Attach to ``base_node`` the words associated with several graph words.

    For every word currently in the graph, up to 100 associated words are
    requested from ``word2vec_model``. Words not yet in the graph are
    counted across all requests, and those reaching the count threshold are
    passed to ``self.add_edges(base_node, ...)``. The threshold starts at
    ``self.COUNT_THRESHOLD`` and is raised one step at a time while more
    than ``self.MAX_NUM_WORDS`` words qualify, so the final selection may
    contain fewer words than the maximum (possibly none).

    Parameters
    ----------
    word2vec_model :
        Object exposing ``get_associated_words(word, top_n=...)``; assumed
        to return an iterable of words - TODO confirm.
    base_node :
        Forwarded unchanged to ``self.add_edges`` as its first argument.

    NOTE(review): ``COUNT_THRESHOLD`` is not set in the visible ``__init__``;
    confirm it is defined on the instance.
    """
    from collections import Counter

    current_words = [node['data']['id'] for node in self.nodes]
    known_words = set(current_words)

    # Accumulate the associations of every current word; fetching them
    # without accumulating leaves nothing to count.
    association_counts = Counter()
    for current_word in current_words:
        associated_words = word2vec_model.get_associated_words(current_word, top_n=100)
        association_counts.update(
            word for word in associated_words if word not in known_words)

    count_threshold = self.COUNT_THRESHOLD
    common_associated_words = [
        word for word, count in association_counts.items() if count >= count_threshold]
    while len(common_associated_words) > self.MAX_NUM_WORDS:
        count_threshold += 1
        common_associated_words = [
            word for word, count in association_counts.items() if count >= count_threshold]

    self.add_edges(base_node, common_associated_words)
......@@ -44,7 +44,10 @@ layout = html.Div(children=[
html.Div(className='col-2', children=[
html.Button(id='add-word-button', n_clicks_timestamp=0, children='Add Association', className='btn btn-success btn-lg'),
html.Div(className='col-4', children=[
html.Div(className='col-2', children=[
html.Button(id='extend-graph-button', n_clicks_timestamp=0, children='Extend Graph', className='btn btn-success btn-lg'),
html.Div(className='col-2', children=[
html.Div(className='col-3', children=[
html.Button(id='remove-word-button', n_clicks_timestamp=0, children='Remove Selected Association', className='btn btn-danger btn-lg')]),
......@@ -5,13 +5,12 @@ import stringdist as sdi
class AssociatedWords:
def __init__(self):
self.N_RESULTS = 10
print("\n Word2Vec model is loading.This can take a couple of minutes.")
print("\n Word2Vec model is loading. This can take a couple of minutes.")
self.model = api.load('glove-twitter-200')
print(" Word2Vec model is ready. Enjoy!!!\n")
def get_associated_words(self, word):
gensim_result = self.model.most_similar(word, topn=self.N_RESULTS)
def get_associated_words(self, word, top_n=10):
    """Return the filtered words most similar to ``word``.

    Asks ``self.model.most_similar`` for the ``top_n`` nearest entries and
    passes that result, together with ``word``, through
    ``self.filter_results``.
    """
    nearest = self.model.most_similar(word, topn=top_n)
    return self.filter_results(nearest, word)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment