Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
FoodInformatics
msx-tool
Commits
d74e6bc2
Commit
d74e6bc2
authored
Apr 09, 2021
by
Jim Hoekstra
👋🏻
Browse files
issues MSX-29 and MSX-30
parent
beeb826a
Changes
5
Hide whitespace changes
Inline
Side-by-side
Julian_BM/w2v_stringdist_cyto.py
deleted
100644 → 0
View file @
beeb826a
# -*- coding: utf-8 -*-
"""
Starting date: 09-02-21
Code for MSX: Extension of candidate words.
Functions:
words_to_sets: Check existent word2vec model for words similarities. Also
prune the similar words by string distances.
Input: set of words
Output: dictionary with original + similar words + similarity scores
sets_to_network: Converts part of the output of words_to_sets function into 2 objects
to be loaded in dash-cytoscape.
Input: dictionary with original + similar words
Output: two arrays to be used in dash-cyto
Codes include the use of an existent
word2vec pretrained model and string/substring distance.
Author: Julian Bianco Martinez
"""
### Loading model
import
gensim.downloader
as
api
# Different Pre-trained Embeddings
#model = api.load('word2vec-google-news-300')
# Fetch the pretrained embeddings through the gensim downloader (`api`).
model = api.load('glove-twitter-200')

# Exploratory query; the returned neighbours are not stored anywhere.
model.most_similar('people', topn=100)

# Example input: one candidate term per line.
terms = """Jargon
Domain
Standardisation
Interactive tool
word2vec
NLP
Collaboration
Vocabulary
Graph
Meaning
Tool
Communication
People
Terms
Associations
Efficient"""

# Alias kept for the interactive cells further down the script.
words_set = terms
#%% Functions
def words_to_sets(words_set):
    """Look up similar words for every term in a newline-separated string.

    The input is lower-cased, split on newlines, and spaces inside a term
    are replaced by underscores. For each term the module-level ``model``
    is asked for its 20 most similar words; each neighbour is annotated
    with ``sdi.rdlevenshtein_norm(term, neighbour)`` and
    ``neighbour.find(term)`` and then pruned.

    A neighbour is kept only if it is already lower-case, does not contain
    ``"www"``, has a string-distance value above 0.5 and does not contain
    the query term as a substring. At most 10 neighbours are kept per term.

    Args:
        words_set: String with one term per line.

    Returns:
        A ``(similar_terms, words_not_vocabulary)`` tuple. ``similar_terms``
        maps each known term to a list of
        ``(word, score, string_dist, substring_index)`` tuples, and
        ``words_not_vocabulary`` lists the terms for which the model raised
        ``KeyError``.
    """
    from tqdm import tqdm
    import stringdist as sdi

    queries = [line.replace(' ', '_') for line in words_set.lower().split("\n")]

    similar_terms = {}
    words_not_vocabulary = []

    for query in tqdm(queries):
        try:
            neighbours = model.most_similar(query, topn=20)
        except KeyError:
            # The model does not know this term; report it and move on.
            words_not_vocabulary.append(query)
            continue

        kept = []
        for candidate, score in neighbours:
            string_dist = sdi.rdlevenshtein_norm(query, candidate)
            substring = candidate.find(query)

            if candidate != candidate.lower():
                continue
            if "www" in candidate:
                continue
            if not string_dist > 0.5:
                continue
            if substring != -1:
                # The query occurs inside the candidate (e.g. a plural form).
                continue
            kept.append((candidate, score, string_dist, substring))

        similar_terms[query] = kept[:10]

    return similar_terms, words_not_vocabulary
def sets_to_network(similar_terms):
    """Convert a term -> similar-words mapping into dash-cytoscape elements.

    Args:
        similar_terms: Mapping from a source term to an iterable of
            sequences whose first item is the similar word (for example the
            tuples produced by ``words_to_sets``).

    Returns:
        A ``(nodes, edges)`` tuple of lists. Every node is
        ``{"data": {"id": word, "label": word}, "classes": css_class}`` with
        ``css_class`` being ``'followerNode'`` for a word first seen as a
        source term and ``'followingNode'`` for a word first seen as a
        neighbour. Every edge is
        ``{'data': {'source': term, 'target': word}}``.

    Each word produces exactly one node: the first occurrence decides its
    class, later occurrences only add edges.
    """
    nodes = []
    edges = []
    # Track ids separately: `nodes` holds dicts, so testing a plain word (or
    # a result tuple) for membership in it can never match and would let
    # duplicate node ids through.
    seen_ids = set()

    def _register(node_id, css_class):
        if node_id in seen_ids:
            return
        seen_ids.add(node_id)
        nodes.append({
            "data": {"id": node_id, "label": node_id},
            "classes": css_class,
        })

    for source, neighbours in similar_terms.items():
        _register(source, 'followerNode')
        for neighbour in neighbours:
            target = neighbour[0]
            _register(target, 'followingNode')
            edges.append({'data': {'source': source, 'target': target}})

    return nodes, edges
# Expand the example terms, then turn the result into Cytoscape elements.
similar_terms, words_not_vocabulary = words_to_sets(terms)
nodes, edges = sets_to_network(similar_terms)
### END Functions
#%% Dash
import
dash
import
dash_cytoscape
as
cyto
import
dash_html_components
as
html
#####Cyto Code
app = dash.Dash(__name__)

# define layout
# Every node shows its `label` data field; nodes carrying the
# `followerNode` class get their own background colour.
default_stylesheet = [
    {
        "selector": 'node',
        "style": {"label": "data(label)"},
    },
    {
        'selector': '.followerNode',
        'style': {'background-color': '#00AAD9'},
    },
]

app.layout = html.Div([
    cyto.Cytoscape(
        id='test',
        #style={'width': '100%', 'height': '400px'},
        layout={'name': 'cose'},
        elements=edges + nodes,
        stylesheet=default_stylesheet,
    )
])

if __name__ == '__main__':
    app.run_server(debug=False)
\ No newline at end of file
dash_app/callbacks.py
View file @
d74e6bc2
...
...
@@ -34,6 +34,7 @@ def update_base_word(submit_word_button, base_word_input):
Output
(
component_id
=
'graph-elements-div'
,
component_property
=
'children'
),
Input
(
component_id
=
'submit-word-button'
,
component_property
=
'n_clicks'
),
Input
(
component_id
=
'add-word-button'
,
component_property
=
'n_clicks'
),
Input
(
component_id
=
'extend-graph-button'
,
component_property
=
'n_clicks'
),
Input
(
component_id
=
'remove-word-button'
,
component_property
=
'n_clicks'
),
State
(
component_id
=
'base-word-input'
,
component_property
=
'value'
),
State
(
component_id
=
'add-word-input'
,
component_property
=
'value'
),
...
...
@@ -42,8 +43,8 @@ def update_base_word(submit_word_button, base_word_input):
State
(
component_id
=
'base-word-div'
,
component_property
=
'children'
),
prevent_initial_call
=
True
)
def
update_graph_elements
(
submit_word_button
,
add_word_button
,
remove_word_button
,
base_word_input
,
add_word_input
,
nodes_and_edges
,
selected_nodes
,
base_word_state
):
def
update_graph_elements
(
submit_word_button
,
add_word_button
,
extend_graph_button
,
remove_word_button
,
base_word_input
,
add_word_input
,
nodes_and_edges
,
selected_nodes
,
base_word_state
):
callback_context
=
dash
.
callback_context
button_id
=
callback_context
.
triggered
[
0
][
'prop_id'
].
split
(
'.'
)[
0
]
...
...
@@ -65,6 +66,12 @@ def update_graph_elements(submit_word_button, add_word_button, remove_word_butto
else
:
raise
PreventUpdate
if
button_id
==
'extend-graph-button'
:
graph
.
set_nodes_and_edges
(
json
.
loads
(
nodes_and_edges
))
graph
.
extend_graph
(
word2vec_model
,
base_word_state
)
new_nodes_and_edges
=
graph
.
get_nodes_and_edges
()
return
json
.
dumps
(
new_nodes_and_edges
)
if
button_id
==
'remove-word-button'
:
graph
.
set_nodes_and_edges
(
json
.
loads
(
nodes_and_edges
))
if
selected_nodes
is
not
None
:
...
...
@@ -93,8 +100,14 @@ def update_graph(nodes_and_edges):
@app.callback(
    Output(component_id='msx-graph', component_property='autoRefreshLayout'),
    Input(component_id='add-word-button', component_property='n_clicks'),
    Input(component_id='extend-graph-button', component_property='n_clicks'),
    Input(component_id='remove-word-button', component_property='n_clicks'),
    prevent_initial_call=True,
)
def set_auto_refresh_layout(add_word_button, extend_graph_button, remove_word_button):
    """Choose the ``autoRefreshLayout`` value of ``msx-graph`` per button.

    The click counters passed in are not inspected; the button that fired
    the callback is read from ``dash.callback_context``.

    Returns:
        ``False`` when the add-word or remove-word button fired, ``True``
        when the extend-graph button fired, and ``None`` for any other
        trigger id.
    """
    # `prop_id` looks like "<component-id>.<property>"; keep the id part.
    fired_prop = dash.callback_context.triggered[0]['prop_id']
    source_id = fired_prop.split('.')[0]

    if source_id in ('add-word-button', 'remove-word-button'):
        return False
    if source_id == 'extend-graph-button':
        return True
    return None
dash_app/graph.py
View file @
d74e6bc2
...
...
@@ -6,6 +6,8 @@ class Graph:
def __init__(self):
    """Start with an empty graph and the limits read by ``extend_graph``."""
    # Limits consulted by extend_graph when it selects new words:
    # at most this many words are added per extension ...
    self.MAX_NUM_WORDS = 10
    # ... and a candidate must occur at least this many times among the
    # association lists gathered for the existing nodes.
    self.COUNT_THRESHOLD = 2

    # Element lists; each node is a dict exposing data.id and data.label.
    self.edges = []
    self.nodes = []
def
get_all_words
(
self
):
all_words
=
[
node_dict
[
'data'
][
'label'
]
for
node_dict
in
self
.
nodes
]
...
...
@@ -99,3 +101,21 @@ class Graph:
}
]
)
def extend_graph(self, word2vec_model, base_node):
    """Add words that several existing nodes have in common to the graph.

    For every node currently in the graph, up to 100 associated words are
    requested from ``word2vec_model``. Words that are not yet in the graph
    are tallied; those occurring at least ``self.COUNT_THRESHOLD`` times
    are selected. While more than ``self.MAX_NUM_WORDS`` words qualify, the
    required count is raised by one and the selection repeated, so the
    result may be empty when many candidates share the highest count.

    The selected words are passed to ``self.add_nodes`` and linked to
    ``base_node`` through ``self.add_edges``.

    Args:
        word2vec_model: Object exposing
            ``get_associated_words(word, top_n=...)``.
        base_node: Node that every newly added word is connected to.
    """
    from collections import Counter

    current_words = [node['data']['id'] for node in self.nodes]
    known_words = set(current_words)

    # Tally, in first-seen order, how often each new word is suggested.
    candidate_counts = Counter()
    for current_word in current_words:
        associated_words = word2vec_model.get_associated_words(current_word, top_n=100)
        candidate_counts.update(
            word for word in associated_words if word not in known_words
        )

    # Work on a local copy: raising the instance attribute would make every
    # later call start from the stricter value reached here.
    threshold = self.COUNT_THRESHOLD
    common_associated_words = [
        word for word, count in candidate_counts.items() if count >= threshold
    ]
    while len(common_associated_words) > self.MAX_NUM_WORDS:
        threshold += 1
        common_associated_words = [
            word for word, count in candidate_counts.items() if count >= threshold
        ]

    self.add_nodes(common_associated_words)
    self.add_edges(base_node, common_associated_words)
dash_app/layout.py
View file @
d74e6bc2
...
...
@@ -44,7 +44,10 @@ layout = html.Div(children=[
html
.
Div
(
className
=
'col-2'
,
children
=
[
html
.
Button
(
id
=
'add-word-button'
,
n_clicks_timestamp
=
0
,
children
=
'Add Association'
,
className
=
'btn btn-success btn-lg'
),
]),
html
.
Div
(
className
=
'col-4'
,
children
=
[
html
.
Div
(
className
=
'col-2'
,
children
=
[
html
.
Button
(
id
=
'extend-graph-button'
,
n_clicks_timestamp
=
0
,
children
=
'Extend Graph'
,
className
=
'btn btn-success btn-lg'
),
]),
html
.
Div
(
className
=
'col-2'
,
children
=
[
]),
html
.
Div
(
className
=
'col-3'
,
children
=
[
html
.
Button
(
id
=
'remove-word-button'
,
n_clicks_timestamp
=
0
,
children
=
'Remove Selected Association'
,
className
=
'btn btn-danger btn-lg'
)]),
...
...
dash_app/words.py
View file @
d74e6bc2
...
...
@@ -5,13 +5,12 @@ import stringdist as sdi
class
AssociatedWords
:
def
__init__
(
self
):
# Prepare the helper: store a default result count and load the pretrained
# embeddings, printing a notice before and after the load.
# NOTE(review): the diff on this page appears to stop reading N_RESULTS in
# get_associated_words (it takes `top_n` instead) - confirm whether this
# attribute is still needed.
self
.
N_RESULTS
=
10
# Announce the load; the message itself warns that it can take minutes.
print
(
"
\n
Word2Vec model is loading.This can take a couple of minutes."
)
# `api` is imported outside the visible hunk - presumably gensim.downloader;
# confirm. The printed text says "Word2Vec" while the model id requested
# here is 'glove-twitter-200'.
self
.
model
=
api
.
load
(
'glove-twitter-200'
)
# Announce that the model is available.
print
(
" Word2Vec model is ready. Enjoy!!!
\n
"
)
def get_associated_words(self, word, top_n=10):
    """Return the filtered neighbours of ``word`` from the loaded model.

    Args:
        word: Query word handed to ``self.model.most_similar``.
        top_n: Number of neighbours requested from the model (its ``topn``
            argument).

    Returns:
        Whatever ``self.filter_results`` produces for the raw neighbours
        and the query word.
    """
    # Example of the raw result shape:
    # [('apple', 1.0), ('banana', 1.0), ('strawberry', 1.0)]
    neighbours = self.model.most_similar(word, topn=top_n)
    return self.filter_results(neighbours, word)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment