Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
FoodInformatics
msx-tool
Commits
c921e7ad
Commit
c921e7ad
authored
Apr 14, 2021
by
Jim Hoekstra
👋🏻
Browse files
Merge branch 'issue/MSX-34' into 'develop'
Issue MSX-34 See merge request
!12
parents
82674525
39a7510d
Changes
3
Hide whitespace changes
Inline
Side-by-side
JBM_Intersection/Graph_Expantion_by_Intersection.py
deleted
100644 → 0
View file @
82674525
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 31 12:09:38 2021
@author: Julian Bianco-Martinez
Graph Expansion by Intersection of Concepts (GEIC)
"""
import
pandas
as
pd
import
numpy
as
np
def concepts_extraction_list(lst1):
    """Return the first element of every entry in ``lst1``.

    Intended for word2vec similarity output: the leading item of each
    entry (the concept) is kept and the rest of the entry is dropped.
    Order is preserved.
    """
    concepts_only = []
    for entry in lst1:
        concepts_only.append(entry[0])
    return concepts_only
def GEIC(concepts, words_to_exclude=None, topX_intersect_concepts=5,
         topXsimilarConcepts=100, threshold=0.5, w2v_model=None):
    """Graph Expansion by Intersection of Concepts.

    From a set of original concepts C0, collect and weight the similar
    concepts shared by pairs of concepts in C0.

    Inputs:
        concepts: Original concepts (C0), a sequence of words.
        words_to_exclude: Intersected concepts found in this collection are
            removed from the result. If empty or None, ``concepts`` is used.
        topX_intersect_concepts: Keep only the first X concepts of each
            pairwise intersection, i.e.
            IC(i, j) = (C0(i) int C0(j))[:topX_intersect_concepts],
            in the order returned for C0(i).
        topXsimilarConcepts: Number of similar concepts requested per
            concept in C0 (passed as ``topn`` to ``most_similar``).
        threshold: Only intersected concepts whose weight is strictly
            greater than this value are returned.
        w2v_model: Object exposing ``most_similar(word, topn=...)`` that
            returns entries whose first element is the similar word. When
            None, the module-level ``model`` is used.

    Output:
        Data frame with 4 columns:
            'Concept 1': C0(i)
            'Concept 2': C0(j)
            'Intersection': a concept from IC(i, j)
            'weight': number of rows holding that concept across all pairs,
                divided by the number of distinct pairs of C0.
        An empty frame with these columns is returned when fewer than two
        concepts are given or when no pair shares a similar concept.
    """
    pair_columns = ['Concept 1', 'Concept 2', 'Intersection']
    no_result = pd.DataFrame(columns=pair_columns + ['weight'])

    n_concepts = len(concepts)
    if n_concepts < 2:
        # No pair can be formed, so there is nothing to intersect.
        return no_result

    if w2v_model is None:
        w2v_model = model  # fall back to the module-level model

    if not words_to_exclude:
        words_to_exclude = list(concepts)
    excluded = set(words_to_exclude)

    # Query the model once per concept instead of once per pair.
    similar = [
        [hit[0] for hit in w2v_model.most_similar(concept,
                                                  topn=topXsimilarConcepts)]
        for concept in concepts
    ]
    similar_lookup = [set(words) for words in similar]

    # Only the upper triangle is visited because the intersection is
    # symmetric in (i, j).
    frames = []
    for i in range(n_concepts - 1):
        for j in range(i + 1, n_concepts):
            inter = [v for v in similar[i]
                     if v in similar_lookup[j]][:topX_intersect_concepts]
            if inter:
                frames.append(pd.DataFrame({'Concept 1': concepts[i],
                                            'Concept 2': concepts[j],
                                            'Intersection': inter}))
    if not frames:
        return no_result
    df_pairs = pd.concat(frames, ignore_index=True)

    # Weights are computed on every intersected concept, before the
    # length and exclusion filters, and normalised by n-choose-2.
    n_pairs = n_concepts * (n_concepts - 1) / 2
    weights = (df_pairs['Intersection']
               .value_counts()
               .rename_axis('Intersection')
               .reset_index(name='weight'))
    weights['weight'] = weights['weight'] / n_pairs

    # Drop words shorter than 4 characters and explicitly excluded words.
    keep = [len(v) > 3 and v not in excluded
            for v in df_pairs['Intersection'].values]
    df_extension = df_pairs[keep].merge(weights, on='Intersection')
    return df_extension.loc[df_extension['weight'] > threshold]
# Example: expand three seed concepts, keeping up to 15 shared concepts
# per pair.
# NOTE(review): GEIC reads a module-level `model` that is not defined in
# the visible code; confirm it is loaded before this runs.
concepts = ["king", "queen", "prince"]
GEIC(concepts, topX_intersect_concepts=15)
dash_app/callbacks.py
View file @
c921e7ad
...
...
@@ -39,7 +39,7 @@ def update_base_word(submit_word_button, base_word_input):
State
(
component_id
=
'base-word-input'
,
component_property
=
'value'
),
State
(
component_id
=
'add-word-input'
,
component_property
=
'value'
),
State
(
component_id
=
'graph-elements-div'
,
component_property
=
'children'
),
State
(
component_id
=
'msx-graph'
,
component_property
=
'
tap
NodeData'
),
State
(
component_id
=
'msx-graph'
,
component_property
=
'
selected
NodeData'
),
State
(
component_id
=
'base-word-div'
,
component_property
=
'children'
),
prevent_initial_call
=
True
)
...
...
@@ -74,14 +74,16 @@ def update_graph_elements(submit_word_button, add_word_button, extend_graph_butt
if
button_id
==
'remove-word-button'
:
graph
.
set_nodes_and_edges
(
json
.
loads
(
nodes_and_edges
))
if
selected_nodes
is
not
None
:
selected_word
=
selected_nodes
[
'label'
]
if
selected_word
in
graph
.
get_all_words
()
and
selected_word
!=
base_word_state
:
graph
.
remove_node
(
selected_nodes
[
'label'
])
new_nodes_and_edges
=
json
.
dumps
(
graph
.
get_nodes_and_edges
())
return
new_nodes_and_edges
else
:
raise
PreventUpdate
if
len
(
selected_nodes
)
>
0
:
for
selected_node
in
selected_nodes
:
selected_word
=
selected_node
[
'id'
]
if
selected_word
in
graph
.
get_all_words
()
and
selected_word
!=
base_word_state
:
graph
.
remove_node
(
selected_word
)
else
:
raise
PreventUpdate
new_nodes_and_edges
=
json
.
dumps
(
graph
.
get_nodes_and_edges
())
return
new_nodes_and_edges
else
:
raise
PreventUpdate
...
...
dash_app/words.py
View file @
c921e7ad
...
...
@@ -5,7 +5,7 @@ import stringdist as sdi
class
AssociatedWords
:
def
__init__
(
self
):
print
(
"
\n
Word2Vec model is loading.This can take a couple of minutes."
)
print
(
"
\n
Word2Vec model is loading.
This can take a couple of minutes."
)
self
.
model
=
api
.
load
(
'glove-twitter-200'
)
print
(
" Word2Vec model is ready. Enjoy!!!
\n
"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment