Source code for revelionn.concept_extraction
import networkx as nx
from semantic_loss_pytorch import SemanticLoss
[docs]
class ConceptExtractor:
    """
    A class that provides concept extraction algorithms.

    Attributes
    ----------
    ontology : nxontology.NXOntology
        Ontology represented as a graph, where edge direction goes from superterm to subterm.
    trainer : MappingTrainer
        An instance of the MappingTrainer class that provides an interface for training mapping networks.

    Methods
    -------
    create_subgraph(graph, node)
        Returns a subgraph containing the given node and all nodes reachable from it.
    order_concepts(target_concept, ontology)
        Performs topological sorting of the edges of the subgraph formed by a given parent node (target concept).
    exhaustive_search(concept, layer_names, mapping_neurons)
        Trains and evaluates mapping networks based on the activations of each of the specified layers of the network.
    linear_search(concept, top_layer_num, patience_layers, mapping_neurons)
        Trains and evaluates mapping networks based on the activations of each of the layers starting from the
        specified one, until the value of the quality metric deteriorates over several layers (the value of patience).
    heuristic_search(target_concept, top_layer_num, patience_layers, mapping_neurons)
        Due to the heuristic reduction of the set of specified layers, mapping networks are not trained for every
        combination of layer-concept. Uses linear search.
    simultaneous_extraction(target_concept, decoder_channels, num_shared_neurons, num_output_neurons,
                            sdd_path=None, vtree_path=None, sem_loss_weight=None, unlabeled_samples=None)
        Trains a mapping network that can simultaneously extract a set of relevant concepts from the entire set of
        layers of specified types (the types are set when initializing the MappingTrainer instance).
    """

    def __init__(self, mapping_trainer, nxonto):
        """
        Sets all the necessary attributes for the ConceptExtractor object.

        Parameters
        ----------
        mapping_trainer : MappingTrainer
            An instance of the MappingTrainer class that provides an interface for training mapping networks.
        nxonto : nxontology.NXOntology
            Ontology represented as a graph, where edge direction goes from superterm to subterm.
        """
        self.ontology = nxonto
        self.trainer = mapping_trainer

    @staticmethod
    def create_subgraph(graph, node):
        """
        Returns a subgraph containing the given node and every node reachable from it.

        Parameters
        ----------
        graph : networkx.Graph
            The graph from which to extract the subgraph.
        node : str
            The node for which to create the subgraph.

        Returns
        -------
        networkx.Graph
            A subgraph view of `graph` induced by `node` and all of its descendants. For a node without
            successors the subgraph consists of that single node.
        """
        # Add the node explicitly: a traversal from a leaf yields no successors, and the
        # root must still be part of the result.
        nodes = set(nx.descendants(graph, node))
        nodes.add(node)
        return graph.subgraph(nodes)

    def order_concepts(self, target_concept, ontology):
        """
        Performs topological sorting of the edges of the subgraph formed by a given parent node (target concept).

        Parameters
        ----------
        target_concept : str
            The target concept node for which to perform topological sorting.
        ontology : nxontology.NXOntology
            The ontology whose ``graph`` attribute is searched.

        Returns
        -------
        list
            The edges of the subgraph as ``(parent, child)`` tuples, in a topological order of the subgraph's
            line graph. The list is empty when the target concept has no successors.
        """
        subgraph = self.create_subgraph(ontology.graph, target_concept)
        # Nodes of the line graph are the edges of `subgraph`, so sorting it orders edges, not concepts.
        return list(nx.topological_sort(nx.line_graph(subgraph)))

    def exhaustive_search(self, concept, layer_names, mapping_neurons):
        """
        Trains and evaluates mapping networks based on the activations of each of the specified layers of the network.

        Parameters
        ----------
        concept : str
            The concept for which to perform the search.
        layer_names : list
            A list of layer names to consider for training and evaluation.
        mapping_neurons : list[int]
            The number of neurons in the mapping network.

        Returns
        -------
        tuple
            A tuple ``(best_layer, best_value)`` with the best layer name and the corresponding evaluation value.
            Both elements are None when `layer_names` is empty. On ties the earliest layer is kept.
        """
        best_value = None
        best_layer = None
        for layer_name in layer_names:
            self.trainer.train_single_model(mapping_neurons, concept, [layer_name])
            cur_value = self.trainer.evaluate_model()
            # Strict comparison: a later layer replaces the best one only if it is strictly better.
            if best_value is None or cur_value > best_value:
                best_value = cur_value
                best_layer = layer_name
        return best_layer, best_value

    def linear_search(self, concept, top_layer_num, patience_layers, mapping_neurons):
        """
        Trains and evaluates mapping networks based on the activations of each of the layers starting from the
        specified one, until the value of the quality metric deteriorates over several layers (the value of patience).

        Parameters
        ----------
        concept : str
            The concept for which to perform the search.
        top_layer_num : int
            The starting layer number for training and evaluation. Layers are visited in decreasing order
            down to layer 0.
        patience_layers : int
            The number of layers to tolerate deterioration in the quality metric. The search stops once the
            current layer lies more than `patience_layers` below the best layer found so far.
        mapping_neurons : list[int]
            The number of neurons in the mapping network.

        Returns
        -------
        tuple
            A tuple containing the best layer number and the corresponding evaluation value.
            Both elements are None when `top_layer_num` is negative (no layer is evaluated).
        """
        best_layer_num = None
        best_value = None
        cur_layer_num = top_layer_num
        while cur_layer_num >= 0:
            self.trainer.train_single_model(mapping_neurons, concept, [cur_layer_num])
            cur_value = self.trainer.evaluate_model()
            if best_value is None or cur_value > best_value:
                best_value = cur_value
                best_layer_num = cur_layer_num
            # Distance (in layers) travelled since the last improvement.
            if best_layer_num - cur_layer_num > patience_layers:
                break
            cur_layer_num -= 1
        return best_layer_num, best_value

    def heuristic_search(self, target_concept, top_layer_num, patience_layers, mapping_neurons):
        """
        Due to the heuristic reduction of the set of specified layers, mapping networks are not trained for every
        combination of layer-concept. Uses linear search.

        Parameters
        ----------
        target_concept : str
            The target concept that should be obtained by ontological inference.
            Mapping networks are trained to extract concepts relevant to the target concept.
        top_layer_num : int
            The starting layer number for training and evaluation.
        patience_layers : int
            The number of layers to tolerate deterioration in the quality metric.
        mapping_neurons : list[int]
            The number of neurons in the mapping network.

        Returns
        -------
        dict
            A dictionary mapping each concept of the subgraph (except the target concept itself) to a list
            ``[best_layer_num, best_value]``.
        """
        ordered_concepts = self.order_concepts(target_concept, self.ontology)
        best_layers = {}
        for parent, child in ordered_concepts:
            # Direct children of the target start from the top layer; deeper concepts start
            # from the best layer already found for their parent.
            if parent == target_concept:
                initial_layer = top_layer_num
            else:
                initial_layer = best_layers[parent][0]
            layer_num, auc = self.linear_search(child, initial_layer, patience_layers, mapping_neurons)
            # A concept reachable through several parents keeps its best-scoring result.
            if child not in best_layers or auc > best_layers[child][1]:
                best_layers[child] = [layer_num, auc]
        return best_layers

    def simultaneous_extraction(self, target_concept, decoder_channels, num_shared_neurons, num_output_neurons,
                                sdd_path=None, vtree_path=None, sem_loss_weight=None, unlabeled_samples=None):
        """
        Trains a mapping network that simultaneously extracts the concepts relevant to the target concept.

        When `sdd_path` is given, the network is trained in a semi-supervised manner with a semantic loss;
        otherwise it is trained in the ordinary way.

        Parameters
        ----------
        target_concept : str
            The target concept that should be obtained by ontological inference.
            Mapping networks are trained to extract concepts relevant to the target concept.
        decoder_channels : int
            The number of decoder channels. The output number of channels of the convolutional layer of the decoder or
            the output number of neurons of the decoder of the fully connected layer.
        num_shared_neurons : list[int]
            The number of neurons in consecutive fully connected layers of the common part of the network
            (internal representation of the simultaneous extraction network).
        num_output_neurons : list[int]
            The number of neurons in consecutive fully connected layers of each of the concept blocks.
        sdd_path : str
            The path to the .sdd file.
        vtree_path : str
            The path to the .vtree file.
        sem_loss_weight : float
            The contribution of semantic loss to the overall loss function.
        unlabeled_samples : int or float
            The number of unlabeled samples to include. If float, it represents the fraction of unlabeled samples.

        Returns
        -------
        concepts_auc : list[float]
            ROC AUC values for each of the concepts.
        all_auc : float
            ROC AUC value for all labels of a simultaneous mapping network.
        """
        subgraph = self.create_subgraph(self.ontology.graph, target_concept)
        # A networkx subgraph is a read-only view and has no ``remove`` method, so collect the
        # concept names explicitly and leave the target concept out.
        # NOTE(review): assumes the trainer expects a list of concept names -- confirm against MappingTrainer.
        concepts = [concept for concept in subgraph.nodes if concept != target_concept]

        if sdd_path is None:
            self.trainer.train_simultaneous_model(concepts, decoder_channels, num_shared_neurons,
                                                  num_output_neurons)
        else:
            sl = SemanticLoss(sdd_path, vtree_path)
            self.trainer.train_simultaneous_model_semisupervised(concepts, decoder_channels, num_shared_neurons,
                                                                 num_output_neurons, sl, sem_loss_weight,
                                                                 unlabeled_samples)

        concepts_auc, all_auc = self.trainer.evaluate_model()
        return concepts_auc, all_auc