From d7f28ef0c842d6390b1926d296824dfab46be2b5 Mon Sep 17 00:00:00 2001 From: Louis-Mozart Date: Fri, 23 Aug 2024 15:37:27 +0200 Subject: [PATCH 01/12] WIP: standard reasoners added --- examples/incomplete_kb.py | 58 ++++++++++++++++++++ examples/retrieval_eval.py | 13 +++-- examples/retrieval_eval_under_incomplete.py | 60 ++++++++++++++++++--- ontolearn/owl_neural_reasoner.py | 2 +- ontolearn/utils/static_funcs.py | 4 ++ 5 files changed, 125 insertions(+), 12 deletions(-) diff --git a/examples/incomplete_kb.py b/examples/incomplete_kb.py index 926bd2ef..1cab25f8 100644 --- a/examples/incomplete_kb.py +++ b/examples/incomplete_kb.py @@ -3,6 +3,64 @@ +# def make_kb_incomplete(kb_path, output_path, rate, seed): +# """ +# Makes the knowledge base incomplete by removing a certain percentage of statements (triples). + +# Inputs: +# --------------- + +# kb_path: Path to the input knowledge base. +# output_path: Path to save the modified (incomplete) knowledge base. +# rate: Percentage of statements to remove (0-100). +# seed: random seed for reproducibility. 
+ +# Output: +# --------------- + +# Incomplete KB at level rate % +# """ + +# random.seed(seed) + +# # Load the ontology +# kb = get_ontology(kb_path).load() + +# # Get all individuals in the ontology +# all_individuals = list(kb.individuals()) + +# # Collect all triples (subject-predicate-object) related to the individuals +# all_triples = [] +# for individual in all_individuals: +# for prop in individual.get_properties(): +# for value in prop[individual]: +# all_triples.append((individual, prop, value)) + +# # Calculate the number of triples to remove based on the rate +# num_to_remove = int(len(all_triples) * (rate / 100)) + +# # Randomly select triples to remove +# triples_to_remove = random.sample(all_triples, num_to_remove) + +# print(len(triples_to_remove)) +# # exit(0) + +# # Remove the selected triples +# for subject, predicate, obj in triples_to_remove: + + + +# predicate[subject].remove(obj) + + + +# # Save the modified ontology to a new file +# kb.save(file=output_path, format="rdfxml") + + + + + def make_kb_incomplete(kb_path, output_path, rate, seed): """ Makes the knowledge base incomplete by removing a certain percentage of individuals. diff --git a/examples/retrieval_eval.py b/examples/retrieval_eval.py index 1e1f80ec..c5abf39c 100644 --- a/examples/retrieval_eval.py +++ b/examples/retrieval_eval.py @@ -60,12 +60,13 @@ def execute(args): if args.ratio_sample_nc: # (6.1) Subsample if required. nc = {i for i in random.sample(population=list(nc), k=max(1, int(len(nc) * args.ratio_sample_nc)))} + # (7) NC⁻: Complement of NC. nnc = {i.get_object_complement_of() for i in nc} # (8) UNNC: NC UNION NC⁻. unnc = nc.union(nnc) # (9) Retrieve 10 random Nominals. - nominals = set(random.sample(symbolic_kb.all_individuals_set(), 10)) + nominals = set(random.sample(symbolic_kb.all_individuals_set(), 3)) # (10) All Combinations of 3 for Nominals. 
nominal_combinations = set( OWLObjectOneOf(combination) @@ -130,11 +131,11 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]: # () Converted to list so that the progress bar works. concepts = list( chain( - nc, unions, intersections, nnc, unnc, unions_unnc, intersections_unnc, - exist_unnc, for_all_unnc, + # nc, unions, intersections, nnc, unnc, unions_unnc, intersections_unnc, + # exist_unnc, for_all_unnc, min_cardinality_unnc_1, min_cardinality_unnc_2, min_cardinality_unnc_3, max_cardinality_unnc_1, max_cardinality_unnc_2, max_cardinality_unnc_3, - exist_nominals, + # exist_nominals, ) ) # () Shuffled the data so that the progress bar is not influenced by the order of concepts. @@ -152,6 +153,8 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]: # () Compute the F1-score. f1_sim = f1_set_similarity(retrieval_y, retrieval_neural_y) # () Store the data. + # print(expression) + # exit(0) data.append( { "Expression": owl_expression_to_dl(expression), @@ -169,7 +172,7 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]: ) # () Read the data into pandas dataframe df = pd.DataFrame(data) - assert df["Jaccard Similarity"].mean() == 1.0 + # assert df["Jaccard Similarity"].mean() == 1.0 # () Save the experimental results into csv file. 
df.to_csv(args.path_report) del df diff --git a/examples/retrieval_eval_under_incomplete.py b/examples/retrieval_eval_under_incomplete.py index 0d0370bb..15a5d0ec 100644 --- a/examples/retrieval_eval_under_incomplete.py +++ b/examples/retrieval_eval_under_incomplete.py @@ -13,6 +13,10 @@ from owlapy.iri import IRI from owlapy.parser import DLSyntaxParser import ast +from owlapy import owl_expression_to_dl +from owlapy.owl_ontology_manager import OntologyManager +from owlapy.owlapi_adaptor import OWLAPIAdaptor +import pandas as pd # [] Create sub/incomplete KGs @@ -38,8 +42,10 @@ def generated_incomplete_kg(kb_path: str, directory: str, n: int, ratio: float) # output path for the incomplete KGs output_path = f'{directory}/incomplete_{name}_ratio_{rate}_number_{i}.owl' - # function to generate the incomplete KG - make_kb_incomplete(kb_path, output_path, rate, seed=i) + # Check if the file already exists + if not os.path.exists(output_path): + # If file does not exist, generate it + make_kb_incomplete(kb_path, output_path, rate, seed=i) # Add the output path to the set file_paths.add(output_path) @@ -55,6 +61,7 @@ def execute(args): parser = DLSyntaxParser(namespace) + # TODO: What should be directory args.path_kg? 
name_KG = args.path_kg.split('/')[-1].split('.')[0] @@ -69,6 +76,7 @@ def execute(args): for path_of_an_incomplete_kgs in paths_of_incomplete_kgs: + data = [] list_jaccard_symbolic = [] list_jaccard_neural = [] @@ -85,20 +93,43 @@ def execute(args): else: assert expressions == {i for i in df["Expression"].to_list()} +#---------------------------------------------------------------------------------------------------------------- + + ontology_path = path_of_an_incomplete_kgs + # Available OWL Reasoners: 'HermiT', 'Pellet', 'JFact', 'Openllet' + + reasoners = ['HermiT', 'Pellet', 'JFact', 'Openllet'] + + owlapi_adaptor = OWLAPIAdaptor(path=ontology_path, name_reasoner="JFact") + # Iterate over defined owl Classes in the signature + # for i in onto.classes_in_signature(): + # print(i) + # exit(0) + # # Performing type inference with Pellet + # instances=owlapi_adaptor.instances(i,direct=False) + # print(f"Class:{i}\t Num instances:{len(instances)}") + # owlapi_adaptor.stopJVM() + + # exit(0) + +#------------------------------------------------------------------------------------------------------------------ + # Iterate for expression in expressions: # TODO: str -> owlapy.owl_classexpression object target_concept = parser.parse_expression(expression) - + goal_retrieval = {i.str for i in symbolic_kb.individuals(target_concept)} result_symbolic: Set[str] result_neural_symbolic: Set[str] - result_symbolic = df[df["Expression"]==expression]["Symbolic_Retrieval"].apply(ast.literal_eval) - result_symbolic = result_symbolic.iloc[0] + # result_symbolic = df[df["Expression"]==expression]["Symbolic_Retrieval"].apply(ast.literal_eval) + # result_symbolic = result_symbolic.iloc[0] + + result_symbolic = {i.str for i in (owlapi_adaptor.instances(target_concept,direct=False))} result_neural_symbolic = df[df["Expression"]==expression]["Symbolic_Retrieval_Neural"].apply(ast.literal_eval) result_neural_symbolic = result_neural_symbolic.iloc[0] @@ -112,13 +143,30 @@ def execute(args): 
list_jaccard_neural.append(jaccard_sim_neural) list_jaccard_symbolic.append(jaccard_sim_symbolic) + data.append( + { + "Expression": expression, + "Type": type(expression).__name__, + "Jaccard_sym": jaccard_sim_symbolic, + "Jaccard_EBR": jaccard_sim_neural, + # "Runtime Benefits": runtime_y - runtime_neural_y, + # "Symbolic_Retrieval": retrieval_y, + # "Symbolic_Retrieval_Neural": retrieval_neural_y, + } + ) + + + df = pd.DataFrame(data=data) + + print(df) + avg_jaccard_sym = sum(list_jaccard_symbolic)/len(list_jaccard_symbolic) avg_jaccard_neural = sum(list_jaccard_neural)/len(list_jaccard_neural) print("Average jaccard symbolic", avg_jaccard_sym) print("Average Jaccard neural", avg_jaccard_neural) - + owlapi_adaptor.stopJVM() #stop the standard reasoner def get_default_arguments(): parser = ArgumentParser() diff --git a/ontolearn/owl_neural_reasoner.py b/ontolearn/owl_neural_reasoner.py index 8eec4af9..b54cf88e 100644 --- a/ontolearn/owl_neural_reasoner.py +++ b/ontolearn/owl_neural_reasoner.py @@ -44,7 +44,7 @@ def __init__(self, path_of_kb: str = None, path_of_kb = path_of_kb.replace("/", "_") path_of_kb = path_of_kb.replace(".", "_") args.path_to_store_single_run = path_of_kb - args.num_epochs = 500 + args.num_epochs = 100 args.embedding_dim = 512 args.batch_size = 1024 args.backend = "rdflib" diff --git a/ontolearn/utils/static_funcs.py b/ontolearn/utils/static_funcs.py index 4b08f978..b48e1c55 100644 --- a/ontolearn/utils/static_funcs.py +++ b/ontolearn/utils/static_funcs.py @@ -67,6 +67,10 @@ def f1_set_similarity(y: Set[str], yhat: Set[str]) -> float: precision = len(y.intersection(yhat)) / len(y) recall = len(y.intersection(yhat)) / len(yhat) + + if precision == 0 and recall == 0: + return 0.0 + return (2 * precision * recall) / (precision + recall) From 9bb0d7a50383390e4412e2a9479d31afd0b3e8d3 Mon Sep 17 00:00:00 2001 From: Louis-Mozart Date: Mon, 26 Aug 2024 16:27:06 +0200 Subject: [PATCH 02/12] WIP: Evaluation against SOTA --- 
examples/retrieval_eval.py | 4 +- examples/retrieval_eval_under_incomplete.py | 213 +++++++++++++------- 2 files changed, 139 insertions(+), 78 deletions(-) diff --git a/examples/retrieval_eval.py b/examples/retrieval_eval.py index c5abf39c..efe68119 100644 --- a/examples/retrieval_eval.py +++ b/examples/retrieval_eval.py @@ -131,8 +131,8 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]: # () Converted to list so that the progress bar works. concepts = list( chain( - # nc, unions, intersections, nnc, unnc, unions_unnc, intersections_unnc, - # exist_unnc, for_all_unnc, + nc, unions, intersections, nnc, unnc, unions_unnc, intersections_unnc, + exist_unnc, for_all_unnc, min_cardinality_unnc_1, min_cardinality_unnc_2, min_cardinality_unnc_3, max_cardinality_unnc_1, max_cardinality_unnc_2, max_cardinality_unnc_3, # exist_nominals, diff --git a/examples/retrieval_eval_under_incomplete.py b/examples/retrieval_eval_under_incomplete.py index 15a5d0ec..b08a94bf 100644 --- a/examples/retrieval_eval_under_incomplete.py +++ b/examples/retrieval_eval_under_incomplete.py @@ -52,121 +52,182 @@ def generated_incomplete_kg(kb_path: str, directory: str, n: int, ratio: float) return file_paths - def execute(args): - symbolic_kb = KnowledgeBase(path=args.path_kg) - namespace = list(symbolic_kb.ontology.classes_in_signature())[0].iri.get_namespace() - parser = DLSyntaxParser(namespace) - - - # TODO: What should be directory args.path_kg? 
name_KG = args.path_kg.split('/')[-1].split('.')[0] - - directory = f"incomplete_{name_KG}" - - paths_of_incomplete_kgs = generated_incomplete_kg(kb_path=args.path_kg, directory=directory,\ - n=args.number_of_incomplete_graphs, ratio=args.level_of_incompleteness) - - # TODO: make sure the number of triple match inside - # TODO: ensure all triples are subset of the original KG + level_of_incompleteness_str = str(args.level_of_incompleteness).replace('.', '_') + directory = f"incomplete_{name_KG}_{level_of_incompleteness_str}" + paths_of_incomplete_kgs = generated_incomplete_kg( + kb_path=args.path_kg, + directory=directory, + n=args.number_of_incomplete_graphs, + ratio=args.level_of_incompleteness + ) + expressions = None + all_results = [] for path_of_an_incomplete_kgs in paths_of_incomplete_kgs: - - data = [] - list_jaccard_symbolic = [] list_jaccard_neural = [] + data = [] - # Train a KGE, retrieval eval vs KGE and Symbolic - # args.ratio_sample_nc - # args.ratio_sample_object_prob subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path_of_an_incomplete_kgs]) - # Load the results on the current view. 
df = pd.read_csv("ALCQHI_Retrieval_Results.csv", index_col=0) - # Sanity checking if expressions is None: expressions = {i for i in df["Expression"].to_list()} else: assert expressions == {i for i in df["Expression"].to_list()} -#---------------------------------------------------------------------------------------------------------------- - ontology_path = path_of_an_incomplete_kgs - # Available OWL Reasoners: 'HermiT', 'Pellet', 'JFact', 'Openllet' - reasoners = ['HermiT', 'Pellet', 'JFact', 'Openllet'] + reasoner_jaccards = {reasoner: [] for reasoner in reasoners} - owlapi_adaptor = OWLAPIAdaptor(path=ontology_path, name_reasoner="JFact") - # Iterate over defined owl Classes in the signature - # for i in onto.classes_in_signature(): - # print(i) - # exit(0) - # # Performing type inference with Pellet - # instances=owlapi_adaptor.instances(i,direct=False) - # print(f"Class:{i}\t Num instances:{len(instances)}") - # owlapi_adaptor.stopJVM() + for expression in expressions: + target_concept = parser.parse_expression(expression) + goal_retrieval = {i.str for i in symbolic_kb.individuals(target_concept)} + result_neural_symbolic = df[df["Expression"] == expression]["Symbolic_Retrieval_Neural"].apply(ast.literal_eval).iloc[0] + jaccard_sim_neural = jaccard_similarity(result_neural_symbolic, goal_retrieval) + list_jaccard_neural.append(jaccard_sim_neural) + + result_row = { + "Incomplete_KG": path_of_an_incomplete_kgs.split('/')[-1], + "Expression": expression, + "Type": type(parser.parse_expression(expression)).__name__, + "Jaccard_EBR": jaccard_sim_neural + } - # exit(0) + for reasoner in reasoners: + owlapi_adaptor = OWLAPIAdaptor(path=ontology_path, name_reasoner=reasoner) + result_symbolic = {i.str for i in (owlapi_adaptor.instances(target_concept, direct=False))} + jaccard_sim_symbolic = jaccard_similarity(result_symbolic, goal_retrieval) + reasoner_jaccards[reasoner].append(jaccard_sim_symbolic) + result_row[f"Jaccard_{reasoner}"] = jaccard_sim_symbolic 
-#------------------------------------------------------------------------------------------------------------------ + data.append(result_row) + + all_results.extend(data) + + avg_jaccard_neural = sum(list_jaccard_neural) / len(list_jaccard_neural) + avg_jaccard_reasoners = {reasoner: sum(reasoner_jaccards[reasoner]) / len(reasoner_jaccards[reasoner]) for reasoner in reasoners} + + print(f"Average Jaccard neural ({path_of_an_incomplete_kgs}):", avg_jaccard_neural) + for reasoner, avg_jaccard in avg_jaccard_reasoners.items(): + print(f"Average Jaccard {reasoner} ({path_of_an_incomplete_kgs}):", avg_jaccard) + + # Create a final DataFrame from all results and write to a CSV file + final_df = pd.DataFrame(all_results) + final_csv_path = f"{directory}/comparison_results.csv" + final_df.to_csv(final_csv_path, index=False) + + print(final_df.head()) + print(f"Results have been saved to {final_csv_path}") + + owlapi_adaptor.stopJVM() # Stop the standard reasoner + + - # Iterate - for expression in expressions: - # TODO: str -> owlapy.owl_classexpression object - - target_concept = parser.parse_expression(expression) - goal_retrieval = {i.str for i in symbolic_kb.individuals(target_concept)} - result_symbolic: Set[str] - result_neural_symbolic: Set[str] +# def execute(args): - # result_symbolic = df[df["Expression"]==expression]["Symbolic_Retrieval"].apply(ast.literal_eval) - # result_symbolic = result_symbolic.iloc[0] +# symbolic_kb = KnowledgeBase(path=args.path_kg) + +# namespace = list(symbolic_kb.ontology.classes_in_signature())[0].iri.get_namespace() - result_symbolic = {i.str for i in (owlapi_adaptor.instances(target_concept,direct=False))} +# parser = DLSyntaxParser(namespace) - result_neural_symbolic = df[df["Expression"]==expression]["Symbolic_Retrieval_Neural"].apply(ast.literal_eval) - result_neural_symbolic = result_neural_symbolic.iloc[0] - +# name_KG = args.path_kg.split('/')[-1].split('.')[0] - jaccard_sim_symbolic = jaccard_similarity(result_symbolic, 
goal_retrieval) +# directory = f"incomplete_{name_KG}" - jaccard_sim_neural = jaccard_similarity(result_neural_symbolic, goal_retrieval) +# paths_of_incomplete_kgs = generated_incomplete_kg(kb_path=args.path_kg, directory=directory,\ +# n=args.number_of_incomplete_graphs, ratio=args.level_of_incompleteness) + +# expressions = None - # Update for Averaging - list_jaccard_neural.append(jaccard_sim_neural) - list_jaccard_symbolic.append(jaccard_sim_symbolic) +# for path_of_an_incomplete_kgs in paths_of_incomplete_kgs: - data.append( - { - "Expression": expression, - "Type": type(expression).__name__, - "Jaccard_sym": jaccard_sim_symbolic, - "Jaccard_EBR": jaccard_sim_neural, - # "Runtime Benefits": runtime_y - runtime_neural_y, - # "Symbolic_Retrieval": retrieval_y, - # "Symbolic_Retrieval_Neural": retrieval_neural_y, - } - ) +# data = [] +# list_jaccard_symbolic = [] +# list_jaccard_neural = [] + +# subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path_of_an_incomplete_kgs]) +# # Load the results on the current view. 
+# df = pd.read_csv("ALCQHI_Retrieval_Results.csv", index_col=0) + +# # Sanity checking +# if expressions is None: +# expressions = {i for i in df["Expression"].to_list()} +# else: +# assert expressions == {i for i in df["Expression"].to_list()} + +# #---------------------------------------------------------------------------------------------------------------- + +# # adding other reasoners for comparison + +# ontology_path = path_of_an_incomplete_kgs +# # Available OWL Reasoners: 'HermiT', 'Pellet', 'JFact', 'Openllet' + +# reasoners = ['HermiT', 'Pellet', 'JFact', 'Openllet'] + +# owlapi_adaptor = OWLAPIAdaptor(path=ontology_path, name_reasoner="JFact") + +# #------------------------------------------------------------------------------------------------------------------ + +# # Iterate +# for expression in expressions: + + +# target_concept = parser.parse_expression(expression) + +# # print(target_concept) +# # exit(0) + +# # Compute the groundtruth +# goal_retrieval = {i.str for i in symbolic_kb.individuals(target_concept)} + +# result_symbolic: Set[str] +# result_neural_symbolic: Set[str] + +# # retrieval operation with other reasoners +# result_symbolic = {i.str for i in (owlapi_adaptor.instances(target_concept,direct=False))} + +# # retrieval operation with ours (we just load from the csv data) +# result_neural_symbolic = df[df["Expression"]==expression]["Symbolic_Retrieval_Neural"].apply(ast.literal_eval) +# result_neural_symbolic = result_neural_symbolic.iloc[0] - - df = pd.DataFrame(data=data) +# # Compute the Jaccard similarity +# jaccard_sim_symbolic = jaccard_similarity(result_symbolic, goal_retrieval) +# jaccard_sim_neural = jaccard_similarity(result_neural_symbolic, goal_retrieval) + +# # Update for Averaging +# list_jaccard_neural.append(jaccard_sim_neural) +# list_jaccard_symbolic.append(jaccard_sim_symbolic) + +# data.append( +# { +# "Expression": expression, +# "Type": type(parser.parse_expression(expression)).__name__, +# "Jaccard_sym": 
jaccard_sim_symbolic, +# "Jaccard_EBR": jaccard_sim_neural, +# } +# ) + +# df = pd.DataFrame(data=data) - print(df) +# print(df) - avg_jaccard_sym = sum(list_jaccard_symbolic)/len(list_jaccard_symbolic) - avg_jaccard_neural = sum(list_jaccard_neural)/len(list_jaccard_neural) +# avg_jaccard_sym = sum(list_jaccard_symbolic)/len(list_jaccard_symbolic) +# avg_jaccard_neural = sum(list_jaccard_neural)/len(list_jaccard_neural) - print("Average jaccard symbolic", avg_jaccard_sym) - print("Average Jaccard neural", avg_jaccard_neural) +# print("Average jaccard symbolic", avg_jaccard_sym) +# print("Average Jaccard neural", avg_jaccard_neural) - owlapi_adaptor.stopJVM() #stop the standard reasoner +# owlapi_adaptor.stopJVM() #stop the standard reasoner def get_default_arguments(): parser = ArgumentParser() From 9e7eb507b2537edc4ee3e561da8897672ad13462 Mon Sep 17 00:00:00 2001 From: Louis-Mozart Date: Mon, 2 Sep 2024 13:01:38 +0200 Subject: [PATCH 03/12] Max cardinality error fixed --- examples/retrieval_eval.py | 15 +++++----- examples/retrieval_eval_under_incomplete.py | 32 +++++++++++++++------ ontolearn/owl_neural_reasoner.py | 1 + 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/examples/retrieval_eval.py b/examples/retrieval_eval.py index efe68119..cf10873d 100644 --- a/examples/retrieval_eval.py +++ b/examples/retrieval_eval.py @@ -130,10 +130,10 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]: data = [] # () Converted to list so that the progress bar works. 
concepts = list( - chain( - nc, unions, intersections, nnc, unnc, unions_unnc, intersections_unnc, - exist_unnc, for_all_unnc, - min_cardinality_unnc_1, min_cardinality_unnc_2, min_cardinality_unnc_3, + chain(for_all_unnc, + # nc, unions, intersections, nnc, unnc, unions_unnc, intersections_unnc, + # exist_unnc, for_all_unnc, + # min_cardinality_unnc_1, min_cardinality_unnc_2, min_cardinality_unnc_3, max_cardinality_unnc_1, max_cardinality_unnc_2, max_cardinality_unnc_3, # exist_nominals, ) @@ -153,8 +153,6 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]: # () Compute the F1-score. f1_sim = f1_set_similarity(retrieval_y, retrieval_neural_y) # () Store the data. - # print(expression) - # exit(0) data.append( { "Expression": owl_expression_to_dl(expression), @@ -162,6 +160,7 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]: "Jaccard Similarity": jaccard_sim, "F1": f1_sim, "Runtime Benefits": runtime_y - runtime_neural_y, + "Runtime Neural": runtime_neural_y, "Symbolic_Retrieval": retrieval_y, "Symbolic_Retrieval_Neural": retrieval_neural_y, } @@ -194,10 +193,10 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]: def get_default_arguments(): parser = ArgumentParser() - parser.add_argument("--path_kg", type=str, default="KGs/Family/family-benchmark_rich_background.owl") + parser.add_argument("--path_kg", type=str, default="KGs/Family/father.owl") parser.add_argument("--path_kge_model", type=str, default=None) parser.add_argument("--endpoint_triple_store", type=str, default=None) - parser.add_argument("--gamma", type=float, default=0.8) + parser.add_argument("--gamma", type=float, default=0.9) parser.add_argument("--seed", type=int, default=1) parser.add_argument("--ratio_sample_nc", type=float, default=None, help="To sample OWL Classes.") parser.add_argument("--ratio_sample_object_prob", type=float, default=None, help="To sample OWL Object Properties.") diff --git 
a/examples/retrieval_eval_under_incomplete.py b/examples/retrieval_eval_under_incomplete.py index b08a94bf..7d101f78 100644 --- a/examples/retrieval_eval_under_incomplete.py +++ b/examples/retrieval_eval_under_incomplete.py @@ -73,19 +73,23 @@ def execute(args): list_jaccard_neural = [] data = [] + # subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path_of_an_incomplete_kgs, "--ratio_sample_nc","0.1", "--ratio_sample_object_prob", "0.1"]) + subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path_of_an_incomplete_kgs]) + df = pd.read_csv("ALCQHI_Retrieval_Results.csv", index_col=0) - - if expressions is None: - expressions = {i for i in df["Expression"].to_list()} - else: - assert expressions == {i for i in df["Expression"].to_list()} + + expressions = {i for i in df["Expression"].to_list()} ontology_path = path_of_an_incomplete_kgs reasoners = ['HermiT', 'Pellet', 'JFact', 'Openllet'] reasoner_jaccards = {reasoner: [] for reasoner in reasoners} + reasoner_times = {reasoner: [] for reasoner in reasoners} # To store running times for expression in expressions: + + print("-"*100) + print("Expression:", expression) target_concept = parser.parse_expression(expression) goal_retrieval = {i.str for i in symbolic_kb.individuals(target_concept)} result_neural_symbolic = df[df["Expression"] == expression]["Symbolic_Retrieval_Neural"].apply(ast.literal_eval).iloc[0] @@ -96,15 +100,26 @@ def execute(args): "Incomplete_KG": path_of_an_incomplete_kgs.split('/')[-1], "Expression": expression, "Type": type(parser.parse_expression(expression)).__name__, - "Jaccard_EBR": jaccard_sim_neural + "Jaccard_EBR": jaccard_sim_neural, + "Runtime_EBR": df[df["Expression"] == expression]["Runtime Neural"].iloc[0] } for reasoner in reasoners: + + print(f"...Reasoner {reasoner} starts") + + start_time = time.time() # Start timing owlapi_adaptor = OWLAPIAdaptor(path=ontology_path, name_reasoner=reasoner) result_symbolic = {i.str for i in 
(owlapi_adaptor.instances(target_concept, direct=False))} + end_time = time.time() # End timing + + elapsed_time = end_time - start_time # Calculate elapsed time jaccard_sim_symbolic = jaccard_similarity(result_symbolic, goal_retrieval) reasoner_jaccards[reasoner].append(jaccard_sim_symbolic) + reasoner_times[reasoner].append(elapsed_time) # Store running time + result_row[f"Jaccard_{reasoner}"] = jaccard_sim_symbolic + result_row[f"Runtime_{reasoner}"] = elapsed_time data.append(result_row) @@ -112,10 +127,12 @@ def execute(args): avg_jaccard_neural = sum(list_jaccard_neural) / len(list_jaccard_neural) avg_jaccard_reasoners = {reasoner: sum(reasoner_jaccards[reasoner]) / len(reasoner_jaccards[reasoner]) for reasoner in reasoners} + avg_time_reasoners = {reasoner: sum(reasoner_times[reasoner]) / len(reasoner_times[reasoner]) for reasoner in reasoners} print(f"Average Jaccard neural ({path_of_an_incomplete_kgs}):", avg_jaccard_neural) for reasoner, avg_jaccard in avg_jaccard_reasoners.items(): print(f"Average Jaccard {reasoner} ({path_of_an_incomplete_kgs}):", avg_jaccard) + print(f"Average Runtime {reasoner} ({path_of_an_incomplete_kgs}):", avg_time_reasoners[reasoner]) # Create a final DataFrame from all results and write to a CSV file final_df = pd.DataFrame(all_results) @@ -126,7 +143,6 @@ def execute(args): print(f"Results have been saved to {final_csv_path}") owlapi_adaptor.stopJVM() # Stop the standard reasoner - @@ -231,7 +247,7 @@ def execute(args): def get_default_arguments(): parser = ArgumentParser() - parser.add_argument("--path_kg", type=str, default="KGs/Family/family-benchmark_rich_background.owl") + parser.add_argument("--path_kg", type=str, default="KGs/Family/father.owl") parser.add_argument("--seed", type=int, default=1) parser.add_argument("--ratio_sample_nc", type=float, default=None, help="To sample OWL Classes.") parser.add_argument("--ratio_sample_object_prob", type=float, default=None, help="To sample OWL Object Properties.") diff --git 
a/ontolearn/owl_neural_reasoner.py b/ontolearn/owl_neural_reasoner.py index 3fd18aeb..d3a2044e 100644 --- a/ontolearn/owl_neural_reasoner.py +++ b/ontolearn/owl_neural_reasoner.py @@ -48,6 +48,7 @@ def __init__(self, path_of_kb: str = None, args.embedding_dim = 512 args.batch_size = 1024 args.backend = "rdflib" + args.trainer = "PL" reports = Execute(args).start() path_neural_embedding = reports["path_experiment_folder"] self.model = KGE(path=path_neural_embedding) From 2235e543d0c10622ca7905557da778874ec74507 Mon Sep 17 00:00:00 2001 From: Louis-Mozart Date: Wed, 18 Sep 2024 16:55:16 +0200 Subject: [PATCH 04/12] Experiments with inconsistencies --- examples/incomplete_kb.py | 212 +++++++++++++++- examples/retrieval_eval.py | 8 +- examples/retrieval_eval_under_incomplete.py | 261 ++++++++------------ 3 files changed, 319 insertions(+), 162 deletions(-) diff --git a/examples/incomplete_kb.py b/examples/incomplete_kb.py index 1cab25f8..7dbf2e0f 100644 --- a/examples/incomplete_kb.py +++ b/examples/incomplete_kb.py @@ -107,4 +107,214 @@ def make_kb_incomplete(kb_path, output_path, rate, seed): # kb_path = "KGs/Family/father.owl" # output_path = f"incomplete_father_{rate}.owl" -# make_kb_incomplete(kb_path, output_path, rate, seed) + +def make_kb_inconsistent(kb_path, output_path, rate, seed, max_attempts=100): + """ + This function makes the knowledge base (KB) inconsistent by introducing incorrect statements. + + Parameters: + kb_path (str): Path to the original OWL ontology file. + output_path (str): Path to save the inconsistent ontology file. + rate (float): Percentage of incorrect statements to introduce (0-100). + seed (int): Seed for reproducibility. + max_attempts (int): Maximum attempts to find a valid incorrect statement. 
+ """ + + # Set the random seed for reproducibility + random.seed(seed) + + # Load the ontology + onto = get_ontology(kb_path).load() + + # Get all individuals, classes, and properties + all_individuals = list(onto.individuals()) + all_classes = list(onto.classes()) + all_object_properties = list(onto.object_properties()) + all_data_properties = list(onto.data_properties()) + + def count_triples(): + """Count the number of triples (statements) in the ontology.""" + return len(list(onto.world.sparql(""" + SELECT ?s ?p ?o + WHERE { + ?s ?p ?o . + } + """))) + + def generate_incorrect_class_assertion(individual): + """Generate an incorrect class assertion by adding a disjoint or contradictory class.""" + class_candidates = [cls for cls in all_classes if cls not in individual.is_a] + if not class_candidates: + return None + + selected_class = random.choice(class_candidates) + individual.is_a.append(selected_class) + print(f"Added incorrect class assertion: {individual} rdf:type {selected_class}") + return f"Added incorrect class assertion: {individual} rdf:type {selected_class}" + + def generate_incorrect_object_property(individual): + """Generate an incorrect object property assertion.""" + prop = random.choice(all_object_properties) + incorrect_object = random.choice(all_individuals) + + if incorrect_object not in prop[individual]: + prop[individual].append(incorrect_object) + print(f"Added incorrect object property assertion: {individual} {prop.name} {incorrect_object}") + return f"Added incorrect object property assertion: {individual} {prop.name} {incorrect_object}" + + def generate_incorrect_data_property(individual): + + """Generate an incorrect data property assertion (if exist in the KB).""" + if len(all_data_properties) != 0: + prop = random.choice(all_data_properties) + incorrect_value = "inconsistent_value" # Example of an incorrect data value + + if incorrect_value not in prop[individual]: + setattr(individual, prop.name, incorrect_value) + print(f"Added 
incorrect data property assertion: {individual} {prop.name} {incorrect_value}") + return f"Added incorrect data property assertion: {individual} {prop.name} {incorrect_value}" + + + + def insert_incorrect_statements(): + """Insert incorrect statements based on the specified rate.""" + num_triples = count_triples() # Use the total number of triples in the KB + num_incorrect = int(num_triples * (rate / 100)) + + # print(num_triples) + + incorrect_statements = [] + + for _ in range(num_incorrect): + attempts = 0 + while attempts < max_attempts: + individual = random.choice(all_individuals) + statement_type = random.choice(['class', 'object_property']) #could also add data properties later on + + if statement_type == 'class': + result = generate_incorrect_class_assertion(individual) + elif statement_type == 'object_property': + result = generate_incorrect_object_property(individual) + + + if result: + incorrect_statements.append(result) + break + + attempts += 1 + + # print(len(incorrect_statements)) + # exit(0) + + + return incorrect_statements + + # Insert incorrect statements + inconsistencies = insert_incorrect_statements() + + # Save the modified ontology + onto.save(file=output_path, format="rdfxml") + + # Return the list of inconsistencies added (for logging or debugging purposes) + return inconsistencies + + +# def make_kb_inconsistent(kb_path, output_path, rate, seed, max_attempts=100): + """ + This function makes the knowledge base (KB) inconsistent by introducing incorrect statements. + + Parameters: + kb_path (str): Path to the original OWL ontology file. + output_path (str): Path to save the inconsistent ontology file. + rate (float): Percentage of incorrect statements to introduce (0-100). + seed (int): Seed for reproducibility. + max_attempts (int): Maximum attempts to find a valid incorrect statement. 
+ """ + + # Set the random seed for reproducibility + random.seed(seed) + + # Load the ontology + onto = get_ontology(kb_path).load() + + # Get all individuals, classes, and properties + all_individuals = list(onto.individuals()) + all_classes = list(onto.classes()) + all_object_properties = list(onto.object_properties()) + all_data_properties = list(onto.data_properties()) + + def generate_incorrect_class_assertion(individual): + """Generate an incorrect class assertion by adding a disjoint or contradictory class.""" + class_candidates = [cls for cls in all_classes if cls not in individual.is_a] + if not class_candidates: + return None + + selected_class = random.choice(class_candidates) + individual.is_a.append(selected_class) + print(f"Added incorrect class assertion: {individual} rdf:type {selected_class}") + return f"Added incorrect class assertion: {individual} rdf:type {selected_class}" + + def generate_incorrect_object_property(individual): + """Generate an incorrect object property assertion.""" + prop = random.choice(all_object_properties) + incorrect_object = random.choice(all_individuals) + + if incorrect_object not in prop[individual]: + prop[individual].append(incorrect_object) + print(f"Added incorrect object property assertion: {individual} {prop.name} {incorrect_object}") + return f"Added incorrect object property assertion: {individual} {prop.name} {incorrect_object}" + + def generate_incorrect_data_property(individual): + + """Generate an incorrect data property assertion (if exist in the KB).""" + if len(all_data_properties) != 0: + prop = random.choice(all_data_properties) + incorrect_value = "inconsistent_value" # Example of an incorrect data value + + if incorrect_value not in prop[individual]: + setattr(individual, prop.name, incorrect_value) + print(f"Added incorrect data property assertion: {individual} {prop.name} {incorrect_value}") + return f"Added incorrect data property assertion: {individual} {prop.name} {incorrect_value}" + + + + 
def insert_incorrect_statements(): + """Insert incorrect statements based on the specified rate.""" + num_statements = len(list(onto.individuals())) # Approximation for total statements + num_incorrect = int(num_statements * (rate / 100)) + + # print(num_incorrect) + # exit(0) + + incorrect_statements = [] + + for _ in range(num_incorrect): + attempts = 0 + while attempts < max_attempts: + individual = random.choice(all_individuals) + statement_type = random.choice(['class', 'object_property', 'data_property']) + + if statement_type == 'class': + result = generate_incorrect_class_assertion(individual) + elif statement_type == 'object_property': + result = generate_incorrect_object_property(individual) + elif statement_type == 'data_property': + result = generate_incorrect_data_property(individual) + + if result: + incorrect_statements.append(result) + break + + attempts += 1 + print(len(incorrect_statements)) + exit(0) + return incorrect_statements + + # Insert incorrect statements + inconsistencies = insert_incorrect_statements() + + # Save the modified ontology + onto.save(file=output_path, format="rdfxml") + + return inconsistencies + diff --git a/examples/retrieval_eval.py b/examples/retrieval_eval.py index cf10873d..bdfd22dd 100644 --- a/examples/retrieval_eval.py +++ b/examples/retrieval_eval.py @@ -130,10 +130,10 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]: data = [] # () Converted to list so that the progress bar works. 
concepts = list( - chain(for_all_unnc, - # nc, unions, intersections, nnc, unnc, unions_unnc, intersections_unnc, - # exist_unnc, for_all_unnc, - # min_cardinality_unnc_1, min_cardinality_unnc_2, min_cardinality_unnc_3, + chain( + nc, unions, intersections, nnc, unnc, unions_unnc, intersections_unnc, + exist_unnc, for_all_unnc, + min_cardinality_unnc_1, min_cardinality_unnc_2, min_cardinality_unnc_3, max_cardinality_unnc_1, max_cardinality_unnc_2, max_cardinality_unnc_3, # exist_nominals, ) diff --git a/examples/retrieval_eval_under_incomplete.py b/examples/retrieval_eval_under_incomplete.py index 7d101f78..d628b9f2 100644 --- a/examples/retrieval_eval_under_incomplete.py +++ b/examples/retrieval_eval_under_incomplete.py @@ -20,15 +20,7 @@ # [] Create sub/incomplete KGs -def generated_incomplete_kg(kb_path: str, directory: str, n: int, ratio: float) -> Set[str]: - - # (1) - # TODO:CD: Ensure that randomness can be controlled via seed - # TODO:CD: Return a set of strings where each item corresponds ot the local path of a sub kg. - - #e.g. 
of how the file can be save - # kb_path = "KGs/Family/father.owl" - # output_path = f"incomplete_father_ratio_10_number_1.owl" +def generated_incomplete_kg(kb_path: str, directory: str, n: int, ratio: float, operation: str) -> Set[str]: name = kb_path.split('/')[-1].split('.')[0] rate = int(ratio * 100) @@ -39,13 +31,24 @@ def generated_incomplete_kg(kb_path: str, directory: str, n: int, ratio: float) for i in range(1, n + 1): - # output path for the incomplete KGs - output_path = f'{directory}/incomplete_{name}_ratio_{rate}_number_{i}.owl' + + if "incomplete" in operation: + + # output path for the incomplete KGs + output_path = f'{directory}/{operation}_{name}_ratio_{rate}_number_{i}.owl' - # Check if the file already exists - if not os.path.exists(output_path): - # If file does not exist, generate it - make_kb_incomplete(kb_path, output_path, rate, seed=i) + # Check if the file already exists + if not os.path.exists(output_path): + # If file does not exist, generate it + make_kb_incomplete(kb_path, output_path, rate, seed=i) + + else: + output_path = f'{directory}/{operation}_{name}_ratio_{rate}_number_{i}.owl' + + # Check if the file already exists + if not os.path.exists(output_path): + # If file does not exist, generate it + make_kb_inconsistent(kb_path, output_path, rate, seed=i) # Add the output path to the set file_paths.add(output_path) @@ -58,14 +61,16 @@ def execute(args): parser = DLSyntaxParser(namespace) name_KG = args.path_kg.split('/')[-1].split('.')[0] level_of_incompleteness_str = str(args.level_of_incompleteness).replace('.', '_') - directory = f"incomplete_{name_KG}_{level_of_incompleteness_str}" + directory = f"{args.operation}_{name_KG}_{level_of_incompleteness_str}" paths_of_incomplete_kgs = generated_incomplete_kg( kb_path=args.path_kg, directory=directory, n=args.number_of_incomplete_graphs, - ratio=args.level_of_incompleteness + ratio=args.level_of_incompleteness, + operation=args.operation ) - + path_report = 
f"{directory}/ALCQHI_Retrieval_Results.csv" + expressions = None all_results = [] @@ -73,11 +78,11 @@ def execute(args): list_jaccard_neural = [] data = [] - # subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path_of_an_incomplete_kgs, "--ratio_sample_nc","0.1", "--ratio_sample_object_prob", "0.1"]) + # subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path_of_an_incomplete_kgs, "--ratio_sample_nc","0.02", "--ratio_sample_object_prob", "0.2", "--path_report", path_report]) - subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path_of_an_incomplete_kgs]) + subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path_of_an_incomplete_kgs, "--path_report", path_report]) - df = pd.read_csv("ALCQHI_Retrieval_Results.csv", index_col=0) + df = pd.read_csv(f"{directory}/ALCQHI_Retrieval_Results.csv", index_col=0) expressions = {i for i in df["Expression"].to_list()} @@ -86,168 +91,108 @@ def execute(args): reasoner_jaccards = {reasoner: [] for reasoner in reasoners} reasoner_times = {reasoner: [] for reasoner in reasoners} # To store running times - for expression in expressions: - - print("-"*100) - print("Expression:", expression) - target_concept = parser.parse_expression(expression) - goal_retrieval = {i.str for i in symbolic_kb.individuals(target_concept)} - result_neural_symbolic = df[df["Expression"] == expression]["Symbolic_Retrieval_Neural"].apply(ast.literal_eval).iloc[0] - jaccard_sim_neural = jaccard_similarity(result_neural_symbolic, goal_retrieval) - list_jaccard_neural.append(jaccard_sim_neural) - - result_row = { - "Incomplete_KG": path_of_an_incomplete_kgs.split('/')[-1], - "Expression": expression, - "Type": type(parser.parse_expression(expression)).__name__, - "Jaccard_EBR": jaccard_sim_neural, - "Runtime_EBR": df[df["Expression"] == expression]["Runtime Neural"].iloc[0] - } - - for reasoner in reasoners: - - print(f"...Reasoner {reasoner} starts") - - start_time = time.time() # Start 
timing - owlapi_adaptor = OWLAPIAdaptor(path=ontology_path, name_reasoner=reasoner) - result_symbolic = {i.str for i in (owlapi_adaptor.instances(target_concept, direct=False))} - end_time = time.time() # End timing - - elapsed_time = end_time - start_time # Calculate elapsed time - jaccard_sim_symbolic = jaccard_similarity(result_symbolic, goal_retrieval) - reasoner_jaccards[reasoner].append(jaccard_sim_symbolic) - reasoner_times[reasoner].append(elapsed_time) # Store running time - - result_row[f"Jaccard_{reasoner}"] = jaccard_sim_symbolic - result_row[f"Runtime_{reasoner}"] = elapsed_time - - data.append(result_row) - - all_results.extend(data) - - avg_jaccard_neural = sum(list_jaccard_neural) / len(list_jaccard_neural) - avg_jaccard_reasoners = {reasoner: sum(reasoner_jaccards[reasoner]) / len(reasoner_jaccards[reasoner]) for reasoner in reasoners} - avg_time_reasoners = {reasoner: sum(reasoner_times[reasoner]) / len(reasoner_times[reasoner]) for reasoner in reasoners} - - print(f"Average Jaccard neural ({path_of_an_incomplete_kgs}):", avg_jaccard_neural) - for reasoner, avg_jaccard in avg_jaccard_reasoners.items(): - print(f"Average Jaccard {reasoner} ({path_of_an_incomplete_kgs}):", avg_jaccard) - print(f"Average Runtime {reasoner} ({path_of_an_incomplete_kgs}):", avg_time_reasoners[reasoner]) - - # Create a final DataFrame from all results and write to a CSV file - final_df = pd.DataFrame(all_results) - final_csv_path = f"{directory}/comparison_results.csv" - final_df.to_csv(final_csv_path, index=False) - - print(final_df.head()) - print(f"Results have been saved to {final_csv_path}") - - owlapi_adaptor.stopJVM() # Stop the standard reasoner - - - + owlapi_adaptor = OWLAPIAdaptor(path=ontology_path, name_reasoner='HermiT') -# def execute(args): + if owlapi_adaptor.has_consistent_ontology(): -# symbolic_kb = KnowledgeBase(path=args.path_kg) - -# namespace = list(symbolic_kb.ontology.classes_in_signature())[0].iri.get_namespace() - -# parser = 
DLSyntaxParser(namespace) - -# name_KG = args.path_kg.split('/')[-1].split('.')[0] - -# directory = f"incomplete_{name_KG}" - -# paths_of_incomplete_kgs = generated_incomplete_kg(kb_path=args.path_kg, directory=directory,\ -# n=args.number_of_incomplete_graphs, ratio=args.level_of_incompleteness) - -# expressions = None - -# for path_of_an_incomplete_kgs in paths_of_incomplete_kgs: - -# data = [] -# list_jaccard_symbolic = [] -# list_jaccard_neural = [] - -# subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path_of_an_incomplete_kgs]) -# # Load the results on the current view. -# df = pd.read_csv("ALCQHI_Retrieval_Results.csv", index_col=0) - -# # Sanity checking -# if expressions is None: -# expressions = {i for i in df["Expression"].to_list()} -# else: -# assert expressions == {i for i in df["Expression"].to_list()} + for expression in expressions: + + print("-"*100) + print("Expression:", expression) + target_concept = parser.parse_expression(expression) + goal_retrieval = {i.str for i in symbolic_kb.individuals(target_concept)} + result_neural_symbolic = df[df["Expression"] == expression]["Symbolic_Retrieval_Neural"].apply(ast.literal_eval).iloc[0] + jaccard_sim_neural = jaccard_similarity(result_neural_symbolic, goal_retrieval) + list_jaccard_neural.append(jaccard_sim_neural) + + result_row = { + "Incomplete_KG": path_of_an_incomplete_kgs.split('/')[-1], + "Expression": expression, + "Type": type(parser.parse_expression(expression)).__name__, + "Jaccard_EBR": jaccard_sim_neural, + "Runtime_EBR": df[df["Expression"] == expression]["Runtime Neural"].iloc[0] + } -# #---------------------------------------------------------------------------------------------------------------- -# # adding other reasoners for comparison + for reasoner in reasoners: -# ontology_path = path_of_an_incomplete_kgs -# # Available OWL Reasoners: 'HermiT', 'Pellet', 'JFact', 'Openllet' + owlapi_adaptor = OWLAPIAdaptor(path=ontology_path, name_reasoner=reasoner) -# 
reasoners = ['HermiT', 'Pellet', 'JFact', 'Openllet'] + print(f"...Reasoner {reasoner} starts") -# owlapi_adaptor = OWLAPIAdaptor(path=ontology_path, name_reasoner="JFact") + start_time = time.time() # Start timing -# #------------------------------------------------------------------------------------------------------------------ + result_symbolic = {i.str for i in (owlapi_adaptor.instances(target_concept, direct=False))} + end_time = time.time() # End timing + + elapsed_time = end_time - start_time # Calculate elapsed time + jaccard_sim_symbolic = jaccard_similarity(result_symbolic, goal_retrieval) + reasoner_jaccards[reasoner].append(jaccard_sim_symbolic) + reasoner_times[reasoner].append(elapsed_time) # Store running time + + result_row[f"Jaccard_{reasoner}"] = jaccard_sim_symbolic + result_row[f"Runtime_{reasoner}"] = elapsed_time -# # Iterate -# for expression in expressions: + + data.append(result_row) -# target_concept = parser.parse_expression(expression) + all_results.extend(data) -# # print(target_concept) -# # exit(0) + + avg_jaccard_neural = sum(list_jaccard_neural) / len(list_jaccard_neural) + avg_jaccard_reasoners = {reasoner: sum(reasoner_jaccards[reasoner]) / len(reasoner_jaccards[reasoner]) for reasoner in reasoners} + avg_time_reasoners = {reasoner: sum(reasoner_times[reasoner]) / len(reasoner_times[reasoner]) for reasoner in reasoners} -# # Compute the groundtruth -# goal_retrieval = {i.str for i in symbolic_kb.individuals(target_concept)} + print(f"Average Jaccard neural ({path_of_an_incomplete_kgs}):", avg_jaccard_neural) + for reasoner, avg_jaccard in avg_jaccard_reasoners.items(): + print(f"Average Jaccard {reasoner} ({path_of_an_incomplete_kgs}):", avg_jaccard) + print(f"Average Runtime {reasoner} ({path_of_an_incomplete_kgs}):", avg_time_reasoners[reasoner]) -# result_symbolic: Set[str] -# result_neural_symbolic: Set[str] + else: -# # retrieval operation with other reasoners -# result_symbolic = {i.str for i in 
(owlapi_adaptor.instances(target_concept,direct=False))} + for expression in expressions: -# # retrieval operation with ours (we just load from the csv data) -# result_neural_symbolic = df[df["Expression"]==expression]["Symbolic_Retrieval_Neural"].apply(ast.literal_eval) -# result_neural_symbolic = result_neural_symbolic.iloc[0] - -# # Compute the Jaccard similarity -# jaccard_sim_symbolic = jaccard_similarity(result_symbolic, goal_retrieval) -# jaccard_sim_neural = jaccard_similarity(result_neural_symbolic, goal_retrieval) + print("-"*100) + print("Expression:", expression) + + target_concept = parser.parse_expression(expression) + goal_retrieval = {i.str for i in symbolic_kb.individuals(target_concept)} + result_neural_symbolic = df[df["Expression"] == expression]["Symbolic_Retrieval_Neural"].apply(ast.literal_eval).iloc[0] + jaccard_sim_neural = jaccard_similarity(result_neural_symbolic, goal_retrieval) + list_jaccard_neural.append(jaccard_sim_neural) + + result_row = { + "Incomplete_KG": path_of_an_incomplete_kgs.split('/')[-1], + "Expression": expression, + "Type": type(parser.parse_expression(expression)).__name__, + "Jaccard_EBR": jaccard_sim_neural, + "Runtime_EBR": df[df["Expression"] == expression]["Runtime Neural"].iloc[0] + } + -# # Update for Averaging -# list_jaccard_neural.append(jaccard_sim_neural) -# list_jaccard_symbolic.append(jaccard_sim_symbolic) + data.append(result_row) -# data.append( -# { -# "Expression": expression, -# "Type": type(parser.parse_expression(expression)).__name__, -# "Jaccard_sym": jaccard_sim_symbolic, -# "Jaccard_EBR": jaccard_sim_neural, -# } -# ) - -# df = pd.DataFrame(data=data) + all_results.extend(data) + print("The Knowledge base is not consistent, hence other reasoners will fail") -# print(df) + # Create a final DataFrame from all results and write to a CSV file + final_df = pd.DataFrame(all_results) + final_csv_path = f"{directory}/comparison_results.csv" + final_df.to_csv(final_csv_path, index=False) -# 
avg_jaccard_sym = sum(list_jaccard_symbolic)/len(list_jaccard_symbolic) -# avg_jaccard_neural = sum(list_jaccard_neural)/len(list_jaccard_neural) + print(final_df.head()) + print(f"Results have been saved to {final_csv_path}") + + owlapi_adaptor.stopJVM() # Stop the standard reasoner + -# print("Average jaccard symbolic", avg_jaccard_sym) -# print("Average Jaccard neural", avg_jaccard_neural) -# owlapi_adaptor.stopJVM() #stop the standard reasoner def get_default_arguments(): parser = ArgumentParser() - parser.add_argument("--path_kg", type=str, default="KGs/Family/father.owl") + parser.add_argument("--path_kg", type=str, default="KGs/Family/family-benchmark_rich_background.owl") parser.add_argument("--seed", type=int, default=1) parser.add_argument("--ratio_sample_nc", type=float, default=None, help="To sample OWL Classes.") parser.add_argument("--ratio_sample_object_prob", type=float, default=None, help="To sample OWL Object Properties.") @@ -255,6 +200,8 @@ def get_default_arguments(): parser.add_argument("--number_of_incomplete_graphs", type = int, default=1) parser.add_argument("--level_of_incompleteness", type = float, default=0.1, \ help="Percentage of incompleteness from the original KGs between 0 and 1") + parser.add_argument("--operation", type = str, default= "incomplete", choices=["incomplete", "inconsistent"],\ + help = "Choose to make the KB incomplete or inconsistent") return parser.parse_args() From de6e73129cafd54794b252aef17d4ff06f02e9a7 Mon Sep 17 00:00:00 2001 From: Louis-Mozart Date: Mon, 30 Sep 2024 15:54:58 +0200 Subject: [PATCH 05/12] bash script added --- examples/retrieval_eval.py | 8 ++++++-- examples/retrieval_eval_under_incomplete.py | 4 ++-- ontolearn/owl_neural_reasoner.py | 1 + run_multiple_carcinogenesis.sh | 21 +++++++++++++++++++++ 4 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 run_multiple_carcinogenesis.sh diff --git a/examples/retrieval_eval.py b/examples/retrieval_eval.py index bdfd22dd..b5db22ed 100644 
--- a/examples/retrieval_eval.py +++ b/examples/retrieval_eval.py @@ -57,6 +57,7 @@ def execute(args): object_properties_and_inverse = object_properties.union(object_properties_inverse) # (6) NC: Named owl concepts. nc = {i for i in symbolic_kb.get_concepts()} + if args.ratio_sample_nc: # (6.1) Subsample if required. nc = {i for i in random.sample(population=list(nc), k=max(1, int(len(nc) * args.ratio_sample_nc)))} @@ -65,6 +66,7 @@ def execute(args): nnc = {i.get_object_complement_of() for i in nc} # (8) UNNC: NC UNION NC⁻. unnc = nc.union(nnc) + # (9) Retrieve 10 random Nominals. nominals = set(random.sample(symbolic_kb.all_individuals_set(), 3)) # (10) All Combinations of 3 for Nominals. @@ -131,15 +133,17 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]: # () Converted to list so that the progress bar works. concepts = list( chain( - nc, unions, intersections, nnc, unnc, unions_unnc, intersections_unnc, + nc, unions, intersections, nnc, unions_unnc, intersections_unnc, exist_unnc, for_all_unnc, min_cardinality_unnc_1, min_cardinality_unnc_2, min_cardinality_unnc_3, max_cardinality_unnc_1, max_cardinality_unnc_2, max_cardinality_unnc_3, - # exist_nominals, + exist_nominals, ) ) # () Shuffled the data so that the progress bar is not influenced by the order of concepts. 
+ random.shuffle(concepts) + # () Iterate over single OWL Class Expressions in ALCQIHO for expression in (tqdm_bar := tqdm(concepts, position=0, leave=True)): retrieval_y: Set[str] diff --git a/examples/retrieval_eval_under_incomplete.py b/examples/retrieval_eval_under_incomplete.py index d628b9f2..84f4939e 100644 --- a/examples/retrieval_eval_under_incomplete.py +++ b/examples/retrieval_eval_under_incomplete.py @@ -192,14 +192,14 @@ def execute(args): def get_default_arguments(): parser = ArgumentParser() - parser.add_argument("--path_kg", type=str, default="KGs/Family/family-benchmark_rich_background.owl") + parser.add_argument("--path_kg", type=str, default="KGs/Family/father.owl") parser.add_argument("--seed", type=int, default=1) parser.add_argument("--ratio_sample_nc", type=float, default=None, help="To sample OWL Classes.") parser.add_argument("--ratio_sample_object_prob", type=float, default=None, help="To sample OWL Object Properties.") parser.add_argument("--path_report", type=str, default="ALCQHI_Retrieval_Incomplete_Results.csv") parser.add_argument("--number_of_incomplete_graphs", type = int, default=1) parser.add_argument("--level_of_incompleteness", type = float, default=0.1, \ - help="Percentage of incompleteness from the original KGs between 0 and 1") + help="Percentage of incompleteness or inconsistency from the original KG between 0 and 1") parser.add_argument("--operation", type = str, default= "incomplete", choices=["incomplete", "inconsistent"],\ help = "Choose to make the KB incomplete or inconsistent") return parser.parse_args() diff --git a/ontolearn/owl_neural_reasoner.py b/ontolearn/owl_neural_reasoner.py index d3a2044e..05231c34 100644 --- a/ontolearn/owl_neural_reasoner.py +++ b/ontolearn/owl_neural_reasoner.py @@ -49,6 +49,7 @@ def __init__(self, path_of_kb: str = None, args.batch_size = 1024 args.backend = "rdflib" args.trainer = "PL" + # args.save_embeddings_as_csv = "True" reports = Execute(args).start() path_neural_embedding = 
reports["path_experiment_folder"] self.model = KGE(path=path_neural_embedding) diff --git a/run_multiple_carcinogenesis.sh b/run_multiple_carcinogenesis.sh new file mode 100644 index 00000000..5bf7e17e --- /dev/null +++ b/run_multiple_carcinogenesis.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Define the path to your Python script +PYTHON_SCRIPT="examples/retrieval_eval_under_incomplete.py" +path_kg="KGs/Carcinogenesis/carcinogenesis.owl" + +# Define the number of incomplete graphs +NUMBER_OF_INCOMPLETE_GRAPHS=5 + +# Define the list of levels of incompleteness +LEVELS_OF_INCOMPLETENESS=("0.8" "0.9") +# LEVELS_OF_INCOMPLETENESS=("0.1") + +# Iterate over each level of incompleteness +for LEVEL in "${LEVELS_OF_INCOMPLETENESS[@]}"; do + echo "Running with level_of_incompleteness=$LEVEL..." + python $PYTHON_SCRIPT --number_of_incomplete_graphs $NUMBER_OF_INCOMPLETE_GRAPHS --level_of_incompleteness $LEVEL --path_kg $path_kg + echo "Completed with level_of_incompleteness=$LEVEL." +done + +echo "All tasks completed." 
From 6a74c67164b757d353c0ea3cabef6ebf7cefc939 Mon Sep 17 00:00:00 2001 From: Louis-Mozart Date: Tue, 1 Oct 2024 18:44:44 +0200 Subject: [PATCH 06/12] Readme file updated --- README.md | 357 ++++---------------- examples/retrieval_eval_under_incomplete.py | 14 +- run_multiple_carcinogenesis.sh | 3 +- 3 files changed, 76 insertions(+), 298 deletions(-) diff --git a/README.md b/README.md index 41b89199..dee7677f 100644 --- a/README.md +++ b/README.md @@ -1,334 +1,109 @@ -[![Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://ontolearn-docs-dice-group.netlify.app/usage/09_further_resources#code-coverage) -[![Pypi](https://img.shields.io/badge/pypi-0.7.3-blue)](https://pypi.org/project/ontolearn/0.7.3/) -[![Docs](https://img.shields.io/badge/documentation-0.7.3-yellow)](https://ontolearn-docs-dice-group.netlify.app/usage/01_introduction) -[![Python](https://img.shields.io/badge/python-3.10.13+-4584b6)](https://www.python.org/downloads/release/python-31013/) -  -![Ontolearn](docs/_static/images/Ontolearn_logo.png) +## Neural Description Logic Reasoning over incomplete Knowledge Base -# Ontolearn: Learning OWL Class Expression - -*Ontolearn* is an open-source software library for learning owl class expressions at large scale. 
- -Given positive and negative [OWL named individual](https://www.w3.org/TR/owl2-syntax/#Individuals) examples -$E^+$ and $E^-$, learning [OWL Class expression](https://www.w3.org/TR/owl2-syntax/#Class_Expressions) problem refers to the following supervised Machine Learning problem - -$$\forall p \in E^+\ \mathcal{K} \models H(p) \wedge \forall n \in E^-\ \mathcal{K} \not \models H(n).$$ - -To tackle this supervised learning problem, ontolearn offers many symbolic, neuro-symbolic and deep learning based Learning algorithms: -- **Drill** → [Neuro-Symbolic Class Expression Learning](https://www.ijcai.org/proceedings/2023/0403.pdf) -- **EvoLearner** → [EvoLearner: Learning Description Logics with Evolutionary Algorithms](https://dl.acm.org/doi/abs/10.1145/3485447.3511925) -- **NCES2** → (soon) [Neural Class Expression Synthesis in ALCHIQ(D)](https://papers.dice-research.org/2023/ECML_NCES2/NCES2_public.pdf) -- **NCES** → [Neural Class Expression Synthesis](https://link.springer.com/chapter/10.1007/978-3-031-33455-9_13) -- **NERO** → (soon) [Learning Permutation-Invariant Embeddings for Description Logic Concepts](https://link.springer.com/chapter/10.1007/978-3-031-30047-9_9) -- **CLIP** → [Learning Concept Lengths Accelerates Concept Learning in ALC](https://link.springer.com/chapter/10.1007/978-3-031-06981-9_14) -- **CELOE** → [Class Expression Learning for Ontology Engineering](https://www.sciencedirect.com/science/article/abs/pii/S1570826811000023) -- **OCEL** → A limited version of CELOE - -Find more in the [Documentation](https://ontolearn-docs-dice-group.netlify.app/usage/01_introduction). +This repository provides the implementation of our reasoner EBR. With this repository, once can perform instance retrieval even within an incomplete and inconsistent knowldege base. EBR leverages KGE to perform reasoning over incomplete and inconsistent knowledge bases (KBs). 
We employ a neural link predictor to facilitate the retrieval of missing data and handle inconsistencies. ## Installation + ```shell -pip install ontolearn -``` -or -```shell -git clone https://github.com/dice-group/Ontolearn.git +git clone https://github.com/dice-group/Ontolearn.git -b retrieval_eval_incomplete # To create a virtual python env with conda -conda create -n venv python=3.10.14 --no-default-packages && conda activate venv && pip install -e . +conda create -n venv python=3.10.14 --no-default-packages && conda activate venv && pip install -e . && cd Ontolearn # To download knowledge graphs wget https://files.dice-research.org/projects/Ontolearn/KGs.zip -O ./KGs.zip && unzip KGs.zip -# To download learning problems -wget https://files.dice-research.org/projects/Ontolearn/LPs.zip -O ./LPs.zip && unzip LPs.zip -``` - -## Learning OWL Class Expression -```python -from ontolearn.learners import TDL -from ontolearn.triple_store import TripleStore -from ontolearn.knowledge_base import KnowledgeBase -from ontolearn.learning_problem import PosNegLPStandard -from owlapy.owl_individual import OWLNamedIndividual -from owlapy import owl_expression_to_sparql, owl_expression_to_dl -# (1) Initialize Triplestore or KnowledgeBase -# sudo docker run -p 3030:3030 -e ADMIN_PASSWORD=pw123 stain/jena-fuseki -# Login http://localhost:3030/#/ with admin and pw123 and upload KGs/Family/family.owl -# kb = TripleStore(url="http://localhost:3030/family") -kb = KnowledgeBase(path="KGs/Family/father.owl") -# (2) Initialize a learner. -model = TDL(knowledge_base=kb, use_nominals=True) -# (3) Define a description logic concept learning problem. -lp = PosNegLPStandard(pos={OWLNamedIndividual("http://example.com/father#stefan")}, - neg={OWLNamedIndividual("http://example.com/father#heinz"), - OWLNamedIndividual("http://example.com/father#anna"), - OWLNamedIndividual("http://example.com/father#michelle")}) -# (4) Learn description logic concepts best fitting (3). 
-h = model.fit(learning_problem=lp).best_hypotheses() -print(h) -print(owl_expression_to_dl(h)) -print(owl_expression_to_sparql(expression=h)) -""" -OWLObjectSomeValuesFrom(property=OWLObjectProperty(IRI('http://example.com/father#','hasChild')),filler=OWLObjectOneOf((OWLNamedIndividual(IRI('http://example.com/father#','markus')),))) - -∃ hasChild.{markus} - -SELECT - DISTINCT ?x WHERE { -?x ?s_1 . - FILTER ( ?s_1 IN ( - - ) ) - } -""" -print(model.classification_report) -""" -Classification Report: Negatives: -1 and Positives 1 - precision recall f1-score support - - Negative 1.00 1.00 1.00 3 - Positive 1.00 1.00 1.00 1 - - accuracy 1.00 4 - macro avg 1.00 1.00 1.00 4 -weighted avg 1.00 1.00 1.00 4 -""" ``` +Other datasets used in the paper can be found [here](https://files.dice-research.org/projects/NCES/NCES/datasets.zip) -## Learning OWL Class Expression over DBpedia -```python -from ontolearn.learners import TDL -from ontolearn.triple_store import TripleStore -from ontolearn.learning_problem import PosNegLPStandard -from owlapy.owl_individual import OWLNamedIndividual -from owlapy import owl_expression_to_sparql, owl_expression_to_dl -from ontolearn.utils.static_funcs import save_owl_class_expressions -# (1) Initialize Triplestore -kb = TripleStore(url="http://dice-dbpedia.cs.upb.de:9080/sparql") -# (3) Initialize a learner. -model = TDL(knowledge_base=kb) -# (4) Define a description logic concept learning problem. -lp = PosNegLPStandard(pos={OWLNamedIndividual("http://dbpedia.org/resource/Angela_Merkel")}, - neg={OWLNamedIndividual("http://dbpedia.org/resource/Barack_Obama")}) -# (5) Learn description logic concepts best fitting (4). -h = model.fit(learning_problem=lp).best_hypotheses() -print(h) -print(owl_expression_to_dl(h)) -print(owl_expression_to_sparql(expression=h)) -save_owl_class_expressions(expressions=h,path="owl_prediction") -``` - -Fore more please refer to the [examples](https://github.com/dice-group/Ontolearn/tree/develop/examples) folder. 
- -## ontolearn-webservice +## Retrieval results from Table 2 -
Click me! +To reproduce our results, run the commands below -Load an RDF knowledge graph -```shell -ontolearn-webservice --path_knowledge_base KGs/Mutagenesis/mutagenesis.owl -``` -or launch a Tentris instance https://github.com/dice-group/tentris over Mutagenesis. ```shell -ontolearn-webservice --endpoint_triple_store http://0.0.0.0:9080/sparql -``` -The below code trains DRILL with 6 randomly generated learning problems -provided that **path_to_pretrained_drill** does not lead to a directory containing pretrained DRILL. -Thereafter, trained DRILL is saved in the directory **path_to_pretrained_drill**. -Finally, trained DRILL will learn an OWL class expression. -```python -import json -import requests -with open(f"LPs/Mutagenesis/lps.json") as json_file: - learning_problems = json.load(json_file)["problems"] -for str_target_concept, examples in learning_problems.items(): - response = requests.get('http://0.0.0.0:8000/cel', - headers={'accept': 'application/json', 'Content-Type': 'application/json'}, - json={"pos": examples['positive_examples'], - "neg": examples['negative_examples'], - "model": "Drill", - "path_embeddings": "mutagenesis_embeddings/Keci_entity_embeddings.csv", - "path_to_pretrained_drill": "pretrained_drill", - # if pretrained_drill exists, upload, otherwise train one and save it there - "num_of_training_learning_problems": 2, - "num_of_target_concepts": 3, - "max_runtime": 60000, # seconds - "iter_bound": 1 # number of iterations/applied refinement opt. 
- }) - print(response.json()) # {'Prediction': '∀ hasAtom.(¬Nitrogen-34)', 'F1': 0.7283582089552239, 'saved_prediction': 'Predictions.owl'} -``` -TDL (a more scalable learner) can also be used as follows -```python -import json -import requests -with open(f"LPs/Mutagenesis/lps.json") as json_file: - learning_problems = json.load(json_file)["problems"] -for str_target_concept, examples in learning_problems.items(): - response = requests.get('http://0.0.0.0:8000/cel', - headers={'accept': 'application/json', 'Content-Type': 'application/json'}, - json={"pos": examples['positive_examples'], - "neg": examples['negative_examples'], - "model": "TDL"}) - print(response.json()) -``` - - -
+python examples/retrieval_eval.py --path_kg "KGs/Family/father.owl" +# Results of the Father dataset -## Benchmark Results - -
To see the results - -```shell -# To download learning problems. # Benchmark learners on the Family benchmark dataset with benchmark learning problems. -wget https://files.dice-research.org/projects/Ontolearn/LPs.zip -O ./LPs.zip && unzip LPs.zip +python examples/retrieval_eval.py --path_kg "KGs/Family/family-benchmark_rich_background.owl" +# Results of the Family dataset ``` -### 10-Fold Cross Validation Family Benchmark Results +For larger datasets, we have to sample the number of entities and relations. For the experiments to run fast, we need to select the type of instance we are interested from line 136-140 of this [file](examples/retrieval_eval.py). Below we only present how to get results on semnatic Bible but for other datasets can be obtain similarly by adding the corect path to the argument ```--path_kg```. -Here we apply 10-fold cross validation technique on each benchmark learning problem with max runtime of 60 seconds to measure the training and testing performance of learners. -In the evaluation, from a given single learning problem (a set of positive and negative examples), a learner learns an OWL Class Expression (H) on a given 9 fold of positive and negative examples. -To compute the training performance, We compute F1-score of H train positive and negative examples. -To compute the test performance, we compute F1-score of H w.r.t. test positive and negative examples. - ```shell -# To download learning problems and benchmark learners on the Family benchmark dataset with benchmark learning problems. 
-python examples/concept_learning_cv_evaluation.py --kb ./KGs/Family/family-benchmark_rich_background.owl --lps ./LPs/Family/lps_difficult.json --path_of_nces_embeddings ./NCESData/family/embeddings/ConEx_entity_embeddings.csv --path_of_clip_embeddings ./CLIPData/family/embeddings/ConEx_entity_embeddings.csv --max_runtime 60 --report family_results.csv -``` -In the following python script, the results are summarized and the markdown displayed below generated. -```python -import pandas as pd -df=pd.read_csv("family_results.csv").groupby("LP").mean() -print(df[[col for col in df if col.startswith('Test-F1') or col.startswith('RT')]].to_markdown(floatfmt=".3f")) -``` -**Note that DRILL is untrained and we simply used accuracy driven heuristics to learn an OWL class expression.** +# results on the semnatic bible data -Below, we report the average test F1 score and the average runtimes of learners. +python examples/retrieval_eval.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --seed 1 --ratio_sample_nc 1 --ratio_sample_object_prob 1 --path_report "ALCQI_semantic_seed_all_nc.csv" +# OWLClass expressions -| LP | Test-F1-OCEL | RT-OCEL | Test-F1-CELOE | RT-CELOE | Test-F1-Evo | RT-Evo | Test-F1-DRILL | RT-DRILL | Test-F1-TDL | RT-TDL | Test-F1-NCES | RT-NCES | Test-F1-CLIP | RT-CLIP | -|:------------------:|-------------:|--------:|--------------:|---------:|------------:|-------:|--------------:|---------:|------------:|-------:|-------------:|--------:|-------------:|--------:| -| Aunt | 0.614 | 13.697 | 0.855 | 13.697 | 0.978 | 5.278 | 0.811 | 60.351 | 0.956 | 0.118 | 0.812 | 1.168 | 0.855 | 14.059 | -| Cousin | 0.712 | 10.846 | 0.789 | 10.846 | 0.993 | 3.311 | 0.701 | 60.485 | 0.820 | 0.176 | 0.677 | 1.050 | 0.779 | 9.050 | -| Grandgranddaughter | 1.000 | 0.013 | 1.000 | 0.013 | 1.000 | 0.426 | 0.980 | 17.486 | 1.000 | 0.050 | 1.000 | 0.843 | 1.000 | 0.639 | -| Grandgrandfather | 1.000 | 0.897 | 1.000 | 0.897 | 1.000 | 0.404 | 0.947 | 55.728 | 0.947 | 0.059 | 
0.927 | 0.902 | 1.000 | 0.746 | -| Grandgrandmother | 1.000 | 4.173 | 1.000 | 4.173 | 1.000 | 0.442 | 0.893 | 50.329 | 0.947 | 0.060 | 0.927 | 0.908 | 1.000 | 0.817 | -| Grandgrandson | 1.000 | 1.632 | 1.000 | 1.632 | 1.000 | 0.452 | 0.931 | 60.358 | 0.911 | 0.070 | 0.911 | 1.050 | 1.000 | 0.939 | -| Uncle | 0.876 | 16.244 | 0.891 | 16.244 | 0.964 | 4.516 | 0.876 | 60.416 | 0.933 | 0.098 | 0.891 | 1.256 | 0.928 | 17.682 | +python examples/retrieval_eval.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --seed 1 --ratio_sample_nc .5 --ratio_sample_object_prob .5 --path_report "ALCQI_semantic_seed_1_ratio_0.5_unions.csv" +# OWLObjectUnionOf +python examples/retrieval_eval.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --seed 1 --ratio_sample_nc 1 --ratio_sample_object_prob 1 --path_report "ALCQI_semantic_seed_1_interALCQI_semantic_seed_all_nc.csv" +# OWLObjectComplementOf -| LP | Train-F1-OCEL | Train-F1-CELOE | Train-F1-Evo | Train-F1-DRILL | Train-F1-TDL | Train-F1-NCES | Train-F1-CLIP | -|:------------------:|--------------:|---------------:|-------------:|---------------:|-------------:|----------------:|----------------:| -| Aunt | 0.835 | 0.918 | 0.995 | 0.837 | 1.000 | 0.804 | 0.918 | -| Cousin | 0.746 | 0.796 | 1.000 | 0.732 | 1.000 | 0.681 | 0.798 | -| Grandgranddaughter | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | -| Grandgrandfather | 1.000 | 1.000 | 1.000 | 0.968 | 1.000 | 0.973 | 1.000 | -| Grandgrandmother | 1.000 | 1.000 | 1.000 | 0.975 | 1.000 | 0.939 | 1.000 | -| Grandgrandson | 1.000 | 1.000 | 1.000 | 0.962 | 1.000 | 0.927 | 1.000 | -| Uncle | 0.904 | 0.907 | 0.996 | 0.908 | 1.000 | 0.884 | 0.940 | +python examples/retrieval_eval.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --seed 1 --ratio_sample_nc .5 --ratio_sample_object_prob .5 --path_report "ALCQI_semantic_seed_1_ratio_0.5_inter.csv" +# OWLObjectIntersectionOf +python examples/retrieval_eval.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --seed 1 
--ratio_sample_nc .2 --ratio_sample_object_prob .2 --path_report "ALCQI_semantic_seed_1_ratio_02_exits.csv" +# OWLObjectSomeValuesFrom -### 10-Fold Cross Validation Mutagenesis Benchmark Results -```shell -python examples/concept_learning_cv_evaluation.py --kb ./KGs/Mutagenesis/mutagenesis.owl --lps ./LPs/Mutagenesis/lps.json --path_of_nces_embeddings ./NCESData/mutagenesis/embeddings/ConEx_entity_embeddings.csv --path_of_clip_embeddings ./CLIPData/mutagenesis/embeddings/ConEx_entity_embeddings.csv --max_runtime 60 --report mutagenesis_results.csv -``` - -| LP | Train-F1-OCEL | Test-F1-OCEL | RT-OCEL | Train-F1-CELOE | Test-F1-CELOE | RT-CELOE | Train-F1-Evo | Test-F1-Evo | RT-Evo | Train-F1-DRILL | Test-F1-DRILL | RT-DRILL | Train-F1-TDL | Test-F1-TDL | RT-TDL | Train-F1-NCES | Test-F1-NCES | RT-NCES | Train-F1-CLIP | Test-F1-CLIP | RT-CLIP | -|:---------|--------------:|-------------:|--------:|---------------:|--------------:|---------:|-------------:|------------:|-------:|---------------:|--------------:|---------:|-------------:|------------:|-------:|--------------:|-------------:|--------:|--------------:|-------------:|--------:| -| NotKnown | 0.916 | 0.918 | 60.705 | 0.916 | 0.918 | 60.705 | 0.975 | 0.970 | 51.870 | 0.809 | 0.804 | 60.140 | 1.000 | 0.852 | 13.569 | 0.717 | 0.718 | 3.784 | 0.916 | 0.918 | 26.312| +python examples/retrieval_eval.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --seed 1 --ratio_sample_nc .2 --ratio_sample_object_prob .2 --path_report "ALCQI_semantic_seed_1_ratio_02_forall.csv" +# OWLObjectAllValuesFrom +python examples/retrieval_eval.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --seed 1 --ratio_sample_nc .1 --ratio_sample_object_prob .1 --path_report "ALCQI_semantic_seed_1_ratio_02_min_card.csv" +# minimum cardinality restrictions, n = {1,2,3} - -### 10-Fold Cross Validation Carcinogenesis Benchmark Results -```shell -python examples/concept_learning_cv_evaluation.py --kb ./KGs/Carcinogenesis/carcinogenesis.owl 
--lps ./LPs/Carcinogenesis/lps.json --path_of_nces_embeddings ./NCESData/carcinogenesis/embeddings/ConEx_entity_embeddings.csv --path_of_clip_embeddings ./CLIPData/carcinogenesis/embeddings/ConEx_entity_embeddings.csv --max_runtime 60 --report carcinogenesis_results.csv +python examples/retrieval_eval.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --seed 1 --ratio_sample_nc .1 --ratio_sample_object_prob .1 --path_report "ALCQI_semantic_seed_1_ratio_02_max_card.csv" +# max cardinality restrictions, n = {1,2,3} ``` -| LP | Train-F1-OCEL | Test-F1-OCEL | RT-OCEL | Train-F1-CELOE | Test-F1-CELOE | RT-CELOE | Train-F1-Evo | Test-F1-Evo | RT-Evo | Train-F1-DRILL | Test-F1-DRILL | RT-DRILL | Train-F1-TDL | Test-F1-TDL | RT-TDL | Train-F1-NCES | Test-F1-NCES | RT-NCES | Train-F1-CLIP | Test-F1-CLIP | RT-CLIP | -|:---------|--------------:|-------------:|--------:|---------------:|--------------:|---------:|-------------:|------------:|-------:|---------------:|--------------:|---------:|-------------:|------------:|-------:|--------------:|-------------:|--------:|--------------:|-------------:|--------:| -| NOTKNOWN | 0.737 | 0.711 | 62.048 | 0.740 | 0.701 | 62.048 | 0.822 | 0.628 | 64.508 | 0.740 | 0.707 | 60.120 | 1.000 | 0.616 | 5.196 | 0.705 | 0.704 | 4.157 | 0.740 | 0.701 | 48.475| +## Results from Table 3 -
+To obtain the results From Table 3, run the following commands: -## Development +```shell +python examples/retrieval_eval_under_incomplete.py --path_kg "KGs/Family/father.owl" --level_of_incompleteness 0.4 --operation "incomplete" --number_of_incomplete_graphs 5 +# Results of the Father dataset +python examples/retrieval_eval_under_incomplete.py --path_kg "KGs/Family/family-benchmark_rich_background.owl" --level_of_incompleteness 0.4 --operation "incomplete" --number_of_incomplete_graphs 5 +# Results of the Family dataset -
To see the results +python examples/retrieval_eval_under_incomplete.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --level_of_incompleteness 0.4 --operation "incomplete" --number_of_incomplete_graphs 5 --sample Yes +# Results of the Semantic Bible dataset -Creating a feature branch **refactoring** from development branch +python examples/retrieval_eval_under_incomplete.py --path_kg "KGs/Mutagenesis/mutagenesis.owl" --level_of_incompleteness 0.4 --operation "incomplete" --number_of_incomplete_graphs 5 --sample Yes +# Results of the Mutagenesis dataset -```shell -git branch refactoring develop +python examples/retrieval_eval_under_incomplete.py --path_kg "KGs/Mutagenesis/mutagenesis.owl" --level_of_incompleteness 0.4 --operation "incomplete" --number_of_incomplete_graphs 5 --sample Yes +# Results of the Carcinogenesis dataset ``` +To get the results with other ratio (0.1, 0.2, 0.6, 0.8, 0.9 etc...), just add it after the argument ```--level_of_incompleteness``` and run the same command. For results on inconcistencies, just change the argument ```--operation``` to "inconsistent" (this will not necessary make the KB inconsistent but will add noises in the data at the choosen level). See below for an example on the Father and Family datasets. -Each feature branch must be merged to develop branch. 
To this end, the tests must run without a problem: ```shell -# To download knowledge graphs -wget https://files.dice-research.org/projects/Ontolearn/KGs.zip -O ./KGs.zip && unzip KGs.zip -# To download learning problems -wget https://files.dice-research.org/projects/Ontolearn/LPs.zip -O ./LPs.zip && unzip LPs.zip -# Download weights for some model for few tests -wget https://files.dice-research.org/projects/NCES/NCES_Ontolearn_Data/NCESData.zip -O ./NCESData.zip && unzip NCESData.zip && rm NCESData.zip -wget https://files.dice-research.org/projects/Ontolearn/CLIP/CLIPData.zip && unzip CLIPData.zip && rm CLIPData.zip -pytest -p no:warnings -x # Running 76 tests takes ~ 17 mins -``` - +python examples/retrieval_eval_under_incomplete.py --path_kg "KGs/Family/father.owl" --level_of_incompleteness 0.4 --operation "inconsistent" --number_of_incomplete_graphs 5 +# Results of the Father dataset +python examples/retrieval_eval_under_incomplete.py --path_kg "KGs/Family/family-benchmark_rich_background.owl" --level_of_incompleteness 0.4 --operation "inconsistent" --number_of_incomplete_graphs 5 +# Results of the Family dataset +``` -
+Or more simply, just create a bash file as shown [here](run_multiple_carcinogenesis.sh) for the carcinogenesis and execute it using -## References -Currently, we are working on our manuscript describing our framework. -If you find our work useful in your research, please consider citing the respective paper: -``` -# DRILL -@inproceedings{demir2023drill, - author = {Demir, Caglar and Ngomo, Axel-Cyrille Ngonga}, - booktitle = {The 32nd International Joint Conference on Artificial Intelligence, IJCAI 2023}, - title = {Neuro-Symbolic Class Expression Learning}, - url = {https://www.ijcai.org/proceedings/2023/0403.pdf}, - year={2023} -} - -# NCES2 -@inproceedings{kouagou2023nces2, -author={Kouagou, N'Dah Jean and Heindorf, Stefan and Demir, Caglar and Ngonga Ngomo, Axel-Cyrille}, -title={Neural Class Expression Synthesis in ALCHIQ(D)}, -url = {https://papers.dice-research.org/2023/ECML_NCES2/NCES2_public.pdf}, -booktitle={Machine Learning and Knowledge Discovery in Databases}, -year={2023}, -publisher={Springer Nature Switzerland}, -address="Cham" -} - -# NCES -@inproceedings{kouagou2023neural, - title={Neural class expression synthesis}, - author={Kouagou, N’Dah Jean and Heindorf, Stefan and Demir, Caglar and Ngonga Ngomo, Axel-Cyrille}, - booktitle={European Semantic Web Conference}, - pages={209--226}, - year={2023}, - publisher={Springer Nature Switzerland} -} - -# EvoLearner -@inproceedings{heindorf2022evolearner, - title={Evolearner: Learning description logics with evolutionary algorithms}, - author={Heindorf, Stefan and Bl{\"u}baum, Lukas and D{\"u}sterhus, Nick and Werner, Till and Golani, Varun Nandkumar and Demir, Caglar and Ngonga Ngomo, Axel-Cyrille}, - booktitle={Proceedings of the ACM Web Conference 2022}, - pages={818--828}, - year={2022} -} - - -# CLIP -@inproceedings{kouagou2022learning, - title={Learning Concept Lengths Accelerates Concept Learning in ALC}, - author={Kouagou, N’Dah Jean and Heindorf, Stefan and Demir, Caglar and Ngonga Ngomo, 
Axel-Cyrille}, - booktitle={European Semantic Web Conference}, - pages={236--252}, - year={2022}, - publisher={Springer Nature Switzerland} -} +```shell +chmod +x run_multiple_carcinogenesis.sh +``` +This will make the file executable then do +```shell +./run_multiple_carcinogenesis.sh ``` +This will run the carcinogenesis data with different level of inconsistencies. + +## Example of Concepts retrieval results on Father dataset: -In case you have any question, please contact: ```caglar.demir@upb.de``` or ```caglardemir8@gmail.com``` +| | Expression | Type | Jaccard Similarity | F1 | Runtime Benefits | Runtime EBR | Symbolic Retrieval | EBR Retrieval | +|---|------------------------|--------------------------|--------------------|-----|-----------------------|-----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 0 | female ⊓ male | OWLObjectIntersectionOf | 1.0 | 1.0 | 0.054 | 0.003 | set() | set() | +| 1 | ∃ hasChild.female | OWLObjectSomeValuesFrom | 1.0 | 1.0 | -0.001 | 0.001 | {'http://example.com/father#markus'} | {'http://example.com/father#markus'} | +| 2 | person ⊔ (¬person) | OWLObjectUnionOf | 1.0 | 1.0 | -0.003 | 0.003 | {'http://example.com/father#martin', 'http://example.com/father#stefan', 'http://example.com/father#markus', 'http://example.com/father#anna', 'http://example.com/father#michelle', 'http://example.com/father#heinz'} | {'http://example.com/father#martin', 'http://example.com/father#stefan', 'http://example.com/father#markus', 'http://example.com/father#anna', 'http://example.com/father#michelle', 'http://example.com/father#heinz'} | +| 3 | person ⊓ person | OWLObjectIntersectionOf | 1.0 | 1.0 | -0.002 | 0.002 | 
{'http://example.com/father#martin', 'http://example.com/father#stefan', 'http://example.com/father#markus', 'http://example.com/father#anna', 'http://example.com/father#michelle', 'http://example.com/father#heinz'} | {'http://example.com/father#martin', 'http://example.com/father#stefan', 'http://example.com/father#markus', 'http://example.com/father#anna', 'http://example.com/father#michelle', 'http://example.com/father#heinz'} | +| 4 | person ⊔ person | OWLObjectUnionOf | 1.0 | 1.0 | -0.002 | 0.002 | {'http://example.com/father#martin', 'http://example.com/father#stefan', 'http://example.com/father#markus', 'http://example.com/father#anna', 'http://example.com/father#michelle', 'http://example.com/father#heinz'} | {'http://example.com/father#martin', 'http://example.com/father#stefan', 'http://example.com/father#anna', 'http://example.com/father#markus', 'http://example.com/father#michelle', 'http://example.com/father#heinz'} | diff --git a/examples/retrieval_eval_under_incomplete.py b/examples/retrieval_eval_under_incomplete.py index 84f4939e..f9ad3eaf 100644 --- a/examples/retrieval_eval_under_incomplete.py +++ b/examples/retrieval_eval_under_incomplete.py @@ -78,9 +78,10 @@ def execute(args): list_jaccard_neural = [] data = [] - # subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path_of_an_incomplete_kgs, "--ratio_sample_nc","0.02", "--ratio_sample_object_prob", "0.2", "--path_report", path_report]) - - subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path_of_an_incomplete_kgs, "--path_report", path_report]) + if args.sample == "Yes": + subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path_of_an_incomplete_kgs, "--ratio_sample_nc","0.02", "--ratio_sample_object_prob", "0.2", "--path_report", path_report]) + else: + subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path_of_an_incomplete_kgs, "--path_report", path_report]) df = pd.read_csv(f"{directory}/ALCQHI_Retrieval_Results.csv", 
index_col=0) @@ -197,11 +198,12 @@ def get_default_arguments(): parser.add_argument("--ratio_sample_nc", type=float, default=None, help="To sample OWL Classes.") parser.add_argument("--ratio_sample_object_prob", type=float, default=None, help="To sample OWL Object Properties.") parser.add_argument("--path_report", type=str, default="ALCQHI_Retrieval_Incomplete_Results.csv") - parser.add_argument("--number_of_incomplete_graphs", type = int, default=1) - parser.add_argument("--level_of_incompleteness", type = float, default=0.1, \ + parser.add_argument("--number_of_incomplete_graphs", type=int, default=1) + parser.add_argument("--level_of_incompleteness", type=float, default=0.1, \ help="Percentage of incompleteness or inconsistency from the original KG between 0 and 1") - parser.add_argument("--operation", type = str, default= "incomplete", choices=["incomplete", "inconsistent"],\ + parser.add_argument("--operation", type=str, default="incomplete", choices=["incomplete", "inconsistent"],\ help = "Choose to make the KB incomplete or inconsistent") + parser.add_argument("--sample", type=str, default="No", choices=["No", "Yes"], help = "Sample if needed") return parser.parse_args() diff --git a/run_multiple_carcinogenesis.sh b/run_multiple_carcinogenesis.sh index 5bf7e17e..f7ce02df 100644 --- a/run_multiple_carcinogenesis.sh +++ b/run_multiple_carcinogenesis.sh @@ -1,4 +1,5 @@ #!/bin/bash +#You can directly choose the needed operation inside the file examples/retrieval_eval_under_incomplete.py. 
# Define the path to your Python script PYTHON_SCRIPT="examples/retrieval_eval_under_incomplete.py" @@ -8,7 +9,7 @@ path_kg="KGs/Carcinogenesis/carcinogenesis.owl" NUMBER_OF_INCOMPLETE_GRAPHS=5 # Define the list of levels of incompleteness -LEVELS_OF_INCOMPLETENESS=("0.8" "0.9") +LEVELS_OF_INCOMPLETENESS=("0.4" "0.8" "0.9") # LEVELS_OF_INCOMPLETENESS=("0.1") # Iterate over each level of incompleteness From 471e140f574e18bdd472561a53996ca7d56bfba9 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Wed, 2 Oct 2024 08:57:37 +0200 Subject: [PATCH 07/12] Update README.md Links are removed or shortened to ensure the anonymity --- README.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index dee7677f..aef91641 100644 --- a/README.md +++ b/README.md @@ -5,15 +5,13 @@ This repository provides the implementation of our reasoner EBR. With this repos ## Installation - ```shell -git clone https://github.com/dice-group/Ontolearn.git -b retrieval_eval_incomplete # To create a virtual python env with conda conda create -n venv python=3.10.14 --no-default-packages && conda activate venv && pip install -e . 
&& cd Ontolearn -# To download knowledge graphs -wget https://files.dice-research.org/projects/Ontolearn/KGs.zip -O ./KGs.zip && unzip KGs.zip +# To download the benchmark datasets knowledge graphs +wget https://shorturl.at/0T4WJ -O ./KGs.zip && unzip KGs.zip ``` -Other datasets used in the paper can be found [here](https://files.dice-research.org/projects/NCES/NCES/datasets.zip) +Other datasets used in the paper can be found [here](https://shorturl.at/v28n0) ## Retrieval results from Table 2 From e9c46ba95f79a0a094eff975db151bc0dea3a7bf Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Wed, 2 Oct 2024 09:01:18 +0200 Subject: [PATCH 08/12] Update README.md Implementation based on ontolearn info added --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index aef91641..bf0c3014 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,8 @@ This repository provides the implementation of our reasoner EBR. With this repository, once can perform instance retrieval even within an incomplete and inconsistent knowldege base. EBR leverages KGE to perform reasoning over incomplete and inconsistent knowledge bases (KBs). We employ a neural link predictor to facilitate the retrieval of missing data and handle inconsistencies. +We based our implementation on [Ontolearn](https://github.com/dice-group/Ontolearn). We would like to thank for the readable codebase. 
+ ## Installation ```shell From edae1763b550f2181253fd091b3f100d1d892111 Mon Sep 17 00:00:00 2001 From: Louis-Mozart Date: Mon, 7 Oct 2024 17:54:01 +0200 Subject: [PATCH 09/12] Code now reproducible --- examples/retrieval_eval.py | 25 ++++++++----- examples/retrieval_eval_under_incomplete.py | 40 +++++++++++---------- 2 files changed, 37 insertions(+), 28 deletions(-) diff --git a/examples/retrieval_eval.py b/examples/retrieval_eval.py index b5db22ed..d3ef394c 100644 --- a/examples/retrieval_eval.py +++ b/examples/retrieval_eval.py @@ -46,22 +46,29 @@ def execute(args): ################################################################### # GENERATE ALCQ CONCEPTS TO EVALUATE RETRIEVAL PERFORMANCES # (3) R: Extract object properties. - object_properties = {i for i in symbolic_kb.get_object_properties()} + object_properties = sorted({i for i in symbolic_kb.get_object_properties()}) + # (3.1) Subsample if required. if args.ratio_sample_object_prob: object_properties = {i for i in random.sample(population=list(object_properties), - k=max(1, int(len(object_properties) * args.ratio_sample_nc)))} + k=max(1, int(len(object_properties) * args.ratio_sample_object_prob)))} + + object_properties = set(object_properties) + # (4) R⁻: Inverse of object properties. object_properties_inverse = {i.get_inverse_property() for i in object_properties} + # (5) R*: R UNION R⁻. object_properties_and_inverse = object_properties.union(object_properties_inverse) # (6) NC: Named owl concepts. - nc = {i for i in symbolic_kb.get_concepts()} + nc = sorted({i for i in symbolic_kb.get_concepts()}) + if args.ratio_sample_nc: # (6.1) Subsample if required. nc = {i for i in random.sample(population=list(nc), k=max(1, int(len(nc) * args.ratio_sample_nc)))} + nc = set(nc) # return to a set # (7) NC⁻: Complement of NC. nnc = {i.get_object_complement_of() for i in nc} # (8) UNNC: NC UNION NC⁻. 
@@ -133,15 +140,15 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]: # () Converted to list so that the progress bar works. concepts = list( chain( - nc, unions, intersections, nnc, unions_unnc, intersections_unnc, - exist_unnc, for_all_unnc, - min_cardinality_unnc_1, min_cardinality_unnc_2, min_cardinality_unnc_3, - max_cardinality_unnc_1, max_cardinality_unnc_2, max_cardinality_unnc_3, - exist_nominals, + nc, unions, intersections, nnc, #unions_unnc, intersections_unnc, + # exist_unnc, for_all_unnc, + # min_cardinality_unnc_1, min_cardinality_unnc_2, min_cardinality_unnc_3, + # max_cardinality_unnc_1, max_cardinality_unnc_2, max_cardinality_unnc_3, + # exist_nominals, ) ) # () Shuffled the data so that the progress bar is not influenced by the order of concepts. - + random.shuffle(concepts) # () Iterate over single OWL Class Expressions in ALCQIHO diff --git a/examples/retrieval_eval_under_incomplete.py b/examples/retrieval_eval_under_incomplete.py index f9ad3eaf..a712bfa0 100644 --- a/examples/retrieval_eval_under_incomplete.py +++ b/examples/retrieval_eval_under_incomplete.py @@ -5,7 +5,8 @@ from ontolearn.knowledge_base import KnowledgeBase import pandas as pd from typing import Set -from incomplete_kb import * +import time +from incomplete_kb import make_kb_incomplete, make_kb_inconsistent import os from ontolearn.utils import jaccard_similarity import subprocess @@ -20,7 +21,7 @@ # [] Create sub/incomplete KGs -def generated_incomplete_kg(kb_path: str, directory: str, n: int, ratio: float, operation: str) -> Set[str]: +def generate_subgraphs(kb_path: str, directory: str, n: int, ratio: float, operation: str) -> Set[str]: name = kb_path.split('/')[-1].split('.')[0] rate = int(ratio * 100) @@ -60,13 +61,13 @@ def execute(args): namespace = list(symbolic_kb.ontology.classes_in_signature())[0].iri.get_namespace() parser = DLSyntaxParser(namespace) name_KG = args.path_kg.split('/')[-1].split('.')[0] - level_of_incompleteness_str = 
str(args.level_of_incompleteness).replace('.', '_') - directory = f"{args.operation}_{name_KG}_{level_of_incompleteness_str}" - paths_of_incomplete_kgs = generated_incomplete_kg( + ratio_str = str(args.ratio).replace('.', '_') + directory = f"{args.operation}_{name_KG}_{ratio_str}" + paths_of_subgraphs = generate_subgraphs( kb_path=args.path_kg, directory=directory, - n=args.number_of_incomplete_graphs, - ratio=args.level_of_incompleteness, + n=args.number_of_subgraphs, + ratio=args.ratio, operation=args.operation ) path_report = f"{directory}/ALCQHI_Retrieval_Results.csv" @@ -74,20 +75,21 @@ def execute(args): expressions = None all_results = [] - for path_of_an_incomplete_kgs in paths_of_incomplete_kgs: + for path in paths_of_subgraphs: + list_jaccard_neural = [] data = [] if args.sample == "Yes": - subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path_of_an_incomplete_kgs, "--ratio_sample_nc","0.02", "--ratio_sample_object_prob", "0.2", "--path_report", path_report]) + subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path, "--ratio_sample_nc","0.2", "--ratio_sample_object_prob", "0.1", "--path_report", path_report]) else: - subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path_of_an_incomplete_kgs, "--path_report", path_report]) + subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path, "--path_report", path_report]) df = pd.read_csv(f"{directory}/ALCQHI_Retrieval_Results.csv", index_col=0) expressions = {i for i in df["Expression"].to_list()} - ontology_path = path_of_an_incomplete_kgs + ontology_path = path reasoners = ['HermiT', 'Pellet', 'JFact', 'Openllet'] reasoner_jaccards = {reasoner: [] for reasoner in reasoners} reasoner_times = {reasoner: [] for reasoner in reasoners} # To store running times @@ -108,7 +110,7 @@ def execute(args): list_jaccard_neural.append(jaccard_sim_neural) result_row = { - "Incomplete_KG": path_of_an_incomplete_kgs.split('/')[-1], + "Incomplete_KG": 
path.split('/')[-1], "Expression": expression, "Type": type(parser.parse_expression(expression)).__name__, "Jaccard_EBR": jaccard_sim_neural, @@ -146,10 +148,10 @@ def execute(args): avg_jaccard_reasoners = {reasoner: sum(reasoner_jaccards[reasoner]) / len(reasoner_jaccards[reasoner]) for reasoner in reasoners} avg_time_reasoners = {reasoner: sum(reasoner_times[reasoner]) / len(reasoner_times[reasoner]) for reasoner in reasoners} - print(f"Average Jaccard neural ({path_of_an_incomplete_kgs}):", avg_jaccard_neural) + print(f"Average Jaccard neural ({path}):", avg_jaccard_neural) for reasoner, avg_jaccard in avg_jaccard_reasoners.items(): - print(f"Average Jaccard {reasoner} ({path_of_an_incomplete_kgs}):", avg_jaccard) - print(f"Average Runtime {reasoner} ({path_of_an_incomplete_kgs}):", avg_time_reasoners[reasoner]) + print(f"Average Jaccard {reasoner} ({path}):", avg_jaccard) + print(f"Average Runtime {reasoner} ({path}):", avg_time_reasoners[reasoner]) else: @@ -165,7 +167,7 @@ def execute(args): list_jaccard_neural.append(jaccard_sim_neural) result_row = { - "Incomplete_KG": path_of_an_incomplete_kgs.split('/')[-1], + "Subgraphs": path.split('/')[-1], "Expression": expression, "Type": type(parser.parse_expression(expression)).__name__, "Jaccard_EBR": jaccard_sim_neural, @@ -193,13 +195,13 @@ def execute(args): def get_default_arguments(): parser = ArgumentParser() - parser.add_argument("--path_kg", type=str, default="KGs/Family/father.owl") + parser.add_argument("--path_kg", type=str, default="KGs/Family/family-benchmark_rich_background.owl") parser.add_argument("--seed", type=int, default=1) parser.add_argument("--ratio_sample_nc", type=float, default=None, help="To sample OWL Classes.") parser.add_argument("--ratio_sample_object_prob", type=float, default=None, help="To sample OWL Object Properties.") parser.add_argument("--path_report", type=str, default="ALCQHI_Retrieval_Incomplete_Results.csv") - parser.add_argument("--number_of_incomplete_graphs", 
type=int, default=1) - parser.add_argument("--level_of_incompleteness", type=float, default=0.1, \ + parser.add_argument("--number_of_subgraphs", type=int, default=1) + parser.add_argument("--ratio", type=float, default=0.1, \ help="Percentage of incompleteness or inconsistency from the original KG between 0 and 1") parser.add_argument("--operation", type=str, default="incomplete", choices=["incomplete", "inconsistent"],\ help = "Choose to make the KB incomplete or inconsistent") From 77b441b0e85baa5b4d32bca6882546e26c184d8a Mon Sep 17 00:00:00 2001 From: Louis-Mozart Date: Wed, 9 Oct 2024 15:24:11 +0200 Subject: [PATCH 10/12] code refactoring --- examples/retrieval_eval.py | 14 +++++------ examples/retrieval_eval_under_incomplete.py | 28 ++++++++++++++++++--- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/examples/retrieval_eval.py b/examples/retrieval_eval.py index d3ef394c..82565169 100644 --- a/examples/retrieval_eval.py +++ b/examples/retrieval_eval.py @@ -140,11 +140,11 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]: # () Converted to list so that the progress bar works. concepts = list( chain( - nc, unions, intersections, nnc, #unions_unnc, intersections_unnc, - # exist_unnc, for_all_unnc, - # min_cardinality_unnc_1, min_cardinality_unnc_2, min_cardinality_unnc_3, - # max_cardinality_unnc_1, max_cardinality_unnc_2, max_cardinality_unnc_3, - # exist_nominals, + nc, unions, intersections, nnc, unions_unnc, intersections_unnc, + exist_unnc, for_all_unnc, + min_cardinality_unnc_1, min_cardinality_unnc_2, min_cardinality_unnc_3, + max_cardinality_unnc_1, max_cardinality_unnc_2, max_cardinality_unnc_3, + exist_nominals, ) ) # () Shuffled the data so that the progress bar is not influenced by the order of concepts. 
@@ -209,8 +209,8 @@ def get_default_arguments(): parser.add_argument("--endpoint_triple_store", type=str, default=None) parser.add_argument("--gamma", type=float, default=0.9) parser.add_argument("--seed", type=int, default=1) - parser.add_argument("--ratio_sample_nc", type=float, default=None, help="To sample OWL Classes.") - parser.add_argument("--ratio_sample_object_prob", type=float, default=None, help="To sample OWL Object Properties.") + parser.add_argument("--ratio_sample_nc", type=float, default=0.2, help="To sample OWL Classes.") + parser.add_argument("--ratio_sample_object_prob", type=float, default=0.1, help="To sample OWL Object Properties.") # H is obtained if the forward chain is applied on KG. parser.add_argument("--path_report", type=str, default="ALCQHI_Retrieval_Results.csv") return parser.parse_args() diff --git a/examples/retrieval_eval_under_incomplete.py b/examples/retrieval_eval_under_incomplete.py index a712bfa0..65656891 100644 --- a/examples/retrieval_eval_under_incomplete.py +++ b/examples/retrieval_eval_under_incomplete.py @@ -1,6 +1,5 @@ -""" -TODO: Write few lines of code to run this script and explanations -""" +"""python examples/retrieval_eval_under_incomplete.py""" + from argparse import ArgumentParser from ontolearn.knowledge_base import KnowledgeBase import pandas as pd @@ -23,6 +22,27 @@ # [] Create sub/incomplete KGs def generate_subgraphs(kb_path: str, directory: str, n: int, ratio: float, operation: str) -> Set[str]: + """ + Generates a specified number of paths of subgraphs (incomplete or noisy knowledge graphs) + by applying either the "incomplete" or "inconsistent" operation from the functions make_kb_incomplete and + make_kb_inconsistent to the given KB. + + Inputs: + --------------- + + kb_path (str): The path to the input KB file. + directory (str): The directory where the generated subgraphs will be stored. + n (int): The number of subgraphs to generate. 
+ ratio (float): The ratio of elements to modify within the KB (as a percentage). + operation (str): The type of operation to perform on the KB. Expected values are + "incomplete" or "inconsistent", which define the type of subgraph to generate. + + Output: + --------------- + + Set[str]: A set containing the file paths of all the generated subgraphs. + """ + name = kb_path.split('/')[-1].split('.')[0] rate = int(ratio * 100) @@ -81,7 +101,7 @@ def execute(args): data = [] if args.sample == "Yes": - subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path, "--ratio_sample_nc","0.2", "--ratio_sample_object_prob", "0.1", "--path_report", path_report]) + subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path, "--ratio_sample_nc","0.1", "--ratio_sample_object_prob", "0.2", "--path_report", path_report]) else: subprocess.run(['python', 'examples/retrieval_eval.py', "--path_kg", path, "--path_report", path_report]) From cfc0da95d82490d09a2620af6787e1d0fc82efbf Mon Sep 17 00:00:00 2001 From: Louis-Mozart Date: Tue, 15 Oct 2024 12:13:10 +0200 Subject: [PATCH 11/12] Refactoring --- README.md | 359 ++++++++++++++++---- examples/incomplete_kb.py | 320 ----------------- examples/retrieval_eval.py | 5 +- examples/retrieval_eval_under_incomplete.py | 16 +- ontolearn/incomplete_kb.py | 203 +++++++++++ run_multiple_carcinogenesis.sh | 22 -- 6 files changed, 512 insertions(+), 413 deletions(-) delete mode 100644 examples/incomplete_kb.py create mode 100644 ontolearn/incomplete_kb.py delete mode 100644 run_multiple_carcinogenesis.sh diff --git a/README.md b/README.md index bf0c3014..953f97f1 100644 --- a/README.md +++ b/README.md @@ -1,109 +1,334 @@ +[![Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://ontolearn-docs-dice-group.netlify.app/usage/09_further_resources#code-coverage) +[![Pypi](https://img.shields.io/badge/pypi-0.7.4-blue)](https://pypi.org/project/ontolearn/0.7.4/) 
+[![Docs](https://img.shields.io/badge/documentation-0.7.4-yellow)](https://ontolearn-docs-dice-group.netlify.app/usage/01_introduction)
+[![Python](https://img.shields.io/badge/python-3.10.13+-4584b6)](https://www.python.org/downloads/release/python-31013/)
+&nbsp;

-## Neural Description Logic Reasoning over incomplete Knowledge Base
+![Ontolearn](docs/_static/images/Ontolearn_logo.png)

-This repository provides the implementation of our reasoner EBR. With this repository, once can perform instance retrieval even within an incomplete and inconsistent knowldege base. EBR leverages KGE to perform reasoning over incomplete and inconsistent knowledge bases (KBs). We employ a neural link predictor to facilitate the retrieval of missing data and handle inconsistencies. 
+# Ontolearn: Learning OWL Class Expressions

-We based our implementation on [Ontolearn](https://github.com/dice-group/Ontolearn). We would like to thank for the readable codebase. 
+*Ontolearn* is an open-source software library for learning OWL class expressions at large scale. 
+
+Given positive and negative [OWL named individual](https://www.w3.org/TR/owl2-syntax/#Individuals) examples
+$E^+$ and $E^-$, the [OWL Class expression](https://www.w3.org/TR/owl2-syntax/#Class_Expressions) learning problem refers to the following supervised machine learning problem
+
+$$\forall p \in E^+\ \mathcal{K} \models H(p) \wedge \forall n \in E^-\ \mathcal{K} \not \models H(n).$$
+
+To tackle this supervised learning problem, Ontolearn offers many symbolic, neuro-symbolic and deep learning-based learning algorithms:
+- **Drill** → [Neuro-Symbolic Class Expression Learning](https://www.ijcai.org/proceedings/2023/0403.pdf)
+- **EvoLearner** → [EvoLearner: Learning Description Logics with Evolutionary Algorithms](https://dl.acm.org/doi/abs/10.1145/3485447.3511925)
+- **NCES2** → (soon) [Neural Class Expression Synthesis in ALCHIQ(D)](https://papers.dice-research.org/2023/ECML_NCES2/NCES2_public.pdf)
+- **NCES** → [Neural Class Expression Synthesis](https://link.springer.com/chapter/10.1007/978-3-031-33455-9_13)
+- **NERO** → (soon) [Learning Permutation-Invariant Embeddings for Description Logic Concepts](https://link.springer.com/chapter/10.1007/978-3-031-30047-9_9)
+- **CLIP** → [Learning Concept Lengths Accelerates Concept Learning in ALC](https://link.springer.com/chapter/10.1007/978-3-031-06981-9_14)
+- **CELOE** → [Class Expression Learning for Ontology Engineering](https://www.sciencedirect.com/science/article/abs/pii/S1570826811000023)
+- **OCEL** → A limited version of CELOE
+
+Find more in the [Documentation](https://ontolearn-docs-dice-group.netlify.app/usage/01_introduction).

 ## Installation

 ```shell
+pip install ontolearn
+```
+or
+```shell
+git clone https://github.com/dice-group/Ontolearn.git
 # To create a virtual python env with conda
-conda create -n venv python=3.10.14 --no-default-packages && conda activate venv && pip install -e . 
&& cd Ontolearn -# To download the benchmark datasets knowledge graphs -wget https://shorturl.at/0T4WJ -O ./KGs.zip && unzip KGs.zip +conda create -n venv python=3.10.14 --no-default-packages && conda activate venv && pip install -e . +# To download knowledge graphs +wget https://files.dice-research.org/projects/Ontolearn/KGs.zip -O ./KGs.zip && unzip KGs.zip +# To download learning problems +wget https://files.dice-research.org/projects/Ontolearn/LPs.zip -O ./LPs.zip && unzip LPs.zip ``` -Other datasets used in the paper can be found [here](https://shorturl.at/v28n0) -## Retrieval results from Table 2 +## Learning OWL Class Expression +```python +from ontolearn.learners import TDL +from ontolearn.triple_store import TripleStore +from ontolearn.knowledge_base import KnowledgeBase +from ontolearn.learning_problem import PosNegLPStandard +from owlapy.owl_individual import OWLNamedIndividual +from owlapy import owl_expression_to_sparql, owl_expression_to_dl +# (1) Initialize Triplestore or KnowledgeBase +# sudo docker run -p 3030:3030 -e ADMIN_PASSWORD=pw123 stain/jena-fuseki +# Login http://localhost:3030/#/ with admin and pw123 and upload KGs/Family/family.owl +# kb = TripleStore(url="http://localhost:3030/family") +kb = KnowledgeBase(path="KGs/Family/father.owl") +# (2) Initialize a learner. +model = TDL(knowledge_base=kb, use_nominals=True) +# (3) Define a description logic concept learning problem. +lp = PosNegLPStandard(pos={OWLNamedIndividual("http://example.com/father#stefan")}, + neg={OWLNamedIndividual("http://example.com/father#heinz"), + OWLNamedIndividual("http://example.com/father#anna"), + OWLNamedIndividual("http://example.com/father#michelle")}) +# (4) Learn description logic concepts best fitting (3). 
+h = model.fit(learning_problem=lp).best_hypotheses() +print(h) +print(owl_expression_to_dl(h)) +print(owl_expression_to_sparql(expression=h)) +""" +OWLObjectSomeValuesFrom(property=OWLObjectProperty(IRI('http://example.com/father#','hasChild')),filler=OWLObjectOneOf((OWLNamedIndividual(IRI('http://example.com/father#','markus')),))) + +∃ hasChild.{markus} + +SELECT + DISTINCT ?x WHERE { +?x ?s_1 . + FILTER ( ?s_1 IN ( + + ) ) + } +""" +print(model.classification_report) +""" +Classification Report: Negatives: -1 and Positives 1 + precision recall f1-score support + + Negative 1.00 1.00 1.00 3 + Positive 1.00 1.00 1.00 1 + + accuracy 1.00 4 + macro avg 1.00 1.00 1.00 4 +weighted avg 1.00 1.00 1.00 4 +""" +``` -To reproduce our results, run the commands below +## Learning OWL Class Expression over DBpedia +```python +from ontolearn.learners import TDL +from ontolearn.triple_store import TripleStore +from ontolearn.learning_problem import PosNegLPStandard +from owlapy.owl_individual import OWLNamedIndividual +from owlapy import owl_expression_to_sparql, owl_expression_to_dl +from ontolearn.utils.static_funcs import save_owl_class_expressions +# (1) Initialize Triplestore +kb = TripleStore(url="http://dice-dbpedia.cs.upb.de:9080/sparql") +# (3) Initialize a learner. +model = TDL(knowledge_base=kb) +# (4) Define a description logic concept learning problem. +lp = PosNegLPStandard(pos={OWLNamedIndividual("http://dbpedia.org/resource/Angela_Merkel")}, + neg={OWLNamedIndividual("http://dbpedia.org/resource/Barack_Obama")}) +# (5) Learn description logic concepts best fitting (4). 
+h = model.fit(learning_problem=lp).best_hypotheses() +print(h) +print(owl_expression_to_dl(h)) +print(owl_expression_to_sparql(expression=h)) +save_owl_class_expressions(expressions=h,path="owl_prediction") +``` -```shell -python examples/retrieval_eval.py --path_kg "KGs/Family/father.owl" -# Results of the Father dataset +Fore more please refer to the [examples](https://github.com/dice-group/Ontolearn/tree/develop/examples) folder. -python examples/retrieval_eval.py --path_kg "KGs/Family/family-benchmark_rich_background.owl" -# Results of the Family dataset +## ontolearn-webservice + +
Click me! + +Load an RDF knowledge graph +```shell +ontolearn-webservice --path_knowledge_base KGs/Mutagenesis/mutagenesis.owl +``` +or launch a Tentris instance https://github.com/dice-group/tentris over Mutagenesis. +```shell +ontolearn-webservice --endpoint_triple_store http://0.0.0.0:9080/sparql ``` +The below code trains DRILL with 6 randomly generated learning problems +provided that **path_to_pretrained_drill** does not lead to a directory containing pretrained DRILL. +Thereafter, trained DRILL is saved in the directory **path_to_pretrained_drill**. +Finally, trained DRILL will learn an OWL class expression. +```python +import json +import requests +with open(f"LPs/Mutagenesis/lps.json") as json_file: + learning_problems = json.load(json_file)["problems"] +for str_target_concept, examples in learning_problems.items(): + response = requests.get('http://0.0.0.0:8000/cel', + headers={'accept': 'application/json', 'Content-Type': 'application/json'}, + json={"pos": examples['positive_examples'], + "neg": examples['negative_examples'], + "model": "Drill", + "path_embeddings": "mutagenesis_embeddings/Keci_entity_embeddings.csv", + "path_to_pretrained_drill": "pretrained_drill", + # if pretrained_drill exists, upload, otherwise train one and save it there + "num_of_training_learning_problems": 2, + "num_of_target_concepts": 3, + "max_runtime": 60000, # seconds + "iter_bound": 1 # number of iterations/applied refinement opt. 
+ }) + print(response.json()) # {'Prediction': '∀ hasAtom.(¬Nitrogen-34)', 'F1': 0.7283582089552239, 'saved_prediction': 'Predictions.owl'} +``` +TDL (a more scalable learner) can also be used as follows +```python +import json +import requests +with open(f"LPs/Mutagenesis/lps.json") as json_file: + learning_problems = json.load(json_file)["problems"] +for str_target_concept, examples in learning_problems.items(): + response = requests.get('http://0.0.0.0:8000/cel', + headers={'accept': 'application/json', 'Content-Type': 'application/json'}, + json={"pos": examples['positive_examples'], + "neg": examples['negative_examples'], + "model": "TDL"}) + print(response.json()) +``` + + +
-For larger datasets, we have to sample the number of entities and relations. For the experiments to run fast, we need to select the type of instance we are interested from line 136-140 of this [file](examples/retrieval_eval.py). Below we only present how to get results on semnatic Bible but for other datasets can be obtain similarly by adding the corect path to the argument ```--path_kg```. +## Benchmark Results + +
To see the results ```shell -# results on the semnatic bible data +# To download learning problems. # Benchmark learners on the Family benchmark dataset with benchmark learning problems. +wget https://files.dice-research.org/projects/Ontolearn/LPs.zip -O ./LPs.zip && unzip LPs.zip +``` -python examples/retrieval_eval.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --seed 1 --ratio_sample_nc 1 --ratio_sample_object_prob 1 --path_report "ALCQI_semantic_seed_all_nc.csv" -# OWLClass expressions +### 10-Fold Cross Validation Family Benchmark Results -python examples/retrieval_eval.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --seed 1 --ratio_sample_nc .5 --ratio_sample_object_prob .5 --path_report "ALCQI_semantic_seed_1_ratio_0.5_unions.csv" -# OWLObjectUnionOf +Here we apply 10-fold cross validation technique on each benchmark learning problem with max runtime of 60 seconds to measure the training and testing performance of learners. +In the evaluation, from a given single learning problem (a set of positive and negative examples), a learner learns an OWL Class Expression (H) on a given 9 fold of positive and negative examples. +To compute the training performance, We compute F1-score of H train positive and negative examples. +To compute the test performance, we compute F1-score of H w.r.t. test positive and negative examples. + +```shell +# To download learning problems and benchmark learners on the Family benchmark dataset with benchmark learning problems. +python examples/concept_learning_cv_evaluation.py --kb ./KGs/Family/family-benchmark_rich_background.owl --lps ./LPs/Family/lps_difficult.json --path_of_nces_embeddings ./NCESData/family/embeddings/ConEx_entity_embeddings.csv --path_of_clip_embeddings ./CLIPData/family/embeddings/ConEx_entity_embeddings.csv --max_runtime 60 --report family_results.csv +``` +In the following python script, the results are summarized and the markdown displayed below generated. 
+```python +import pandas as pd +df=pd.read_csv("family_results.csv").groupby("LP").mean() +print(df[[col for col in df if col.startswith('Test-F1') or col.startswith('RT')]].to_markdown(floatfmt=".3f")) +``` +**Note that DRILL is untrained and we simply used accuracy driven heuristics to learn an OWL class expression.** -python examples/retrieval_eval.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --seed 1 --ratio_sample_nc 1 --ratio_sample_object_prob 1 --path_report "ALCQI_semantic_seed_1_interALCQI_semantic_seed_all_nc.csv" -# OWLObjectComplementOf +Below, we report the average test F1 score and the average runtimes of learners. -python examples/retrieval_eval.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --seed 1 --ratio_sample_nc .5 --ratio_sample_object_prob .5 --path_report "ALCQI_semantic_seed_1_ratio_0.5_inter.csv" -# OWLObjectIntersectionOf +| LP | Test-F1-OCEL | RT-OCEL | Test-F1-CELOE | RT-CELOE | Test-F1-Evo | RT-Evo | Test-F1-DRILL | RT-DRILL | Test-F1-TDL | RT-TDL | Test-F1-NCES | RT-NCES | Test-F1-CLIP | RT-CLIP | +|:------------------:|-------------:|--------:|--------------:|---------:|------------:|-------:|--------------:|---------:|------------:|-------:|-------------:|--------:|-------------:|--------:| +| Aunt | 0.614 | 13.697 | 0.855 | 13.697 | 0.978 | 5.278 | 0.811 | 60.351 | 0.956 | 0.118 | 0.812 | 1.168 | 0.855 | 14.059 | +| Cousin | 0.712 | 10.846 | 0.789 | 10.846 | 0.993 | 3.311 | 0.701 | 60.485 | 0.820 | 0.176 | 0.677 | 1.050 | 0.779 | 9.050 | +| Grandgranddaughter | 1.000 | 0.013 | 1.000 | 0.013 | 1.000 | 0.426 | 0.980 | 17.486 | 1.000 | 0.050 | 1.000 | 0.843 | 1.000 | 0.639 | +| Grandgrandfather | 1.000 | 0.897 | 1.000 | 0.897 | 1.000 | 0.404 | 0.947 | 55.728 | 0.947 | 0.059 | 0.927 | 0.902 | 1.000 | 0.746 | +| Grandgrandmother | 1.000 | 4.173 | 1.000 | 4.173 | 1.000 | 0.442 | 0.893 | 50.329 | 0.947 | 0.060 | 0.927 | 0.908 | 1.000 | 0.817 | +| Grandgrandson | 1.000 | 1.632 | 1.000 | 1.632 | 1.000 | 0.452 | 0.931 | 
60.358 | 0.911 | 0.070 | 0.911 | 1.050 | 1.000 | 0.939 | +| Uncle | 0.876 | 16.244 | 0.891 | 16.244 | 0.964 | 4.516 | 0.876 | 60.416 | 0.933 | 0.098 | 0.891 | 1.256 | 0.928 | 17.682 | -python examples/retrieval_eval.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --seed 1 --ratio_sample_nc .2 --ratio_sample_object_prob .2 --path_report "ALCQI_semantic_seed_1_ratio_02_exits.csv" -# OWLObjectSomeValuesFrom -python examples/retrieval_eval.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --seed 1 --ratio_sample_nc .2 --ratio_sample_object_prob .2 --path_report "ALCQI_semantic_seed_1_ratio_02_forall.csv" -# OWLObjectAllValuesFrom +| LP | Train-F1-OCEL | Train-F1-CELOE | Train-F1-Evo | Train-F1-DRILL | Train-F1-TDL | Train-F1-NCES | Train-F1-CLIP | +|:------------------:|--------------:|---------------:|-------------:|---------------:|-------------:|----------------:|----------------:| +| Aunt | 0.835 | 0.918 | 0.995 | 0.837 | 1.000 | 0.804 | 0.918 | +| Cousin | 0.746 | 0.796 | 1.000 | 0.732 | 1.000 | 0.681 | 0.798 | +| Grandgranddaughter | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | +| Grandgrandfather | 1.000 | 1.000 | 1.000 | 0.968 | 1.000 | 0.973 | 1.000 | +| Grandgrandmother | 1.000 | 1.000 | 1.000 | 0.975 | 1.000 | 0.939 | 1.000 | +| Grandgrandson | 1.000 | 1.000 | 1.000 | 0.962 | 1.000 | 0.927 | 1.000 | +| Uncle | 0.904 | 0.907 | 0.996 | 0.908 | 1.000 | 0.884 | 0.940 | -python examples/retrieval_eval.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --seed 1 --ratio_sample_nc .1 --ratio_sample_object_prob .1 --path_report "ALCQI_semantic_seed_1_ratio_02_min_card.csv" -# minimum cardinality restrictions, n = {1,2,3} -python examples/retrieval_eval.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --seed 1 --ratio_sample_nc .1 --ratio_sample_object_prob .1 --path_report "ALCQI_semantic_seed_1_ratio_02_max_card.csv" -# max cardinality restrictions, n = {1,2,3} +### 10-Fold Cross Validation Mutagenesis Benchmark Results +```shell +python 
examples/concept_learning_cv_evaluation.py --kb ./KGs/Mutagenesis/mutagenesis.owl --lps ./LPs/Mutagenesis/lps.json --path_of_nces_embeddings ./NCESData/mutagenesis/embeddings/ConEx_entity_embeddings.csv --path_of_clip_embeddings ./CLIPData/mutagenesis/embeddings/ConEx_entity_embeddings.csv --max_runtime 60 --report mutagenesis_results.csv ``` -## Results from Table 3 +| LP | Train-F1-OCEL | Test-F1-OCEL | RT-OCEL | Train-F1-CELOE | Test-F1-CELOE | RT-CELOE | Train-F1-Evo | Test-F1-Evo | RT-Evo | Train-F1-DRILL | Test-F1-DRILL | RT-DRILL | Train-F1-TDL | Test-F1-TDL | RT-TDL | Train-F1-NCES | Test-F1-NCES | RT-NCES | Train-F1-CLIP | Test-F1-CLIP | RT-CLIP | +|:---------|--------------:|-------------:|--------:|---------------:|--------------:|---------:|-------------:|------------:|-------:|---------------:|--------------:|---------:|-------------:|------------:|-------:|--------------:|-------------:|--------:|--------------:|-------------:|--------:| +| NotKnown | 0.916 | 0.918 | 60.705 | 0.916 | 0.918 | 60.705 | 0.975 | 0.970 | 51.870 | 0.809 | 0.804 | 60.140 | 1.000 | 0.852 | 13.569 | 0.717 | 0.718 | 3.784 | 0.916 | 0.918 | 26.312| + -To obtain the results From Table 3, run the following commands: +### 10-Fold Cross Validation Carcinogenesis Benchmark Results ```shell -python examples/retrieval_eval_under_incomplete.py --path_kg "KGs/Family/father.owl" --level_of_incompleteness 0.4 --operation "incomplete" --number_of_incomplete_graphs 5 -# Results of the Father dataset +python examples/concept_learning_cv_evaluation.py --kb ./KGs/Carcinogenesis/carcinogenesis.owl --lps ./LPs/Carcinogenesis/lps.json --path_of_nces_embeddings ./NCESData/carcinogenesis/embeddings/ConEx_entity_embeddings.csv --path_of_clip_embeddings ./CLIPData/carcinogenesis/embeddings/ConEx_entity_embeddings.csv --max_runtime 60 --report carcinogenesis_results.csv +``` +| LP | Train-F1-OCEL | Test-F1-OCEL | RT-OCEL | Train-F1-CELOE | Test-F1-CELOE | RT-CELOE | Train-F1-Evo | Test-F1-Evo | RT-Evo 
| Train-F1-DRILL | Test-F1-DRILL | RT-DRILL | Train-F1-TDL | Test-F1-TDL | RT-TDL | Train-F1-NCES | Test-F1-NCES | RT-NCES | Train-F1-CLIP | Test-F1-CLIP | RT-CLIP | +|:---------|--------------:|-------------:|--------:|---------------:|--------------:|---------:|-------------:|------------:|-------:|---------------:|--------------:|---------:|-------------:|------------:|-------:|--------------:|-------------:|--------:|--------------:|-------------:|--------:| +| NOTKNOWN | 0.737 | 0.711 | 62.048 | 0.740 | 0.701 | 62.048 | 0.822 | 0.628 | 64.508 | 0.740 | 0.707 | 60.120 | 1.000 | 0.616 | 5.196 | 0.705 | 0.704 | 4.157 | 0.740 | 0.701 | 48.475| -python examples/retrieval_eval_under_incomplete.py --path_kg "KGs/Family/family-benchmark_rich_background.owl" --level_of_incompleteness 0.4 --operation "incomplete" --number_of_incomplete_graphs 5 -# Results of the Family dataset -python examples/retrieval_eval_under_incomplete.py --path_kg "KGs/Semantic_bible/semantic_bible.owl" --level_of_incompleteness 0.4 --operation "incomplete" --number_of_incomplete_graphs 5 --sample Yes -# Results of the Semantic Bible dataset +
-python examples/retrieval_eval_under_incomplete.py --path_kg "KGs/Mutagenesis/mutagenesis.owl" --level_of_incompleteness 0.4 --operation "incomplete" --number_of_incomplete_graphs 5 --sample Yes -# Results of the Mutagenesis dataset +## Development -python examples/retrieval_eval_under_incomplete.py --path_kg "KGs/Mutagenesis/mutagenesis.owl" --level_of_incompleteness 0.4 --operation "incomplete" --number_of_incomplete_graphs 5 --sample Yes -# Results of the Carcinogenesis dataset -``` -To get the results with other ratio (0.1, 0.2, 0.6, 0.8, 0.9 etc...), just add it after the argument ```--level_of_incompleteness``` and run the same command. For results on inconcistencies, just change the argument ```--operation``` to "inconsistent" (this will not necessary make the KB inconsistent but will add noises in the data at the choosen level). See below for an example on the Father and Family datasets. -```shell -python examples/retrieval_eval_under_incomplete.py --path_kg "KGs/Family/father.owl" --level_of_incompleteness 0.4 --operation "inconsistent" --number_of_incomplete_graphs 5 -# Results of the Father dataset +
To see the results -python examples/retrieval_eval_under_incomplete.py --path_kg "KGs/Family/family-benchmark_rich_background.owl" --level_of_incompleteness 0.4 --operation "inconsistent" --number_of_incomplete_graphs 5 -# Results of the Family dataset -``` +Creating a feature branch **refactoring** from development branch -Or more simply, just create a bash file as shown [here](run_multiple_carcinogenesis.sh) for the carcinogenesis and execute it using +```shell +git branch refactoring develop +``` -```shell -chmod +x run_multiple_carcinogenesis.sh -``` -This will make the file executable then do +Each feature branch must be merged to develop branch. To this end, the tests must run without a problem: ```shell -./run_multiple_carcinogenesis.sh +# To download knowledge graphs +wget https://files.dice-research.org/projects/Ontolearn/KGs.zip -O ./KGs.zip && unzip KGs.zip +# To download learning problems +wget https://files.dice-research.org/projects/Ontolearn/LPs.zip -O ./LPs.zip && unzip LPs.zip +# Download weights for some model for few tests +wget https://files.dice-research.org/projects/NCES/NCES_Ontolearn_Data/NCESData.zip -O ./NCESData.zip && unzip NCESData.zip && rm NCESData.zip +wget https://files.dice-research.org/projects/Ontolearn/CLIP/CLIPData.zip && unzip CLIPData.zip && rm CLIPData.zip +pytest -p no:warnings -x # Running 76 tests takes ~ 17 mins ``` -This will run the carcinogenesis data with different level of inconsistencies. 
-## Example of Concepts retrieval results on Father dataset: -| | Expression | Type | Jaccard Similarity | F1 | Runtime Benefits | Runtime EBR | Symbolic Retrieval | EBR Retrieval | -|---|------------------------|--------------------------|--------------------|-----|-----------------------|-----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| 0 | female ⊓ male | OWLObjectIntersectionOf | 1.0 | 1.0 | 0.054 | 0.003 | set() | set() | -| 1 | ∃ hasChild.female | OWLObjectSomeValuesFrom | 1.0 | 1.0 | -0.001 | 0.001 | {'http://example.com/father#markus'} | {'http://example.com/father#markus'} | -| 2 | person ⊔ (¬person) | OWLObjectUnionOf | 1.0 | 1.0 | -0.003 | 0.003 | {'http://example.com/father#martin', 'http://example.com/father#stefan', 'http://example.com/father#markus', 'http://example.com/father#anna', 'http://example.com/father#michelle', 'http://example.com/father#heinz'} | {'http://example.com/father#martin', 'http://example.com/father#stefan', 'http://example.com/father#markus', 'http://example.com/father#anna', 'http://example.com/father#michelle', 'http://example.com/father#heinz'} | -| 3 | person ⊓ person | OWLObjectIntersectionOf | 1.0 | 1.0 | -0.002 | 0.002 | {'http://example.com/father#martin', 'http://example.com/father#stefan', 'http://example.com/father#markus', 'http://example.com/father#anna', 'http://example.com/father#michelle', 'http://example.com/father#heinz'} | {'http://example.com/father#martin', 'http://example.com/father#stefan', 'http://example.com/father#markus', 'http://example.com/father#anna', 'http://example.com/father#michelle', 'http://example.com/father#heinz'} | -| 4 | person ⊔ person | OWLObjectUnionOf | 1.0 | 1.0 | -0.002 | 0.002 | 
{'http://example.com/father#martin', 'http://example.com/father#stefan', 'http://example.com/father#markus', 'http://example.com/father#anna', 'http://example.com/father#michelle', 'http://example.com/father#heinz'} | {'http://example.com/father#martin', 'http://example.com/father#stefan', 'http://example.com/father#anna', 'http://example.com/father#markus', 'http://example.com/father#michelle', 'http://example.com/father#heinz'} | + +
+ +## References +Currently, we are working on our manuscript describing our framework. +If you find our work useful in your research, please consider citing the respective paper: +``` +# DRILL +@inproceedings{demir2023drill, + author = {Demir, Caglar and Ngomo, Axel-Cyrille Ngonga}, + booktitle = {The 32nd International Joint Conference on Artificial Intelligence, IJCAI 2023}, + title = {Neuro-Symbolic Class Expression Learning}, + url = {https://www.ijcai.org/proceedings/2023/0403.pdf}, + year={2023} +} + +# NCES2 +@inproceedings{kouagou2023nces2, +author={Kouagou, N'Dah Jean and Heindorf, Stefan and Demir, Caglar and Ngonga Ngomo, Axel-Cyrille}, +title={Neural Class Expression Synthesis in ALCHIQ(D)}, +url = {https://papers.dice-research.org/2023/ECML_NCES2/NCES2_public.pdf}, +booktitle={Machine Learning and Knowledge Discovery in Databases}, +year={2023}, +publisher={Springer Nature Switzerland}, +address="Cham" +} + +# NCES +@inproceedings{kouagou2023neural, + title={Neural class expression synthesis}, + author={Kouagou, N’Dah Jean and Heindorf, Stefan and Demir, Caglar and Ngonga Ngomo, Axel-Cyrille}, + booktitle={European Semantic Web Conference}, + pages={209--226}, + year={2023}, + publisher={Springer Nature Switzerland} +} + +# EvoLearner +@inproceedings{heindorf2022evolearner, + title={Evolearner: Learning description logics with evolutionary algorithms}, + author={Heindorf, Stefan and Bl{\"u}baum, Lukas and D{\"u}sterhus, Nick and Werner, Till and Golani, Varun Nandkumar and Demir, Caglar and Ngonga Ngomo, Axel-Cyrille}, + booktitle={Proceedings of the ACM Web Conference 2022}, + pages={818--828}, + year={2022} +} + + +# CLIP +@inproceedings{kouagou2022learning, + title={Learning Concept Lengths Accelerates Concept Learning in ALC}, + author={Kouagou, N’Dah Jean and Heindorf, Stefan and Demir, Caglar and Ngonga Ngomo, Axel-Cyrille}, + booktitle={European Semantic Web Conference}, + pages={236--252}, + year={2022}, + publisher={Springer Nature 
Switzerland} +} +``` + +In case you have any question, please contact: ```caglar.demir@upb.de``` or ```caglardemir8@gmail.com``` diff --git a/examples/incomplete_kb.py b/examples/incomplete_kb.py deleted file mode 100644 index 7dbf2e0f..00000000 --- a/examples/incomplete_kb.py +++ /dev/null @@ -1,320 +0,0 @@ -from owlready2 import * -import random - - - -# def make_kb_incomplete(kb_path, output_path, rate, seed): -# """ -# Makes the knowledge base incomplete by removing a certain percentage of statements (triples). - -# Inputs: -# --------------- - -# kb_path: Path to the input knowledge base. -# output_path: Path to save the modified (incomplete) knowledge base. -# rate: Percentage of statements to remove (0-100). -# seed: random seed for reproducibility. - -# Output: -# --------------- - -# Incomplete KB at level rate % -# """ - -# random.seed(seed) - -# # Load the ontology -# kb = get_ontology(kb_path).load() - -# # Get all individuals in the ontology -# all_individuals = list(kb.individuals()) - -# # Collect all triples (subject-predicate-object) related to the individuals -# all_triples = [] -# for individual in all_individuals: -# for prop in individual.get_properties(): -# for value in prop[individual]: -# all_triples.append((individual, prop, value)) - -# # Calculate the number of triples to remove based on the rate -# num_to_remove = int(len(all_triples) * (rate / 100)) - -# # Randomly select triples to remove -# triples_to_remove = random.sample(all_triples, num_to_remove) - -# print(len(triples_to_remove)) -# # exit(0) - -# # Remove the selected triples -# for subject, predicate, obj in triples_to_remove: - - - -# predicate[subject].remove(obj) - - - -# # Save the modified ontology to a new file -# kb.save(file=output_path, format="rdfxml") - - - - - -def make_kb_incomplete(kb_path, output_path, rate, seed): - """ - Makes the knowledge base incomplete by removing a certain percentage of individuals. 
- - - Inputs: - --------------- - - kb_path: Path to the input knowledge base. - output_path: Path to save the modified (incomplete) knowledge base. - rate: Percentage of individuals to remove (0-100). - seed: random seed for reproducibility. - - Output: - --------------- - - Incomplete KB at level rate % - """ - - random.seed(seed) - - # Load the ontology - kb = get_ontology(kb_path).load() - - # Get all individuals (instances) in the ABox - all_individuals = list(kb.individuals()) - - # Calculate the number of individuals to remove based on the rate - num_to_remove = int(len(all_individuals) * (rate / 100)) - - # Randomly select individuals to remove - individuals_to_remove = random.sample(all_individuals, num_to_remove) - - # Remove the selected individuals - for individual in individuals_to_remove: - destroy_entity(individual) - - # Save the modified ontology to a new file - kb.save(file=output_path, format="rdfxml") - - -# seed = 1 -# rate = 10 -# kb_path = "KGs/Family/father.owl" -# output_path = f"incomplete_father_{rate}.owl" - - -def make_kb_inconsistent(kb_path, output_path, rate, seed, max_attempts=100): - """ - This function makes the knowledge base (KB) inconsistent by introducing incorrect statements. - - Parameters: - kb_path (str): Path to the original OWL ontology file. - output_path (str): Path to save the inconsistent ontology file. - rate (float): Percentage of incorrect statements to introduce (0-100). - seed (int): Seed for reproducibility. - max_attempts (int): Maximum attempts to find a valid incorrect statement. 
- """ - - # Set the random seed for reproducibility - random.seed(seed) - - # Load the ontology - onto = get_ontology(kb_path).load() - - # Get all individuals, classes, and properties - all_individuals = list(onto.individuals()) - all_classes = list(onto.classes()) - all_object_properties = list(onto.object_properties()) - all_data_properties = list(onto.data_properties()) - - def count_triples(): - """Count the number of triples (statements) in the ontology.""" - return len(list(onto.world.sparql(""" - SELECT ?s ?p ?o - WHERE { - ?s ?p ?o . - } - """))) - - def generate_incorrect_class_assertion(individual): - """Generate an incorrect class assertion by adding a disjoint or contradictory class.""" - class_candidates = [cls for cls in all_classes if cls not in individual.is_a] - if not class_candidates: - return None - - selected_class = random.choice(class_candidates) - individual.is_a.append(selected_class) - print(f"Added incorrect class assertion: {individual} rdf:type {selected_class}") - return f"Added incorrect class assertion: {individual} rdf:type {selected_class}" - - def generate_incorrect_object_property(individual): - """Generate an incorrect object property assertion.""" - prop = random.choice(all_object_properties) - incorrect_object = random.choice(all_individuals) - - if incorrect_object not in prop[individual]: - prop[individual].append(incorrect_object) - print(f"Added incorrect object property assertion: {individual} {prop.name} {incorrect_object}") - return f"Added incorrect object property assertion: {individual} {prop.name} {incorrect_object}" - - def generate_incorrect_data_property(individual): - - """Generate an incorrect data property assertion (if exist in the KB).""" - if len(all_data_properties) != 0: - prop = random.choice(all_data_properties) - incorrect_value = "inconsistent_value" # Example of an incorrect data value - - if incorrect_value not in prop[individual]: - setattr(individual, prop.name, incorrect_value) - print(f"Added 
incorrect data property assertion: {individual} {prop.name} {incorrect_value}") - return f"Added incorrect data property assertion: {individual} {prop.name} {incorrect_value}" - - - - def insert_incorrect_statements(): - """Insert incorrect statements based on the specified rate.""" - num_triples = count_triples() # Use the total number of triples in the KB - num_incorrect = int(num_triples * (rate / 100)) - - # print(num_triples) - - incorrect_statements = [] - - for _ in range(num_incorrect): - attempts = 0 - while attempts < max_attempts: - individual = random.choice(all_individuals) - statement_type = random.choice(['class', 'object_property']) #could also add data properties later on - - if statement_type == 'class': - result = generate_incorrect_class_assertion(individual) - elif statement_type == 'object_property': - result = generate_incorrect_object_property(individual) - - - if result: - incorrect_statements.append(result) - break - - attempts += 1 - - # print(len(incorrect_statements)) - # exit(0) - - - return incorrect_statements - - # Insert incorrect statements - inconsistencies = insert_incorrect_statements() - - # Save the modified ontology - onto.save(file=output_path, format="rdfxml") - - # Return the list of inconsistencies added (for logging or debugging purposes) - return inconsistencies - - -# def make_kb_inconsistent(kb_path, output_path, rate, seed, max_attempts=100): - """ - This function makes the knowledge base (KB) inconsistent by introducing incorrect statements. - - Parameters: - kb_path (str): Path to the original OWL ontology file. - output_path (str): Path to save the inconsistent ontology file. - rate (float): Percentage of incorrect statements to introduce (0-100). - seed (int): Seed for reproducibility. - max_attempts (int): Maximum attempts to find a valid incorrect statement. 
- """ - - # Set the random seed for reproducibility - random.seed(seed) - - # Load the ontology - onto = get_ontology(kb_path).load() - - # Get all individuals, classes, and properties - all_individuals = list(onto.individuals()) - all_classes = list(onto.classes()) - all_object_properties = list(onto.object_properties()) - all_data_properties = list(onto.data_properties()) - - def generate_incorrect_class_assertion(individual): - """Generate an incorrect class assertion by adding a disjoint or contradictory class.""" - class_candidates = [cls for cls in all_classes if cls not in individual.is_a] - if not class_candidates: - return None - - selected_class = random.choice(class_candidates) - individual.is_a.append(selected_class) - print(f"Added incorrect class assertion: {individual} rdf:type {selected_class}") - return f"Added incorrect class assertion: {individual} rdf:type {selected_class}" - - def generate_incorrect_object_property(individual): - """Generate an incorrect object property assertion.""" - prop = random.choice(all_object_properties) - incorrect_object = random.choice(all_individuals) - - if incorrect_object not in prop[individual]: - prop[individual].append(incorrect_object) - print(f"Added incorrect object property assertion: {individual} {prop.name} {incorrect_object}") - return f"Added incorrect object property assertion: {individual} {prop.name} {incorrect_object}" - - def generate_incorrect_data_property(individual): - - """Generate an incorrect data property assertion (if exist in the KB).""" - if len(all_data_properties) != 0: - prop = random.choice(all_data_properties) - incorrect_value = "inconsistent_value" # Example of an incorrect data value - - if incorrect_value not in prop[individual]: - setattr(individual, prop.name, incorrect_value) - print(f"Added incorrect data property assertion: {individual} {prop.name} {incorrect_value}") - return f"Added incorrect data property assertion: {individual} {prop.name} {incorrect_value}" - - - - 
def insert_incorrect_statements(): - """Insert incorrect statements based on the specified rate.""" - num_statements = len(list(onto.individuals())) # Approximation for total statements - num_incorrect = int(num_statements * (rate / 100)) - - # print(num_incorrect) - # exit(0) - - incorrect_statements = [] - - for _ in range(num_incorrect): - attempts = 0 - while attempts < max_attempts: - individual = random.choice(all_individuals) - statement_type = random.choice(['class', 'object_property', 'data_property']) - - if statement_type == 'class': - result = generate_incorrect_class_assertion(individual) - elif statement_type == 'object_property': - result = generate_incorrect_object_property(individual) - elif statement_type == 'data_property': - result = generate_incorrect_data_property(individual) - - if result: - incorrect_statements.append(result) - break - - attempts += 1 - print(len(incorrect_statements)) - exit(0) - return incorrect_statements - - # Insert incorrect statements - inconsistencies = insert_incorrect_statements() - - # Save the modified ontology - onto.save(file=output_path, format="rdfxml") - - return inconsistencies - diff --git a/examples/retrieval_eval.py b/examples/retrieval_eval.py index 82565169..a757a791 100644 --- a/examples/retrieval_eval.py +++ b/examples/retrieval_eval.py @@ -62,6 +62,8 @@ def execute(args): object_properties_and_inverse = object_properties.union(object_properties_inverse) # (6) NC: Named owl concepts. nc = sorted({i for i in symbolic_kb.get_concepts()}) + + if args.ratio_sample_nc: @@ -182,7 +184,7 @@ def concept_retrieval(retriever_func, c) -> Tuple[Set[str], float]: ) # () Read the data into pandas dataframe df = pd.DataFrame(data) - # assert df["Jaccard Similarity"].mean() == 1.0 + assert df["Jaccard Similarity"].mean() >= args.min_jaccard_similarity # () Save the experimental results into csv file. 
     df.to_csv(args.path_report)
     del df
@@ -211,6 +213,7 @@ def get_default_arguments():
     parser.add_argument("--seed", type=int, default=1)
     parser.add_argument("--ratio_sample_nc", type=float, default=0.2, help="To sample OWL Classes.")
     parser.add_argument("--ratio_sample_object_prob", type=float, default=0.1, help="To sample OWL Object Properties.")
+    parser.add_argument("--min_jaccard_similarity", type=float, default=0.0, help="Minimum Jaccard similarity to be achieved by the reasoner")
     # H is obtained if the forward chain is applied on KG.
     parser.add_argument("--path_report", type=str, default="ALCQHI_Retrieval_Results.csv")
     return parser.parse_args()
diff --git a/examples/retrieval_eval_under_incomplete.py b/examples/retrieval_eval_under_incomplete.py
index 65656891..2a1d544e 100644
--- a/examples/retrieval_eval_under_incomplete.py
+++ b/examples/retrieval_eval_under_incomplete.py
@@ -1,11 +1,21 @@
-"""python examples/retrieval_eval_under_incomplete.py"""
+"""The goal of this script is to perform a retrieval task on an inconsistent or incomplete KB.
+    Given an input KB, we first generate a number of subgraphs that are either incomplete
+    or inconsistent. Each subgraph is then evaluated by running a retrieval task, using
+    a neural method or different symbolic reasoners (HermiT, Pellet, JFact, and Openllet).
+    For each subgraph, the script computes and records Jaccard similarity scores between
+    the retrieval results of each reasoner and the expected goal, as well as their runtime.
+    The result is then saved as a CSV file for further investigation.
+ + To run the script: python examples/retrieval_eval_under_incomplete.py""" + + from argparse import ArgumentParser from ontolearn.knowledge_base import KnowledgeBase import pandas as pd from typing import Set import time -from incomplete_kb import make_kb_incomplete, make_kb_inconsistent +from ontolearn.incomplete_kb import make_kb_incomplete, make_kb_inconsistent import os from ontolearn.utils import jaccard_similarity import subprocess @@ -19,7 +29,7 @@ import pandas as pd -# [] Create sub/incomplete KGs +# Create incomplete/noisy KGs def generate_subgraphs(kb_path: str, directory: str, n: int, ratio: float, operation: str) -> Set[str]: """ diff --git a/ontolearn/incomplete_kb.py b/ontolearn/incomplete_kb.py new file mode 100644 index 00000000..4a4574f0 --- /dev/null +++ b/ontolearn/incomplete_kb.py @@ -0,0 +1,203 @@ +from owlready2 import * +import random +from typing import Set + + +def make_kb_incomplete_ass(kb_path, output_path, rate, seed): + """ + Makes the knowledge base incomplete by removing a certain percentage of statements (triples). + + Inputs: + --------------- + + kb_path: Path to the input knowledge base. + output_path: Path to save the modified (incomplete) knowledge base. + rate: Percentage of statements to remove (0-100). + seed: random seed for reproducibility. 
+ + Output: + --------------- + + Incomplete KB at level rate % + """ + + random.seed(seed) + + # Load the ontology + kb = get_ontology(kb_path).load() + + # Get all individuals in the ontology + all_individuals = list(kb.individuals()) + + # Collect all triples (subject-predicate-object) related to the individuals + all_triples = [] + for individual in all_individuals: + for prop in individual.get_properties(): + for value in prop[individual]: + all_triples.append((individual, prop, value)) + + # Calculate the number of triples to remove based on the rate + num_to_remove = int(len(all_triples) * (rate / 100)) + + # Randomly select triples to remove + triples_to_remove = random.sample(all_triples, num_to_remove) + + # Remove the selected triples + for subject, predicate, obj in triples_to_remove: + + predicate[subject].remove(obj) + + # Save the modified ontology to a new file + kb.save(file=output_path, format="rdfxml") + + + + + +def make_kb_incomplete(kb_path, output_path, rate, seed)-> Set[str]: + """ + Makes the knowledge base incomplete by removing a certain percentage of individuals. + + + Inputs: + --------------- + + kb_path: Path to the input knowledge base. + output_path: Path to save the modified (incomplete) knowledge base. + rate: Percentage of individuals to remove (0-100). + seed: random seed for reproducibility. 
+ + Output: + --------------- + + Incomplete KB at level rate % + """ + + random.seed(seed) + + # Load the ontology + kb = get_ontology(kb_path).load() + + # Get all individuals (instances) in the ABox + all_individuals = list(kb.individuals()) + + # Calculate the number of individuals to remove based on the rate + num_to_remove = int(len(all_individuals) * (rate / 100)) + + # Randomly select individuals to remove + individuals_to_remove = random.sample(all_individuals, num_to_remove) + + # Remove the selected individuals + for individual in individuals_to_remove: + destroy_entity(individual) + + # Save the modified ontology to a new file + kb.save(file=output_path, format="rdfxml") + + +def make_kb_inconsistent(kb_path, output_path, rate, seed, max_attempts=100): + """ + This function makes the knowledge base (KB) inconsistent by introducing incorrect statements. + + Parameters: + kb_path (str): Path to the original OWL ontology file. + output_path (str): Path to save the inconsistent ontology file. + rate (float): Percentage of incorrect statements to introduce (0-100). + seed (int): Seed for reproducibility. + max_attempts (int): Maximum attempts to find a valid incorrect statement. + """ + + # Set the random seed for reproducibility + random.seed(seed) + + # Load the ontology + onto = get_ontology(kb_path).load() + + # Get all individuals, classes, and properties + all_individuals = list(onto.individuals()) + all_classes = list(onto.classes()) + all_object_properties = list(onto.object_properties()) + all_data_properties = list(onto.data_properties()) + + def count_triples(): + """Count the number of triples (statements) in the ontology.""" + return len(list(onto.world.sparql(""" + SELECT ?s ?p ?o + WHERE { + ?s ?p ?o . 
+ } + """))) + + def generate_incorrect_class_assertion(individual): + """Generate an incorrect class assertion by adding a disjoint or contradictory class.""" + class_candidates = [cls for cls in all_classes if cls not in individual.is_a] + if not class_candidates: + return None + + selected_class = random.choice(class_candidates) + individual.is_a.append(selected_class) + print(f"Added incorrect class assertion: {individual} rdf:type {selected_class}") + return f"Added incorrect class assertion: {individual} rdf:type {selected_class}" + + def generate_incorrect_object_property(individual): + """Generate an incorrect object property assertion.""" + prop = random.choice(all_object_properties) + incorrect_object = random.choice(all_individuals) + + if incorrect_object not in prop[individual]: + prop[individual].append(incorrect_object) + print(f"Added incorrect object property assertion: {individual} {prop.name} {incorrect_object}") + return f"Added incorrect object property assertion: {individual} {prop.name} {incorrect_object}" + + def generate_incorrect_data_property(individual): + + """Generate an incorrect data property assertion (if exist in the KB).""" + if len(all_data_properties) != 0: + prop = random.choice(all_data_properties) + incorrect_value = "inconsistent_value" # Example of an incorrect data value + + if incorrect_value not in prop[individual]: + setattr(individual, prop.name, incorrect_value) + print(f"Added incorrect data property assertion: {individual} {prop.name} {incorrect_value}") + return f"Added incorrect data property assertion: {individual} {prop.name} {incorrect_value}" + + + + def insert_incorrect_statements(): + """Insert incorrect statements based on the specified rate.""" + num_triples = count_triples() # Use the total number of triples in the KB + num_incorrect = int(num_triples * (rate / 100)) + + incorrect_statements = [] + + for _ in range(num_incorrect): + attempts = 0 + while attempts < max_attempts: + individual = 
random.choice(all_individuals) + statement_type = random.choice(['class', 'object_property']) #could also add data properties later on + + if statement_type == 'class': + result = generate_incorrect_class_assertion(individual) + elif statement_type == 'object_property': + result = generate_incorrect_object_property(individual) + + + if result: + incorrect_statements.append(result) + break + + attempts += 1 + + return incorrect_statements + + # Insert incorrect statements + inconsistencies = insert_incorrect_statements() + + # Save the modified ontology + onto.save(file=output_path, format="rdfxml") + + # Return the list of inconsistencies added + return inconsistencies + + + \ No newline at end of file diff --git a/run_multiple_carcinogenesis.sh b/run_multiple_carcinogenesis.sh deleted file mode 100644 index f7ce02df..00000000 --- a/run_multiple_carcinogenesis.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -#You can directly choose the needed operation inside the file examples/retrieval_eval_under_incomplete.py. - -# Define the path to your Python script -PYTHON_SCRIPT="examples/retrieval_eval_under_incomplete.py" -path_kg="KGs/Carcinogenesis/carcinogenesis.owl" - -# Define the number of incomplete graphs -NUMBER_OF_INCOMPLETE_GRAPHS=5 - -# Define the list of levels of incompleteness -LEVELS_OF_INCOMPLETENESS=("0.4" "0.8" "0.9") -# LEVELS_OF_INCOMPLETENESS=("0.1") - -# Iterate over each level of incompleteness -for LEVEL in "${LEVELS_OF_INCOMPLETENESS[@]}"; do - echo "Running with level_of_incompleteness=$LEVEL..." - python $PYTHON_SCRIPT --number_of_incomplete_graphs $NUMBER_OF_INCOMPLETE_GRAPHS --level_of_incompleteness $LEVEL --path_kg $path_kg - echo "Completed with level_of_incompleteness=$LEVEL." -done - -echo "All tasks completed." 
From f09079f00013763166c700e84060208333d622c7 Mon Sep 17 00:00:00 2001 From: Luke Friedrichs Date: Fri, 18 Oct 2024 10:03:06 +0200 Subject: [PATCH 12/12] refactoring of f1_set_similarity function --- ontolearn/utils/static_funcs.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ontolearn/utils/static_funcs.py b/ontolearn/utils/static_funcs.py index b48e1c55..f0832c17 100644 --- a/ontolearn/utils/static_funcs.py +++ b/ontolearn/utils/static_funcs.py @@ -65,8 +65,13 @@ def f1_set_similarity(y: Set[str], yhat: Set[str]) -> float: if len(yhat) == 0 or len(y) == 0: return 0.0 - precision = len(y.intersection(yhat)) / len(y) - recall = len(y.intersection(yhat)) / len(yhat) + tp = len(y.intersection(yhat)) + fp = len(yhat.difference(y)) + fn = len(y.difference(yhat)) + + precision = tp / (tp + fp) + recall = tp / (tp + fn) + if precision == 0 and recall == 0: return 0.0