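"""Fetch entity URIs from DBpedia or Wikidata over SPARQL and append them to a
seed file, resuming from the last URI already saved so that repeated runs pick
up where the previous one stopped."""
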
import argparse
import logging
import os
import time

from SPARQLWrapper import SPARQLWrapper, JSON

# Configure logging
logging.basicConfig(
    level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Argument parsing; nargs="?" makes the positional optional so the default applies
parser = argparse.ArgumentParser(description="Fetch data from DBpedia or Wikidata.")
parser.add_argument(
    "dataset",
    type=str,
    nargs="?",
    default="wikidata",
    choices=["dbpedia", "wikidata"],
    help="The dataset to fetch data from ('dbpedia' or 'wikidata').",
)
args = parser.parse_args()
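
# Example invocations (the dataset argument defaults to "wikidata"):
#   python fetch_seed_data.py dbpedia
#   python fetch_seed_data.py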
# Setting dataset-specific parameters based on user input
if args.dataset == "dbpedia":
    sparql_endpoint = "http://dbpedia.org/sparql"
    seed_file_path = "seed_dbpedia.txt"
    dataset_prefixes = """
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    """
elif args.dataset == "wikidata":
    sparql_endpoint = "https://query.wikidata.org/sparql"
    seed_file_path = "seed_wikidata.txt"
    dataset_prefixes = """
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wikibase: <http://wikiba.se/ontology#>
    PREFIX bd: <http://www.bigdata.com/rdf#>
    """

# Read the last URI from the seed file and count the existing entries
def read_last_uri_and_count_entries(file_path):
    # If the file does not exist, create it and return initial values
    if not os.path.exists(file_path):
        with open(file_path, "w", encoding="utf-8"):
            pass  # Just create the file, no need to write anything
        return None, 0  # No last URI yet, zero existing entries

    # If the file exists, scan it for the last line and the entry count
    count = 0
    last_uri = None
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            last_uri = line.strip()
            count += 1
    return last_uri, count

# Safe query execution with retries
def safe_query(sparql, retries=3, delay=5):
    for attempt in range(retries):
        try:
            return sparql.query().convert()
        except Exception as e:
            logging.error(f"Query failed on attempt {attempt + 1}: {e}")
            if attempt < retries - 1:
                logging.info(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                raise

# Build the dataset-specific SPARQL query, resuming after last_uri when given
def get_query(dataset, last_uri=None, limit=10000):
    filter_clause = ""
    if last_uri:
        filter_clause = f'FILTER (STR(?entity) > "{last_uri}")'
    if dataset == "dbpedia":
        return f"""
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        SELECT DISTINCT ?entity WHERE {{
            ?class a owl:Class .
            FILTER(STRSTARTS(STR(?class), STR(dbo:)))
            ?entity rdf:type ?class .
            ?entity rdfs:label ?label .
            FILTER(LANG(?label) = "en")
            {filter_clause}
        }}
        ORDER BY ?entity
        LIMIT {limit}
        """
    elif dataset == "wikidata":
        return f"""
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        PREFIX wikibase: <http://wikiba.se/ontology#>
        PREFIX bd: <http://www.bigdata.com/rdf#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        SELECT DISTINCT ?entity WHERE {{
            ?class wdt:P31 wd:Q16889133 .  # Items that are instances of "class"
            ?subclass wdt:P279* ?class .   # Subclasses of those classes
            ?entity wdt:P31 ?subclass .    # Instances of the subclasses
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
            FILTER EXISTS {{ ?entity rdfs:label ?label FILTER(LANG(?label) = "en") }}
            {filter_clause}
        }}
        ORDER BY ?entity
        LIMIT {limit}
        """

# Fetch entities in batches and append them to the seed file
def fetch_entities_and_save(
    dataset, endpoint, last_fetched_uri, filename, initial_count=0
):
    sparql = SPARQLWrapper(endpoint)
    sparql.setReturnFormat(JSON)
    last_uri = last_fetched_uri
    appended_entities = initial_count
    while True:
        query = get_query(dataset, last_uri)
        sparql.setQuery(query)
        results = safe_query(sparql)  # Use safe_query for error handling
        if not results or "bindings" not in results.get("results", {}):
            break
        batch_entities = [
            result["entity"]["value"] for result in results["results"]["bindings"]
        ]
        if not batch_entities:
            break  # Exit the loop once no more results are returned
        with open(filename, "a", encoding="utf-8") as file:
            for entity in batch_entities:
                file.write(entity + "\n")
        # Keep the full URI: the pagination FILTER compares complete entity
        # URIs, so truncating to the last path segment would break resumption
        last_uri = batch_entities[-1]
        appended_entities += len(batch_entities)
        logging.debug(
            f"Appended {len(batch_entities)} entities. "
            f"Total entries now: {appended_entities}."
        )

# Read the last URI from the seed file and count the existing entries
last_fetched_uri, initial_entry_count = read_last_uri_and_count_entries(seed_file_path)
logging.info(
    f"Starting with {initial_entry_count} entries already in {seed_file_path}."
)
try:
    fetch_entities_and_save(
        args.dataset, sparql_endpoint, last_fetched_uri, seed_file_path,
        initial_entry_count,
    )
    logging.info("Finished fetching and saving all entities.")
except Exception:
    logging.error("An error occurred", exc_info=True)