Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow unbound variables in compliance with the SPARQL standard #1586

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/ServerMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,12 @@ int main(int argc, char** argv) {
optionFactory.getProgramOption<"service-max-value-rows">(),
"The maximal number of result rows to be passed to a SERVICE operation "
"as a VALUES clause to optimize its computation.");
add("throw-on-unbound-variables",
optionFactory.getProgramOption<"throw-on-unbound-variables">(),
"If set to true, the queries that use GROUP BY, BIND, or ORDER BY with "
"variables that are unbound in the query throw an exception. These "
"queries technically are allowed by the SPARQL standard, but typically "
"are the result of typos and unintended by the user");
po::variables_map optionsMap;

try {
Expand Down
41 changes: 30 additions & 11 deletions src/engine/GroupBy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,23 @@

using groupBy::detail::VectorOfAggregationData;

// _______________________________________________________________________________________________
// _____________________________________________________________________________
GroupBy::GroupBy(QueryExecutionContext* qec, vector<Variable> groupByVariables,
std::vector<Alias> aliases,
std::shared_ptr<QueryExecutionTree> subtree)
: Operation{qec},
_groupByVariables{std::move(groupByVariables)},
_aliases{std::move(aliases)} {
// Sort the groupByVariables to ensure that the cache key is order
// invariant.
// Note: It is tempting to do the same also for the aliases, but that would
AD_CORRECTNESS_CHECK(subtree != nullptr);
// Remove all undefined GROUP BY variables (according to the SPARQL standard
// they are allowed, but have no effect on the result).
std::erase_if(_groupByVariables,
[&map = subtree->getVariableColumns()](const auto& var) {
return !map.contains(var);
});
// Sort `groupByVariables` to ensure that the cache key is order invariant.
//
// NOTE: It is tempting to do the same also for the aliases, but that would
// break the case when an alias reuses a variable that was bound by a previous
// alias.
std::ranges::sort(_groupByVariables, std::less<>{}, &Variable::name);
Expand Down Expand Up @@ -601,11 +608,12 @@ std::optional<IdTable> GroupBy::computeGroupByForSingleIndexScan() const {

IdTable table{1, getExecutionContext()->getAllocator()};
table.emplace_back();
// For `IndexScan`s with at least two variables the size estimates are
// exact as they are read directly from the index metadata.
if (indexScan->getResultWidth() == 3) {
const auto& var = varAndDistinctness.value().variable_;
if (!isVariableBoundInSubtree(var)) {
// The variable is never bound, so its count is zero.
table(0, 0) = Id::makeFromInt(0);
} else if (indexScan->getResultWidth() == 3) {
if (countIsDistinct) {
const auto& var = varAndDistinctness.value().variable_;
auto permutation =
getPermutationForThreeVariableTriple(*_subtree, var, var);
AD_CONTRACT_CHECK(permutation.has_value());
Expand All @@ -615,8 +623,6 @@ std::optional<IdTable> GroupBy::computeGroupByForSingleIndexScan() const {
table(0, 0) = Id::makeFromInt(getIndex().numTriples().normal);
}
} else {
// TODO<joka921> The two variables IndexScans should also account for the
// additionally added triples.
table(0, 0) = Id::makeFromInt(indexScan->getExactSize());
}
return table;
Expand Down Expand Up @@ -692,15 +698,19 @@ std::optional<IdTable> GroupBy::computeGroupByForFullIndexScan() const {
// Check that all the aliases are non-distinct counts. We currently support
// only one or no such count. Redundant additional counts will lead to an
// exception (it is easy to reformulate the query to trigger this
// optimization).
// optimization). Also keep track of whether the counted variable is actually
// bound by the index scan (else all counts will be 0).
size_t numCounts = 0;
bool variableIsBoundInSubtree = true;
for (size_t i = 0; i < _aliases.size(); ++i) {
const auto& alias = _aliases[i];
if (auto count = alias._expression.getVariableForCount()) {
if (count.value().isDistinct_) {
return std::nullopt;
}
numCounts++;
variableIsBoundInSubtree =
isVariableBoundInSubtree(count.value().variable_);
} else {
return std::nullopt;
}
Expand All @@ -722,6 +732,10 @@ std::optional<IdTable> GroupBy::computeGroupByForFullIndexScan() const {
cancellationHandle_, locatedTriplesSnapshot());
if (numCounts == 0) {
table.setColumnSubset({{0}});
} else if (!variableIsBoundInSubtree) {
// The variable inside the COUNT() is not part of the input, so it is always
// unbound and has a count of 0 in each group.
std::ranges::fill(table.getColumn(1), Id::makeFromInt(0));
}

// TODO<joka921> This optimization should probably also apply if
Expand Down Expand Up @@ -1567,3 +1581,8 @@ GroupBy::getVariableForCountOfSingleAlias() const {
? _aliases.front()._expression.getVariableForCount()
: std::nullopt;
}

// _____________________________________________________________________________
bool GroupBy::isVariableBoundInSubtree(const Variable& variable) const {
return _subtree->getVariableColumnOrNullopt(variable).has_value();
}
4 changes: 4 additions & 0 deletions src/engine/GroupBy.h
Original file line number Diff line number Diff line change
Expand Up @@ -572,6 +572,10 @@ class GroupBy : public Operation {
sparqlExpression::SparqlExpressionPimpl::VariableAndDistinctness>
getVariableForCountOfSingleAlias() const;

// Return true if the `variable` is part of the result of the subtree of this
// GROUP BY. This is used by some of the optimizations above.
bool isVariableBoundInSubtree(const Variable& variable) const;

// TODO<joka921> implement optimization when *additional* Variables are
// grouped.

Expand Down
12 changes: 9 additions & 3 deletions src/engine/Operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,12 @@
// recursively collect all Warnings generated by all descendants
vector<string> collectWarnings() const;

// Add a warning to the `Operation`. The warning will be returned by
// `collectWarnings()` above.
void addWarning(std::string warning) {
warnings_.push_back(std::move(warning));
}

/**
* @return A list of columns on which the result of this operation is sorted.
*/
Expand Down Expand Up @@ -243,9 +249,9 @@
[[nodiscard]] virtual vector<ColumnIndex> resultSortedOn() const = 0;

/// interface to the generated warnings of this operation
std::vector<std::string>& getWarnings() { return _warnings; }
std::vector<std::string>& getWarnings() { return warnings_; }

Check warning on line 252 in src/engine/Operation.h

View check run for this annotation

Codecov / codecov/patch

src/engine/Operation.h#L252

Added line #L252 was not covered by tests
[[nodiscard]] const std::vector<std::string>& getWarnings() const {
return _warnings;
return warnings_;
}

// Check if the cancellation flag has been set and throw an exception if
Expand Down Expand Up @@ -374,7 +380,7 @@

// Collect all the warnings that were created during the creation or
// execution of this operation.
std::vector<std::string> _warnings;
std::vector<std::string> warnings_;

// The limit from a SPARQL `LIMIT` clause.

Expand Down
13 changes: 11 additions & 2 deletions src/engine/QueryExecutionTree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,20 @@ std::string QueryExecutionTree::getCacheKey() const {

// _____________________________________________________________________________
size_t QueryExecutionTree::getVariableColumn(const Variable& variable) const {
auto optIdx = getVariableColumnOrNullopt(variable);
if (!optIdx.has_value()) {
AD_THROW("Variable could not be mapped to result column. Var: " +
variable.name());
}
return optIdx.value();
}
// _____________________________________________________________________________
std::optional<size_t> QueryExecutionTree::getVariableColumnOrNullopt(
const Variable& variable) const {
AD_CONTRACT_CHECK(rootOperation_);
const auto& varCols = getVariableColumns();
if (!varCols.contains(variable)) {
AD_THROW("Variable could not be mapped to result column. Var: " +
variable.name());
return std::nullopt;
}
return varCols.at(variable).columnIndex_;
}
Expand Down
7 changes: 7 additions & 0 deletions src/engine/QueryExecutionTree.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,15 @@ class QueryExecutionTree {

bool isEmpty() const { return !rootOperation_; }

// Get the column index that the given `variable` will have in the result of
// this query. Throw if the variable is not part of the `VariableToColumnMap`.
size_t getVariableColumn(const Variable& variable) const;

// Similar to `getVariableColumn` above, but return `nullopt` if the variable
// is not part of the `VariableToColumnMap`.
std::optional<size_t> getVariableColumnOrNullopt(
const Variable& variable) const;

size_t getResultWidth() const { return rootOperation_->getResultWidth(); }

std::shared_ptr<const Result> getResult(bool requestLaziness = false) const {
Expand Down
35 changes: 29 additions & 6 deletions src/engine/QueryPlanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,24 +191,35 @@ std::vector<QueryPlanner::SubtreePlan> QueryPlanner::createExecutionTrees(

checkCancellation();

for (const auto& warning : pq.warnings()) {
warnings_.push_back(warning);
}
return lastRow;
}

// _____________________________________________________________________
// _____________________________________________________________________________
QueryExecutionTree QueryPlanner::createExecutionTree(ParsedQuery& pq,
bool isSubquery) {
try {
auto lastRow = createExecutionTrees(pq, isSubquery);
auto minInd = findCheapestExecutionTree(lastRow);
LOG(DEBUG) << "Done creating execution plan.\n";
return *lastRow[minInd]._qet;
LOG(DEBUG) << "Done creating execution plan" << std::endl;
auto result = std::move(*lastRow[minInd]._qet);
auto& rootOperation = *result.getRootOperation();
// Collect all the warnings and pass them to the created tree such that
// they become visible to the user once the query is executed.
for (const auto& warning : warnings_) {
rootOperation.addWarning(warning);
}
warnings_.clear();
return result;
} catch (ad_utility::CancellationException& e) {
e.setOperation("Query planning");
throw;
}
}

// _____________________________________________________________________
// _____________________________________________________________________________
std::vector<QueryPlanner::SubtreePlan> QueryPlanner::optimize(
ParsedQuery::GraphPattern* rootPattern) {
QueryPlanner::GraphPatternPlanner optimizer{*this, rootPattern};
Expand Down Expand Up @@ -387,9 +398,21 @@ vector<QueryPlanner::SubtreePlan> QueryPlanner::getOrderByRow(
plan._idsOfIncludedFilters = parent._idsOfIncludedFilters;
plan.idsOfIncludedTextLimits_ = parent.idsOfIncludedTextLimits_;
vector<pair<ColumnIndex, bool>> sortIndices;
// Collect the variables of the ORDER BY or INTERNAL SORT BY clause. Ignore
// variables that are not visible in the query body (according to the
// SPARQL standard, they are allowed but have no effect).
for (auto& ord : pq._orderBy) {
sortIndices.emplace_back(parent._qet->getVariableColumn(ord.variable_),
ord.isDescending_);
auto idx = parent._qet->getVariableColumnOrNullopt(ord.variable_);
if (!idx.has_value()) {
continue;
}
sortIndices.emplace_back(idx.value(), ord.isDescending_);
}

// If none of these variables was bound, we can omit the whole ORDER BY
// or INTERNAL SORT BY clause.
if (sortIndices.empty()) {
return previous;
}

if (pq._isInternalSort == IsInternalSort::True) {
Expand Down
5 changes: 5 additions & 0 deletions src/engine/QueryPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,11 @@ class QueryPlanner {

std::optional<size_t> textLimit_ = std::nullopt;

// Used to collect warnings (created by the parser or the query planner) which
// are then passed on to the created `QueryExecutionTree` such that they can
// be reported as part of the query result if desired.
std::vector<std::string> warnings_;

[[nodiscard]] std::vector<QueryPlanner::SubtreePlan> optimize(
ParsedQuery::GraphPattern* rootPattern);

Expand Down
18 changes: 12 additions & 6 deletions src/engine/sparqlExpressions/LiteralExpression.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ class LiteralExpression : public SparqlExpression {
string getCacheKey(const VariableToColumnMap& varColMap) const override {
if constexpr (std::is_same_v<T, ::Variable>) {
if (!varColMap.contains(_value)) {
AD_THROW(absl::StrCat("Variable ", _value.name(), " not found"));
return "Unbound Variable";
}
return {"#column_" + std::to_string(varColMap.at(_value).columnIndex_) +
"#"};
Expand Down Expand Up @@ -158,18 +158,24 @@ class LiteralExpression : public SparqlExpression {
return std::move(resultFromSameRow.value());
}
}

// If the variable is not part of the input, then it is always UNDEF.
auto column = context->getColumnIndexForVariable(variable);
if (!column.has_value()) {
return Id::makeUndefined();
}
// If a variable is grouped, then we know that it always has the same
// value and can treat it as a constant. This is not possible however when
// we are inside an aggregate, because for example `SUM(?variable)` must
// still compute the sum over the whole group.
if (context->_groupedVariables.contains(variable) && !isInsideAggregate()) {
auto column = context->getColumnIndexForVariable(variable);
const auto& table = context->_inputTable;
auto constantValue = table.at(context->_beginIndex, column);
assert((std::ranges::all_of(
auto constantValue = table.at(context->_beginIndex, column.value());
AD_EXPENSIVE_CHECK((std::ranges::all_of(
table.begin() + context->_beginIndex,
table.begin() + context->_endIndex,
[&](const auto& row) { return row[column] == constantValue; })));
table.begin() + context->_endIndex, [&](const auto& row) {
return row[column.value()] == constantValue;
})));
return constantValue;
} else {
return variable;
Expand Down
6 changes: 5 additions & 1 deletion src/engine/sparqlExpressions/RegexExpression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,11 @@ ExpressionResult RegexExpression::evaluatePrefixRegex(
// of binary searches and the result is a set of intervals.
std::vector<ad_utility::SetOfIntervals> resultSetOfIntervals;
if (context->isResultSortedBy(variable)) {
auto column = context->getColumnIndexForVariable(variable);
auto optColumn = context->getColumnIndexForVariable(variable);
AD_CORRECTNESS_CHECK(optColumn.has_value(),
"We have previously asserted that the input is sorted "
"by the variable, so we expect it to exist");
const auto& column = optColumn.value();
for (auto [lowerId, upperId] : lowerAndUpperIds) {
// Two binary searches to find the lower and upper bounds of the range.
auto lower = std::lower_bound(
Expand Down
6 changes: 5 additions & 1 deletion src/engine/sparqlExpressions/RelationalExpressions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,13 @@ ad_utility::SetOfIntervals evaluateWithBinarySearch(
// Set up iterators into the `IdTable` that only access the column where the
// `variable` is stored.
auto columnIndex = context->getColumnIndexForVariable(variable);
AD_CORRECTNESS_CHECK(
columnIndex.has_value(),
"The input must be sorted to evaluate a relational expression using "
"binary search. This should never happen for unbound variables");

auto getIdFromColumn = ad_utility::makeAssignableLambda(
[columnIndex](const auto& idTable, auto i) {
[columnIndex = columnIndex.value()](const auto& idTable, auto i) {
return idTable(i, columnIndex);
});

Expand Down
7 changes: 3 additions & 4 deletions src/engine/sparqlExpressions/SparqlExpressionTypes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,11 @@ bool EvaluationContext::isResultSortedBy(const Variable& variable) {
}

// _____________________________________________________________________________
[[nodiscard]] ColumnIndex EvaluationContext::getColumnIndexForVariable(
const Variable& var) const {
[[nodiscard]] std::optional<ColumnIndex>
EvaluationContext::getColumnIndexForVariable(const Variable& var) const {
const auto& map = _variableToColumnMap;
if (!map.contains(var)) {
throw std::runtime_error(absl::StrCat(
"Variable ", var.name(), " was not found in input to expression."));
return std::nullopt;
}
return map.at(var).columnIndex_;
}
Expand Down
2 changes: 1 addition & 1 deletion src/engine/sparqlExpressions/SparqlExpressionTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ struct EvaluationContext {
[[nodiscard]] size_t size() const;

// ____________________________________________________________________________
[[nodiscard]] ColumnIndex getColumnIndexForVariable(
[[nodiscard]] std::optional<ColumnIndex> getColumnIndexForVariable(
const Variable& var) const;

// _____________________________________________________________________________
Expand Down
3 changes: 2 additions & 1 deletion src/global/RuntimeParameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ inline auto& RuntimeParameters() {
Bool<"group-by-hash-map-enabled">{false},
Bool<"group-by-disable-index-scan-optimizations">{false},
SizeT<"service-max-value-rows">{10'000},
SizeT<"query-planning-budget">{1500}};
SizeT<"query-planning-budget">{1500},
Bool<"throw-on-unbound-variables">{false}};
}();
return params;
}
Expand Down
Loading
Loading