From c484f6a95315b7fa266f3554b1aecce5ccd1601e Mon Sep 17 00:00:00 2001 From: Jim Idle Date: Mon, 21 Oct 2024 06:10:20 -0600 Subject: [PATCH] Ensure that unparsable text is not lost in the generated output (#1012) The default ErrorStrategy for ANTLR generated parsers performs a sophisticated error recovery synchronization process for both PLUS_LOOP_BACK and STAR_LOOP_BACK as well as token manufacture/insertion and single token deletion within token sequences. This culminates and a call to `recover()` which finds the next token in the followSet for an alt that allows the parser to resume. We intercept `recover()` in order to record where we were unable to parse text. While errors are reported, the default error strategy does not preserve any discarded input in the generated ParseTree and so this is lost if we generate those parts of the ParseTree that were successfully generated. Here, we implement custom strategies that gather un-parsable input and preserve them as custom error nodes in the ParserTree at strategic insertion points in the higher level rules such as `sqlCommand` (in the case of Snowflake) and `sqlClauses` in the case of TSQL. The visitors for these rules can then first check for an error node in the children and generate an Ir node representing the unparsed text. For this PR to be usable, we need our PLanParser to no longer stop when syntax errors are discovered as it is now safe to walk the ParseTree. That improvement is for a separate PR. --- .../labs/remorph/parsers/ParserCommon.scala | 16 +++++- .../remorph/parsers/SqlErrorStrategy.scala | 52 ++++++++++++++++++- 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/com/databricks/labs/remorph/parsers/ParserCommon.scala b/core/src/main/scala/com/databricks/labs/remorph/parsers/ParserCommon.scala index 63ac515ab..86354bff2 100644 --- a/core/src/main/scala/com/databricks/labs/remorph/parsers/ParserCommon.scala +++ b/core/src/main/scala/com/databricks/labs/remorph/parsers/ParserCommon.scala @@ -3,7 +3,7 @@ package com.databricks.labs.remorph.parsers import com.databricks.labs.remorph.{intermediate => ir} import com.typesafe.scalalogging.LazyLogging import org.antlr.v4.runtime.misc.Interval -import org.antlr.v4.runtime.tree.{AbstractParseTreeVisitor, ParseTree, ParseTreeVisitor, RuleNode} +import org.antlr.v4.runtime.tree._ import org.antlr.v4.runtime.{ParserRuleContext, RuleContext, Token} import scala.collection.JavaConverters._ @@ -123,4 +123,18 @@ trait ParserCommon[A] extends ParseTreeVisitor[A] with LazyLogging { self: Abstr result } } + + /** + * If the parser recognizes a syntax error, then it generates an ErrorNode, which represents text in error + * and contains a manufactured token that encapsulates all the text that the parser ignored when it recovered + * from the error. NOt that if the error recovery strategy inserts a token rather than deletes one, then an + * error node will not be created. + * + * @param node the ErrorNode to visit + * @return The unresolved object representing the error and containing the text that was skipped + */ + override def visitErrorNode(node: ErrorNode): A = { + logger.warn(s"Error node encountered: ${node.getText}") + unresolved(node.getText, "Unparsed input - ErrorNode encountered") + } } diff --git a/core/src/main/scala/com/databricks/labs/remorph/parsers/SqlErrorStrategy.scala b/core/src/main/scala/com/databricks/labs/remorph/parsers/SqlErrorStrategy.scala index 220c1d7b6..27987fac8 100644 --- a/core/src/main/scala/com/databricks/labs/remorph/parsers/SqlErrorStrategy.scala +++ b/core/src/main/scala/com/databricks/labs/remorph/parsers/SqlErrorStrategy.scala @@ -1,7 +1,10 @@ package com.databricks.labs.remorph.parsers +import com.databricks.labs.remorph.parsers.snowflake.SnowflakeParser.SqlCommandContext +import com.databricks.labs.remorph.parsers.tsql.TSqlParser.SqlClausesContext import org.antlr.v4.runtime._ -import org.antlr.v4.runtime.misc.IntervalSet +import org.antlr.v4.runtime.misc.{Interval, IntervalSet, Pair} +import org.antlr.v4.runtime.tree.ErrorNodeImpl /** * Custom error strategy for SQL parsing

While we do not do anything super special here, we wish to override a @@ -15,6 +18,53 @@ import org.antlr.v4.runtime.misc.IntervalSet */ abstract class SqlErrorStrategy extends DefaultErrorStrategy { + @throws[RecognitionException] + override def sync(recognizer: Parser): Unit = { + val tokens: TokenStream = recognizer.getInputStream + val startIndex: Int = tokens.index + val first = tokens.LT(1) + try { + super.sync(recognizer) + } catch { + case e: RecognitionException => throw e // Throw back to parser + } finally { + val endIndex: Int = tokens.index + if (startIndex < endIndex) { + val interval = new Interval(startIndex, endIndex) + val errorToken: CommonToken = new CommonToken( + new Pair(first.getTokenSource, first.getInputStream), + Token.INVALID_TYPE, + Token.DEFAULT_CHANNEL, + first.getStartIndex, + tokens.LT(1).getStopIndex) + errorToken.setText(first.getInputStream.getText(interval)) + errorToken.setLine(first.getLine) + errorToken.setCharPositionInLine(first.getCharPositionInLine) + val errorNode = new ErrorNodeImpl(errorToken) + + // Here we add the error node to the highest level context in the tree for the particular parser, + // so that we do not have to search for error nodes in the children of every visitor context. + // If we added it to the current context, it would mean that every visitor method would + // have to check for it, which would soon become unwieldy. We may come back to that though + // if preserved skipped text is generated far away from the original text in error. + findHighestContext(recognizer.getContext).addErrorNode(errorNode) + } + } + } + + def findHighestContext(ctx: ParserRuleContext): ParserRuleContext = { + @annotation.tailrec + def findContext(currentCtx: ParserRuleContext): ParserRuleContext = { + currentCtx match { + case _: SqlClausesContext | _: SqlCommandContext => currentCtx + case _ if currentCtx.getParent == null => currentCtx + case _ => findContext(currentCtx.getParent.asInstanceOf[ParserRuleContext]) + } + } + + findContext(ctx) + } + // Note that it is not possible to get this error from the current grammar, we would have to do an inordinate // amount of mocking to raise this. It isn't worth the effort. // $COVERAGE-OFF$