Merge pull request #10 from alexmalins/fix-cp-1252-unicode-encoding-decoding-issues

Release 0.3.1: Fix cp 1252 unicode encoding decoding issues
alexmalins authored Aug 4, 2024
2 parents 795502d + 72b9e1f commit c525659
Showing 8 changed files with 523 additions and 518 deletions.
4 changes: 2 additions & 2 deletions .github/issue_template.md
@@ -13,8 +13,8 @@ Note if you have a question about usage or a feature request, use the Discussion

OS version: <!-- Windows 10/Linux/macOS etc. -->
Python version: <!-- 3.8/3.9/3.10/3.11/3.12 -->
-harlequin-databricks version: <!-- ex. 0.3.0 -->
-harlequin version: <!-- ex. 1.19.0 -->
+harlequin-databricks version: <!-- ex. 0.3.1 -->
+harlequin version: <!-- ex. 1.23.1 -->
Installed via: <!-- pip/conda-forge -->


9 changes: 8 additions & 1 deletion CHANGELOG.md
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.

## [Unreleased]

+## [0.3.1] - 2024-08-04
+
+- Fix `UnicodeDecodeError` on Windows due to incorrectly attempting to read `functions.csv` using CP-1252. Now UTF-8 is enforced on all file writes and reads (#7).
+- Update list of Databricks SQL functions for completions (valid as of August 2, 2024).
+- Add harlequin-databricks screenshot to README.

## [0.3.0] - 2024-04-27
@@ -56,7 +61,9 @@ is the one written with hyphens not underscores.

- Adds a Databricks adapter for SQL warehouses and DBR interactive clusters.

-[Unreleased]: https://github.com/alexmalins/harlequin-databricks/compare/0.3.0...HEAD
+[Unreleased]: https://github.com/alexmalins/harlequin-databricks/compare/0.3.1...HEAD

+[0.3.1]: https://github.com/alexmalins/harlequin-databricks/compare/0.3.0...0.3.1

[0.3.0]: https://github.com/alexmalins/harlequin-databricks/compare/0.2.1...0.3.0

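For readers unfamiliar with the bug class this release fixes: on Windows, Python's `open()` without an explicit `encoding` argument falls back to the locale code page (commonly cp1252), so UTF-8 files written on one machine can fail to decode on another. A minimal, platform-independent sketch of the failure mode (the sample string and handling are illustrative, not taken from the adapter):

```python
# Mimic what open() does on a cp1252-locale Windows box when no
# encoding argument is passed: decode UTF-8 bytes as cp1252.
utf8_bytes = "Łukasz\n".encode("utf-8")  # b"\xc5\x81ukasz\n"

try:
    utf8_bytes.decode("cp1252")
except UnicodeDecodeError as err:
    # Byte 0x81 has no mapping in cp1252, so the read blows up.
    print(err)

# The 0.3.1 remedy: pin the codec on every read and write, e.g.
# path.open("r", encoding="utf-8"), so behaviour no longer depends
# on the host locale.
print(utf8_bytes.decode("utf-8"))  # round-trips cleanly
```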
966 changes: 473 additions & 493 deletions poetry.lock

Large diffs are not rendered by default.

20 changes: 10 additions & 10 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "harlequin-databricks"
version = "0.3.0"
version = "0.3.1"
description = "A Harlequin adapter for Databricks."
authors = [
"Zach Shirah <[email protected]>",
@@ -24,19 +24,19 @@ harlequin = "^1.17"
databricks-sql-connector = "^3.0.3"

[tool.poetry.group.dev.dependencies]
black = "^24.4.2"
ruff = "^0.4.2"
pytest = "^8.1.2"
mypy = "^1.10.0"
black = "^24.8.0"
ruff = "^0.5"
pytest = ">=7.3.1,<9.0.0"
mypy = "^1.11.0"
pre-commit = "^3.5.0"
importlib_metadata = { version = ">=7.1.0", python = "<3.10.0" }
beautifulsoup4 = "^4.12.3"
importlib_metadata = { version = ">=8.0", python = "<3.10.0" }
beautifulsoup4 = "^4.12.0"
types-beautifulsoup4 = "^4.12.0.0"
lxml = "^5.2.1"
lxml = "^5.2.2"
pandas = "^2.0.0"
pandas-stubs = "^2.0.0.230412"
requests = "^2.31.0"
types-requests = "^2.31.0.20240406"
requests = "^2.32.0"
types-requests = "^2.32.0.20240712"

[tool.poetry.urls]
Changelog = "https://github.com/alexmalins/harlequin-databricks/blob/main/CHANGELOG.md"
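A side note on the dependency bumps above: Poetry's caret operator is shorthand for a half-open version range, so `pytest = "^8.1.2"` means >=8.1.2,<9.0.0, and the switch to an explicit `>=7.3.1,<9.0.0` range re-admits the pytest 7 series. A quick check with the `packaging` library (the constraint translations are illustrative, not code from this repo):

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

caret = SpecifierSet(">=8.1.2,<9.0.0")    # what "^8.1.2" desugars to
relaxed = SpecifierSet(">=7.3.1,<9.0.0")  # the new explicit range

for candidate in ["7.4.4", "8.3.2"]:
    v = Version(candidate)
    print(candidate, v in caret, v in relaxed)
# 7.4.4 False True  (the caret pin rejected the pytest 7 series)
# 8.3.2 True True
```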
10 changes: 5 additions & 5 deletions scripts/scrape_functions.py
@@ -2,8 +2,8 @@

import pandas as pd

-# Last run with Databricks's SQL functions page updated on April 24, 2024. This is archived at:
-# https://web.archive.org/web/20240427141224/https://docs.databricks.com/en/sql/language-manual/sql-ref-functions-builtin.html
+# Last run with Databricks's SQL functions page updated on August 2, 2024. This is archived at:
+# https://web.archive.org/web/20240804040816/https://docs.databricks.com/en/sql/language-manual/sql-ref-functions-builtin.html
URL = (
"https://docs.databricks.com/en/sql/language-manual/sql-ref-functions-builtin.html"
)
@@ -21,8 +21,8 @@ def scrape_functions_and_descriptions() -> pd.DataFrame:

# Remove operators and expressions, then de-dupe
functions_only = combined[
(combined["Function"].str.contains("\("))
& (combined["Function"].str.contains("\)"))
(combined["Function"].str.contains("\\("))
& (combined["Function"].str.contains("\\)"))
].copy()
functions_only["Name"] = functions_only["Function"].str.split("(").str[0]
deduped = functions_only.groupby("Name").first()
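The escaping change above is subtler than it looks: pandas' `Series.str.contains` treats its pattern as a regular expression, so a parenthesis must be escaped to match literally, and the old `"\("` spelled that escape via an invalid string escape sequence that newer Pythons flag with a `SyntaxWarning`. `"\\("` produces the same two-character regex `\(` explicitly. A standalone sketch with made-up sample data:

```python
import pandas as pd

combined = pd.DataFrame({"Function": ["abs(expr)", "AND", "ceil(expr)", "!"]})

# str.contains() interprets its pattern as a regex; "\\(" is the
# two-character regex escape for a literal opening parenthesis.
functions_only = combined[
    (combined["Function"].str.contains("\\("))
    & (combined["Function"].str.contains("\\)"))
].copy()
print(functions_only["Function"].tolist())  # ['abs(expr)', 'ceil(expr)']

# Equivalent and warning-free: opt out of regex matching entirely.
literal = combined["Function"].str.contains("(", regex=False)
```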
@@ -36,7 +36,7 @@ def main() -> None:
# Overwrite file in ../src/harlequin_databricks/functions.csv
# creating it if it doesn't exist
path = Path(__file__).parents[1] / "src" / "harlequin_databricks" / "functions.csv"
-functions.to_csv(path)
+functions.to_csv(path, encoding="utf-8")


if __name__ == "__main__":
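The `to_csv` change completes the same contract on the write side. Note that pandas' `to_csv` already defaults to UTF-8 when writing to a path, so the explicit `encoding="utf-8"` mainly documents intent and keeps the writer visibly in lockstep with the UTF-8 readers in completions.py. A round-trip sketch with hypothetical data:

```python
import csv
from pathlib import Path

import pandas as pd

path = Path("functions_demo.csv")  # hypothetical file name

# "Ł" encodes to UTF-8 bytes including 0x81, which has no cp1252
# mapping: exactly the kind of content that crashed the old reader.
functions = pd.DataFrame(
    {"Function": ["demo(expr)"], "Description": ["Contains Ł."]},
    index=pd.Index(["demo"], name="Name"),
)
functions.to_csv(path, encoding="utf-8")

# Read it back the way completions.py does:
with path.open("r", encoding="utf-8") as file:
    rows = list(csv.reader(file, dialect="unix"))
print(rows[1])  # ['demo', 'demo(expr)', 'Contains Ł.']
```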
2 changes: 1 addition & 1 deletion scripts/scrape_keywords.py
@@ -47,7 +47,7 @@ def main() -> None:
# Overwrite file in ../src/harlequin_databricks/keywords.csv
# creating it if it doesn't exist
path = Path(__file__).parents[1] / "src" / "harlequin_databricks" / "keywords.csv"
with path.open("w+") as file:
with path.open("w+", encoding="utf-8") as file:
writer = csv.writer(file, delimiter="\n")
writer.writerow(keywords)

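Same pitfall, same cure for the keywords file. Incidentally, `csv.writer` with `delimiter="\n"` joins the fields of the single row with newlines, which is how the file ends up with one keyword per line. A compact sketch (keyword list abridged, file name hypothetical):

```python
import csv
from pathlib import Path

keywords = ["SELECT", "WHERE", "QUALIFY"]  # abridged for illustration

path = Path("keywords_demo.csv")  # hypothetical file name
with path.open("w+", encoding="utf-8") as file:
    # delimiter="\n" makes writerow() join the fields with newlines,
    # producing one keyword per line instead of one CSV record.
    writer = csv.writer(file, delimiter="\n")
    writer.writerow(keywords)

print(path.read_text(encoding="utf-8"))
```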
4 changes: 2 additions & 2 deletions src/harlequin_databricks/completions.py
@@ -10,7 +10,7 @@ def load_completions() -> list[HarlequinCompletion]:
completions: list[HarlequinCompletion] = []

keywords_path = Path(__file__).parent / "keywords.csv"
with keywords_path.open("r") as file:
with keywords_path.open("r", encoding="utf-8") as file:
reader = csv.reader(file, dialect="unix")
for row in reader:
completions.append(
@@ -24,7 +24,7 @@
)

functions_path = Path(__file__).parent / "functions.csv"
with functions_path.open("r") as file:
with functions_path.open("r", encoding="utf-8") as file:
reader = csv.reader(file, dialect="unix")
next(reader) # Skip header row
for name, _, _ in reader:
26 changes: 22 additions & 4 deletions src/harlequin_databricks/functions.csv
@@ -10,10 +10,11 @@ ai_analyze_sentiment,ai_analyze_sentiment(content),Returns the sentiment of a te
ai_classify,"ai_classify(content, labels)",Classifies the provided content into one of the provided labels.
ai_extract,"ai_extract(content, labels)",Extracts entities specified by labels from a given text.
ai_fix_grammar,ai_fix_grammar(content),Corrects grammatical errors in a given text.
+ai_forecast,"ai_forecast(observed, time_col)",Extrapolates time series data into the future.
ai_gen,ai_gen(content),Invokes a state-of-the-art generative AI model from Databricks Foundation Model APIs to answer the user-provided prompt.
ai_generate_text,"ai_generate_text(prompt, modelName[, param1, value1] […])",Deprecated: Returns text generated by a selected large language model (LLM) given the prompt.
ai_mask,"ai_mask(content, labels)",Masks specified entities within a given text.
-ai_query,"ai_query(endpointName, request, returnType)",Invokes an existing Databricks Model Serving endpoint and parses and returns its response.
+ai_query,"ai_query(endpointName, request, returnType)",Invokes an existing Mosaic AI Model Serving endpoint and parses and returns its response.
ai_similarity,"ai_similarity(strExpr1, strExpr2)",Compares two strings and computes the semantic similarity score.
ai_summarize,"ai_summarize(content[, max_words])",Generates a summary of a given text.
ai_translate,"ai_translate(content, to_lang)",Translates text to a specified target language.
@@ -173,8 +174,9 @@ grouping,grouping(col),"Indicates whether a specified column in a GROUPING SET,
grouping_id,"grouping_id([col1 [, …]])",Returns the level of grouping for a set of columns.
hash,"hash(expr1 [, …])",Returns a hashed value of the arguments.
hex,hex(expr),Converts expr to hexadecimal.
+histogram_numeric,"histogram_numeric(expr,numBins)","Computes a histogram on expr with numBins bins, returning an array of pairs representing the bin centers."
hll_sketch_agg,"hll_sketch_agg(expr[,lgConfigK])",Returns a HyperLogLog sketch used to approximate a distinct values count.
-hll_sketch_estimate,hll_sketch_estimate(expr),Etimates number of distinct values collected in a HyperLogLog sketch.
+hll_sketch_estimate,hll_sketch_estimate(expr),Estimates number of distinct values collected in a HyperLogLog sketch.
hll_union,"hll_union(expr1, expr2 [,allowDifferentLgConfigK])",Combines two HyperLogLog sketches.
hll_union_agg,"hll_union_agg(expr[,allowDifferentLgConfigK])",Aggregates HyperLogLog sketches for a group of rows.
hour,hour(expr),Returns the hour component of a timestamp.
@@ -192,6 +194,7 @@ instr,"instr(str, substr)",Returns the (1-based) index of the first occurrence o
int,int(expr),Casts the value expr to INTEGER.
is_account_group_member,is_account_group_member(group),Returns true if the current user is a member of group at the account level.
is_member,is_member(group),Returns true if the current user is a member of group at the workspace level.
+is_variant_null,is_variant_null(variantExpr),Tests whether variantExpr is a VARIANT-encoded NULL.
isnan,isnan(expr),Returns true if expr is NaN.
isnotnull,isnotnull(expr),Returns true if expr is not NULL.
isnull,isnull(expr),Returns true if expr is NULL.
@@ -211,7 +214,7 @@ left,"left(str, len)",Returns the leftmost len characters from str.
len,len(expr),Returns the character length of string data or number of bytes of binary data.
length,length(expr),Returns the character length of string data or number of bytes of binary data.
levenshtein,"levenshtein(str1, str2)",Returns the Levenshtein distance between the strings str1 and str2.
-list_secrets,list_secrets(),Returns the keys which the user is authorized to see from Databricks secret service.
+list_secrets,list_secrets([scopeStr]),Returns the keys in all or one scope which the user is authorized to see from Databricks secret service.
ln,ln(expr),Returns the natural logarithm (base e) of expr.
locate,"locate(substr, str[, pos])",Returns the position of the first occurrence of substr in str after position pos.
log,"log([base,] expr)",Returns the logarithm of expr with base.
@@ -263,6 +266,7 @@ nvl,"nvl(expr1, expr2)","Returns expr2 if expr1 is NULL, or expr1 otherwise."
nvl2,"nvl2(expr1, expr2, expr3)","Returns expr2 if expr1 is not NULL, or expr3 otherwise."
octet_length,octet_length(expr),Returns the byte length of string data or number of bytes of binary data.
overlay,overlay(input PLACING replace FROM pos [FOR len]),Replaces input with replace that starts at pos and is of length len.
+parse_json,parse_json(jsonStr),Returns a VARIANT value from the jsonStr.
parse_url,"parse_url(url, partToExtract[, key])",Extracts a part from url.
percent_rank,percent_rank(),Computes the percentage ranking of a value within the partition.
percentile,"percentile(expr, percentage [,frequency])",Returns the exact percentile value of expr at the specified percentage.
@@ -322,6 +326,8 @@ rtrim,"rtrim([trimStr,] str)",Returns str with trailing characters removed.
schema_of_csv,"schema_of_csv(csv[, options])",Returns the schema of a CSV string in DDL format.
schema_of_json,"schema_of_json(jsonStr[, options])",Returns the schema of a JSON string in DDL format.
schema_of_json_agg,"schema_of_json_agg(json[, options])",Returns the combined schema of JSON strings in a group in DDL format.
+schema_of_variant,schema_of_variant(variantExpr),Returns the schema of a VARIANT expression in DDL format.
+schema_of_variant_agg,schema_of_variant_agg(variantExpr),Returns the combined schema of all VARIANT values in a group in DDL format.
schema_of_xml,"schema_of_xml(xmlStr[, options])",Returns the schema of a XML string in DDL format.
sec,sec(expr),Returns the secant of expr.
second,second(expr),Returns the second component of the timestamp in expr.
@@ -384,7 +390,7 @@ to_binary,"to_binary(expr[, fmt])",Returns expr cast to a Binary based on fmt.
to_char,"to_char(numExpr, fmt)",Returns numExpr cast to STRING using formatting fmt.
to_csv,"to_csv(expr[, options])",Returns a CSV string with the specified struct value.
to_date,"to_date(expr[,fmt])",Returns expr cast to a date using an optional formatting.
-to_json,"to_json(expr[, options])",Returns a JSON string with the struct specified in expr.
+to_json,"to_json(expr[, options])",Returns a JSON string with the STRUCT or VARIANT specified in expr.
to_number,"to_number(expr, fmt )",Returns expr cast to DECIMAL using formatting fmt.
to_timestamp,"to_timestamp(expr[,fmt])",Returns expr cast to a timestamp using an optional formatting.
to_unix_timestamp,"to_unix_timestamp(expr[,fmt])",Returns the timestamp in expr as a UNIX timestamp.
@@ -402,13 +408,19 @@ try_avg,try_avg(expr),"Returns the mean calculated from values of a group, NULL
try_cast,try_cast(expr AS type),Casts the value expr to the target data type type safely.
try_divide,"try_divide(dividend, divisor)","Returns dividend divided by divisor, or NULL if divisor is 0."
try_element_at,"try_element_at(arrayExpr, index)","Returns the element of an arrayExpr at index, or NULL if index is out of bound."
+try_mod,"try_mod(dividend, divisor)","Returns the remainder after dividend / divisor, or NULL if divisor is 0."
try_multiply,"try_multiply(multiplier, multiplicand)","Returns multiplier multiplied by multiplicand, or NULL on overflow."
+try_parse_json,try_parse_json(jsonStr),"Returns a VARIANT value from the jsonStr if possible. If not possible, NULL is returned."
+try_reflect,"try_reflect(class, method[, arg1 [, …]])","Calls a method with reflection, returning NULL if the method fails."
try_secret,"try_secret(scope, key)","Extracts a secret value with the given scope and key from Databricks secret service, or NULL if the key cannot be retrieved."
try_subtract,"try_subtract(expr1, expr2)","Returns the subtraction of expr2 from expr1, or NULL on overflow."
try_sum,try_sum(expr),"Returns the sum calculated from values of a group, NULL if there is an overflow."
try_to_binary,"try_to_binary(expr [, fmt])","Returns expr cast to BINARY based on fmt, or NULL if the input is invalid."
try_to_number,"try_to_number(expr, fmt )","Returns expr cast to DECIMAL using formatting fmt, or NULL if expr does not match the format."
try_to_timestamp,"try_to_timestamp(expr[,fmt])","Returns expr cast to a timestamp using an optional formatting, or NULL if the cast fails."
+try_url_decode,try_url_decode(str),"Translates a string back from application/x-www-form-urlencoded format, and returns NULL in case of error."
+try_variant_get,"try_variant_get(variantExpr,path,type)","Extracts a value of type type from variantExpr, specified by path, or NULL if it is not possible to cast to the target type."
+try_zstd_decompress,try_zstd_decompress(value),"Returns value decompressed with Zstandard compression, or NULL if the input is invalid."
typeof,typeof(expr),Return a DDL-formatted type string for the data type of expr.
ucase,ucase(expr),Returns expr with all characters changed to uppercase.
unbase64,unbase64(expr),Returns a decoded base64 string as binary.
@@ -426,6 +438,10 @@ uuid,uuid(),Returns an universally unique identifier (UUID) string.
var_pop,var_pop(expr),Returns the population variance calculated from values of a group.
var_samp,var_samp(expr),Returns the sample variance calculated from values of a group.
variance,variance(expr),Returns the sample variance calculated from values of a group.
+variant_explode,variant_explode(variantExpr),Returns a set of rows by un-nesting variantExpr.
+variant_explode_outer,variant_explode_outer(variantExpr),Returns a set of rows by un-nesting variantExpr using outer semantics.
+variant_get,"variant_get(variantExpr,path,type)","Extracts a value of type type from variantExpr, specified by path."
+vector_search,"vector_search(index, query, num_results)",Query a Mosaic AI Vector Search index using SQL.
version,version(),Returns the Apache Spark version.
weekday,weekday(expr),Returns the day of the week of expr.
weekofyear,weekofyear(expr),Returns the week of the year of expr.
@@ -444,3 +460,5 @@ xpath_string,"xpath_string(xml, xpath)",Returns the contents of the first XML no
xxhash64,"xxhash64(expr1 [, …])",Returns a 64-bit hashed value of the arguments.
year,year(expr),Returns the year component of expr.
zip_with,"zip_with(expr1, expr2, func)","Merges the arrays in expr1 and expr2, element-wise, into a single array using func."
+zstd_compress ,"zstd_compress (value[,level[,streaming_mode]])",Returns value compressed with Zstandard compression.
+zstd_decompress,zstd_decompress(value),Returns value decompressed with Zstandard compression.
