feat: Use Psycopg3 COPY #451
base: main
@@ -33,10 +33,11 @@ packages = [
 
 [tool.poetry.dependencies]
 python = ">=3.8"
-faker = {version = "~=30.0", optional = true}
-psycopg2-binary = "2.9.9"
+faker = {version = "~=29.0", optional = true}
 sqlalchemy = "~=2.0"
 sshtunnel = "0.4.0"
+psycopg = "^3.2.3"
+psycopg-binary = "^3.2.3"
Comment on lines +39 to +40:

Is there a reason we need both the source and binary packages?

@edgarrmondragon This can be changed to just psycopg. Although now I am wondering: if a user wants to use psycopg[c] or psycopg[binary], what would be the suggestion?

dbt-labs/dbt-postgres#96 is probably a good case study. Most users in the data space can't or don't want to build C extensions, so we'll probably prefer …
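As a side note on the extras question (not part of this PR): psycopg 3 reports which libpq implementation it loaded, so a quick check like the sketch below shows whether the binary, C, or pure-Python implementation is actually in use once `psycopg` (with or without an extra) is installed.

```python
# Sketch: report which psycopg 3 implementation was installed/loaded.
# See https://www.psycopg.org/psycopg3/docs/api/pq.html
import psycopg
from psycopg import pq

# pq.__impl__ is "binary", "c", or "python" depending on whether
# psycopg[binary], psycopg[c], or plain psycopg is providing libpq access.
print(f"psycopg {psycopg.__version__} / libpq impl: {pq.__impl__}")
print(f"libpq version: {pq.version()}")
```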
 
 [tool.poetry.dependencies.singer-sdk]
 version = "~=0.40.0a1"
@@ -109,4 +110,4 @@ banned-from = ["sqlalchemy"]
 sqlalchemy = "sa"
 
 [tool.ruff.lint.pydocstyle]
-convention = "google"
+convention = "google"
SpaceCondor marked this conversation as resolved.
@@ -119,6 +119,25 @@ def generate_temp_table_name(self):
         # in postgres, used a guid just in case we are using the same session
         return f"{str(uuid.uuid4()).replace('-', '_')}"
 
+    def generate_copy_statement(
+        self,
+        full_table_name: str | FullyQualifiedName,
+        columns: list[sa.Column],  # type: ignore[override]
+    ) -> str:
+        """Generate a copy statement for bulk copy.
+
+        Args:
+            full_table_name: the target table name.
+            columns: the target table columns.
+
+        Returns:
+            A copy statement.
+        """
+        columns_list = ", ".join(f'"{column.name}"' for column in columns)
+        sql: str = f'COPY "{full_table_name}" ({columns_list}) FROM STDIN'
+
+        return sql
+
     def bulk_insert_records(  # type: ignore[override]
         self,
         table: sa.Table,
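To make the new helper concrete, here is a small illustration (column and table names invented) of the statement it builds; the quoting of the table name and each column matches the f-string above.

```python
# Illustration only: reproduce the string built by generate_copy_statement
# for a hypothetical two-column target table.
import sqlalchemy as sa

columns = [sa.Column("id", sa.Integer), sa.Column("name", sa.Text)]
columns_list = ", ".join(f'"{column.name}"' for column in columns)
copy_statement = f'COPY "target_table" ({columns_list}) FROM STDIN'
print(copy_statement)  # COPY "target_table" ("id", "name") FROM STDIN
```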
@@ -145,35 +164,55 @@ def bulk_insert_records(  # type: ignore[override]
             True if table exists, False if not, None if unsure or undetectable.
         """
         columns = self.column_representation(schema)
-        insert: str = t.cast(
-            str,
-            self.generate_insert_statement(
-                table.name,
-                columns,
-            ),
-        )
-        self.logger.info("Inserting with SQL: %s", insert)
-        # Only one record per PK, we want to take the last one
-        data_to_insert: list[dict[str, t.Any]] = []
+        copy_statement: str = self.generate_copy_statement(table.name, columns)
+        self.logger.info("Inserting with SQL: %s", copy_statement)
+
+        data_to_copy: list[dict[str, t.Any]] = []
+
+        # If append only is False, we only take the latest record, one per primary key
         if self.append_only is False:
-            insert_records: dict[tuple, dict] = {}  # pk tuple: record
+            unique_copy_records: dict[tuple, dict] = {}  # pk tuple: values
             for record in records:
                 insert_record = {
                     column.name: record.get(column.name) for column in columns
                 }
                 # No need to check for a KeyError here because the SDK already
                 # guarantees that all key properties exist in the record.
                 primary_key_tuple = tuple(record[key] for key in primary_keys)
-                insert_records[primary_key_tuple] = insert_record
-            data_to_insert = list(insert_records.values())
+                unique_copy_records[primary_key_tuple] = insert_record
+            data_to_copy = list(unique_copy_records.values())
         else:
             for record in records:
                 insert_record = {
                     column.name: record.get(column.name) for column in columns
                 }
-                data_to_insert.append(insert_record)
-        connection.execute(insert, data_to_insert)
+                data_to_copy.append(insert_record)
+
+        # Prepare to process the rows into csv. Use each column's bind_processor to do
+        # most of the work, then do the final construction of the csv rows ourselves
+        # to control exactly how values are converted and which ones are quoted.
+        column_bind_processors = {
+            column.name: column.type.bind_processor(connection.dialect)
+            for column in columns
+        }
+
+        # Use copy to run the copy statement.
+        # https://www.psycopg.org/psycopg3/docs/basic/copy.html
+        with connection.connection.cursor().copy(copy_statement) as copy:  # type: ignore[attr-defined]
Comment on the line above:

What happens at this point if someone sets …

@edgarrmondragon It would raise an exception. In the current main branch I don't think using anything aside from …

I'm not 100% sure, but I don't think we use driver-specific APIs; we rely on SQLAlchemy DDL/DML in all places, so I would expect most drivers to work. Maybe I'm wrong.
+            for row in data_to_copy:
+                processed_row = []
+                for row_column_name in row:
+                    if column_bind_processors[row_column_name] is not None:
+                        processed_row.append(
+                            column_bind_processors[row_column_name](
+                                row[row_column_name]
+                            )
+                        )
+                    else:
+                        processed_row.append(row[row_column_name])
+
+                copy.write_row(processed_row)
 
         return True
 
     def upsert(
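For reference, the psycopg 3 COPY protocol the new code relies on looks like this when used standalone; the connection string, table, and rows below are placeholders, not values from this PR.

```python
# Standalone sketch of psycopg 3's COPY ... FROM STDIN support
# (https://www.psycopg.org/psycopg3/docs/basic/copy.html); names are placeholders.
import psycopg

with psycopg.connect("dbname=example user=postgres") as conn:
    with conn.cursor() as cur:
        with cur.copy('COPY "my_table" ("id", "name") FROM STDIN') as copy:
            for row in [(1, "alice"), (2, "bob")]:
                # write_row() adapts and escapes each Python value for COPY.
                copy.write_row(row)
```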
@@ -8,7 +8,7 @@
 
 def postgres_config():
     return {
-        "dialect+driver": "postgresql+psycopg2",
+        "dialect+driver": "postgresql+psycopg",
         "host": "localhost",
         "user": "postgres",
         "password": "postgres",
@@ -29,7 +29,7 @@ def postgres_config():
 
 def postgres_config_no_ssl():
     return {
-        "dialect+driver": "postgresql+psycopg2",
+        "dialect+driver": "postgresql+psycopg",
         "host": "localhost",
         "user": "postgres",
         "password": "postgres",
@@ -43,7 +43,7 @@ def postgres_config_no_ssl():
 
 def postgres_config_ssh_tunnel():
     return {
-        "sqlalchemy_url": "postgresql://postgres:[email protected]:5432/main",
+        "sqlalchemy_url": "postgresql+psycopg://postgres:[email protected]:5432/main",
         "ssh_tunnel": {
             "enable": True,
             "host": "127.0.0.1",
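For context (not part of the diff): in SQLAlchemy 2.x the `postgresql+psycopg` URL prefix selects the psycopg 3 driver, while `postgresql+psycopg2` keeps using psycopg2. A minimal engine setup, with placeholder credentials, looks like this:

```python
# Minimal sketch (placeholder host/credentials): SQLAlchemy resolves the
# "postgresql+psycopg" URL to the psycopg 3 driver.
import sqlalchemy as sa

engine = sa.create_engine(
    "postgresql+psycopg://postgres:postgres@localhost:5432/postgres"
)

with engine.connect() as connection:
    print(connection.execute(sa.text("SELECT version()")).scalar_one())
```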
Comment on the faker version change:

Any reason to downgrade this?

Nope! My branch was slightly outdated 😅