HIVE-28600: Iceberg: Check that table/partition requires compaction before compacting
Dmitriy Fingerman committed Nov 6, 2024
1 parent 18f34e7 · commit 485cb6f
Showing 17 changed files with 464 additions and 107 deletions.
228 changes: 228 additions & 0 deletions
...g/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergCompaction.java
@@ -0,0 +1,228 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg.mr.hive;

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.CompactionType;
import org.apache.hadoop.hive.ql.exec.mr.ExecMapper;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.mr.TestHelper;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

public class TestHiveIcebergCompaction {

  private static TestHiveShell shell;
  private TestTables testTables;
  @Rule
  public TemporaryFolder temp = new TemporaryFolder();

  static final List<Record> CUSTOMER_RECORDS_1 = TestHelper.RecordsBuilder.newInstance(
      HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .add(0L, "Alice", "Brown")
      .add(1L, "Bob", "Green")
      .build();

  static final List<Record> CUSTOMER_RECORDS_2 = TestHelper.RecordsBuilder.newInstance(
      HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .add(2L, "Bruce", "Brown")
      .add(3L, "Trudy", "Green")
      .add(4L, "Alex", "Pink")
      .build();

  static final List<Record> CUSTOMER_RECORDS_3 = TestHelper.RecordsBuilder.newInstance(
      HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .add(5L, "Bruce", "Blue")
      .add(6L, "Trudy", "Blue")
      .build();

  static final List<Record> CUSTOMER_RECORDS_4 = TestHelper.RecordsBuilder.newInstance(
      HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .add(0L, "Alice", "Brown").build();

  static final List<Record> CUSTOMER_RECORDS_5 = TestHelper.RecordsBuilder.newInstance(
      HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .add(2L, "Bruce", "Brown")
      .build();

  @BeforeClass
  public static void beforeClass() {
    shell = HiveIcebergStorageHandlerTestUtils.shell();
  }

  @AfterClass
  public static void afterClass() throws Exception {
    shell.stop();
  }

  @Before
  public void before() throws IOException {
    testTables = HiveIcebergStorageHandlerTestUtils.testTables(shell, TestTables.TestTableType.HIVE_CATALOG, temp);
    HiveIcebergStorageHandlerTestUtils.init(shell, testTables, temp, "tez");
    HiveConf.setBoolVar(shell.getHiveConf(), HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, false);
  }

  @After
  public void after() throws Exception {
    HiveIcebergStorageHandlerTestUtils.close(shell);
    ExecMapper.setDone(false);
  }

  @Test
  public void testCanCompactPartitioned() {
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
        .identity("last_name").build();

    Table table = testTables.createTable(shell, "customers",
        HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, FileFormat.ORC, CUSTOMER_RECORDS_1, 2);

    shell.executeStatement(testTables.getInsertQuery(CUSTOMER_RECORDS_2,
        TableIdentifier.of("default", "customers"), false));

    shell.executeStatement("DELETE FROM customers WHERE customer_id=3");

    List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id");
    Assert.assertEquals(4, objects.size());
    List<Record> expected = TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
        .add(0L, "Alice", "Brown")
        .add(1L, "Bob", "Green")
        .add(2L, "Bruce", "Brown")
        .add(4L, "Alex", "Pink")
        .build();
    HiveIcebergTestUtils.validateData(expected,
        HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, objects), 0);

    HiveConf conf = new HiveConf();
    conf.setIntVar(HiveConf.ConfVars.ICEBERG_COMPACTION_DELETE_RECORDS_THRESHOLD, 1);
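    // Assumption inferred from the assertions below: with this threshold set to 1, a partition
    // qualifies on the delete-count signal only once its deleted-record count exceeds 1.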
    HiveIcebergStorageHandler storageHandler = new HiveIcebergStorageHandler();
    storageHandler.setConf(conf);
    table.refresh();

    try {
      /*
       * Partition: 'last_name=Brown'.
       * 2 data files, 0 delete files.
       * Existing data size of the partition: 955 bytes
       * 1st file size in bytes: 479
       * 2nd file size in bytes: 476
       */

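      // The three assertions below suggest the size threshold marks any data file smaller than
      // it as uncompacted: at 10 bytes neither file is small (0%), at 477 bytes only the
      // 476-byte file is (476 / 955 ~ 50%), and at 1Mb both are (100%).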
      // Does not need compaction because the ratio of uncompacted to total data size = 0%, all files compacted.
      conf.set(HiveConf.ConfVars.HIVE_ICEBERG_MAJOR_COMPACTION_FILE_SIZE_THRESHOLD.varname, "10bytes");
      assertFalse(storageHandler.canCompact(conf, table, "last_name=Brown", CompactionType.MAJOR));

      // Needs compaction because the ratio of uncompacted to total data size = 50%, above the allowed threshold of 10%.
      conf.set(HiveConf.ConfVars.HIVE_ICEBERG_MAJOR_COMPACTION_FILE_SIZE_THRESHOLD.varname, "477bytes");
      assertTrue(storageHandler.canCompact(conf, table, "last_name=Brown", CompactionType.MAJOR));

      // Needs compaction because the ratio of uncompacted to total data size = 100%, all files uncompacted.
      conf.set(HiveConf.ConfVars.HIVE_ICEBERG_MAJOR_COMPACTION_FILE_SIZE_THRESHOLD.varname, "1Mb");
      assertTrue(storageHandler.canCompact(conf, table, "last_name=Brown", CompactionType.MAJOR));

      // Deleted record count is below the threshold and there is only one data file, cannot compact.
      assertFalse(storageHandler.canCompact(conf, table, "last_name=Green", CompactionType.MAJOR));

      // No delete files and only one data file, cannot compact.
      assertFalse(storageHandler.canCompact(conf, table, "last_name=Pink", CompactionType.MAJOR));

      // Needs compaction because the deleted record count (2) exceeds the threshold (1).
      shell.executeStatement(testTables.getInsertQuery(CUSTOMER_RECORDS_3,
          TableIdentifier.of("default", "customers"), false));
      shell.executeStatement("DELETE FROM customers WHERE customer_id=5");
      shell.executeStatement("DELETE FROM customers WHERE customer_id=6");
      table.refresh();
      assertTrue(storageHandler.canCompact(conf, table, "last_name=Blue", CompactionType.MAJOR));
    } catch (Exception e) {
      fail("Unexpected exception: " + e);
    }
  }

  @Test
  public void testCanCompactUnpartitioned() {
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).build();

    Table table = testTables.createTable(shell, "customers",
        HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, FileFormat.ORC, null, 2);

    HiveConf conf = new HiveConf();
    conf.setIntVar(HiveConf.ConfVars.ICEBERG_COMPACTION_DELETE_RECORDS_THRESHOLD, 1);
    HiveIcebergStorageHandler storageHandler = new HiveIcebergStorageHandler();
    storageHandler.setConf(conf);

    try {
      // Zero data and delete files, cannot compact.
      assertFalse(storageHandler.canCompact(conf, table, null, CompactionType.MAJOR));

      shell.executeStatement(testTables.getInsertQuery(CUSTOMER_RECORDS_4,
          TableIdentifier.of("default", "customers"), false));
      shell.executeStatement(testTables.getInsertQuery(CUSTOMER_RECORDS_5,
          TableIdentifier.of("default", "customers"), false));
      table.refresh();

      /*
       * 2 data files, 0 delete files.
       * Existing data size of the table: 955 bytes
       * 1st file size in bytes: 479
       * 2nd file size in bytes: 476
       */
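      // Same 479/476-byte layout as in the partitioned test, so the same threshold
      // arithmetic applies to the assertions below.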

      // Does not need compaction because the ratio of uncompacted to total data size = 0%, all files compacted.
      conf.set(HiveConf.ConfVars.HIVE_ICEBERG_MAJOR_COMPACTION_FILE_SIZE_THRESHOLD.varname, "10bytes");
      assertFalse(storageHandler.canCompact(conf, table, null, CompactionType.MAJOR));

      // Needs compaction because the ratio of uncompacted to total data size = 50%, above the allowed threshold of 10%.
      conf.set(HiveConf.ConfVars.HIVE_ICEBERG_MAJOR_COMPACTION_FILE_SIZE_THRESHOLD.varname, "477bytes");
      assertTrue(storageHandler.canCompact(conf, table, null, CompactionType.MAJOR));

      // Needs compaction because the ratio of uncompacted to total data size = 100%, all files uncompacted.
      conf.set(HiveConf.ConfVars.HIVE_ICEBERG_MAJOR_COMPACTION_FILE_SIZE_THRESHOLD.varname, "1Mb");
      assertTrue(storageHandler.canCompact(conf, table, null, CompactionType.MAJOR));

      shell.executeStatement(testTables.getInsertQuery(CUSTOMER_RECORDS_1,
          TableIdentifier.of("default", "customers"), false));
      shell.executeStatement(testTables.getInsertQuery(CUSTOMER_RECORDS_2,
          TableIdentifier.of("default", "customers"), false));
      shell.executeStatement("DELETE FROM customers WHERE customer_id=0");
      shell.executeStatement("DELETE FROM customers WHERE customer_id=1");
      table.refresh();

      // Needs compaction because the deleted record count (3) exceeds the threshold (1).
      assertTrue(storageHandler.canCompact(conf, table, null, CompactionType.MAJOR));
    } catch (Exception e) {
      fail("Unexpected exception: " + e);
    }
  }
}