We are building a data lake and are upgrading our Databricks runtime from 12.2 LTS to 14.3 LTS to get Python 3.10. We can write to our Iceberg tables, but reading those tables back is blocked by the following error, thrown when the Hadoop stream attempts a byte-buffer read:
Error: UnsupportedOperationException: Byte-buffer read unsupported by org.apache.hadoop.fs.BufferedFSInputStream
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3) (10.139.64.20 executor 1): java.lang.UnsupportedOperationException: Byte-buffer read unsupported by org.apache.hadoop.fs.BufferedFSInputStream
Runtime configuration:
Azure Databricks: 14.3 LTS runtime
Apache Iceberg: 1.5.2
Spark: 3.5.0
Scala: 2.12
hadoop-azure: 3.4.0
Has anyone run into this issue?
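Condensed, the failing sequence is essentially the following (a sketch: the storage container, account and SAS token are placeholders, and "data_vault_lake" is assumed to be the catalog name derived from our stage schema; the full notebook code follows below):

from pyspark import SparkConf
from pyspark.sql import SparkSession

# placeholder storage values (assumptions, for illustration only)
container, account, sas_token = "<container>", "<account>", "<sas-token>"
catalog = "data_vault_lake"

conf = (
    SparkConf()
    # on Databricks these jars must also be attached to the compute cluster
    .set(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-azure:3.4.0,"
        "com.microsoft.azure:azure-storage:8.6.6,"
        "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.2",
    )
    .set(
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    )
    .set(f"spark.sql.catalog.{catalog}", "org.apache.iceberg.spark.SparkCatalog")
    .set(f"spark.sql.catalog.{catalog}.type", "hadoop")
    .set(
        f"spark.sql.catalog.{catalog}.warehouse",
        f"wasbs://{container}@{account}.blob.core.windows.net/{catalog}",
    )
    .set(
        f"spark.hadoop.fs.azure.sas.{container}.{account}.blob.core.windows.net",
        sas_token,
    )
)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

df = spark.createDataFrame([("Raj", "10"), ("Mitch", "20")], ["Name", "Age"])
df.writeTo(f"{catalog}.test.writeTest1").using("iceberg").create()       # write succeeds
spark.read.format("iceberg").load(f"{catalog}.test.writeTest1").show()   # read fails here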
The Databricks notebook code below is a simplified test of our actual code:
#paragraph 1
%pip install azure-storage-blob pymssql pymysql snowflake-connector-python
#paragraph 2
from datetime import datetime
import pymysql
import pymssql
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.types import *
import json
import re
import pandas as pd
import gzip
import shutil
from azure.storage.blob import BlobClient, ContainerClient, BlobServiceClient
from enum import Enum
# metadata info - all of these values are set at run time
metadata_info = {
    "workflow_id": None,
    "workflow_name": "",
    "Terra_version": "",
    "stage_connection_name": "",
    "stage_schema": "",
    "host": "c",  # cs.metadata_host
    "database": "",  # cs.metadata_database
    "port": 3306,  # cs.metadata_port
    "user_name": "",  # cs.metadata_user_name
    "password": "",  # cs.metadata_passwd
}
# cluster configuration
def init_spark_config(
catalog_container: str,
catalog_account: str,
catalog_sas_token: str,
catalog_name: str = "spark_catalog",
):
conf = SparkConf()
# the following are needed for the Spark/Hadoop/Azure Blob configuration
# example setup: https://medium.com/@rvaid.29/reading-and-writing-data-to-azure-blob-storage-using-pyspark-cc8ce2fd3470
# in Databricks these have to be set on the compute cluster, and the appropriate jars need to be loaded onto the cluster via an init script or linked from a Maven repository
conf.set(
"spark.jars.packages",
"org.apache.hadoop:hadoop-azure:3.4.0,com.microsoft.azure:azure-storage:8.6.6,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.2",
)
conf.set(
"spark.sql.extensions",
"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
)
conf.set("spark.sql.catalogImplementation", "hive")
conf.set(
"spark.sql.catalog.spark_catalog",
"org.apache.iceberg.spark.SparkSessionCatalog",
)
conf.set(
f"spark.sql.catalog.{catalog_name}", "org.apache.iceberg.spark.SparkCatalog"
)
conf.set(f"spark.sql.catalog.{catalog_name}.type", "hadoop")
# seems these can be adjusted later and per session
conf.set(
f"spark.sql.catalog.{catalog_name}.warehouse",
f"wasbs://{catalog_container}@{catalog_account}.blob.core.windows.net/{catalog_name}",
)
conf.set(
f"spark.hadoop.fs.azure.sas.{catalog_container}.{catalog_account}.blob.core.windows.net",
catalog_sas_token,
)
conf.set(
f"fs.azure.sas.{catalog_container}.{catalog_account}.blob.core.windows.net",
catalog_sas_token,
)
return conf
def get_storage_info(metadata, purpose: str):
# query storage info
if purpose == 'stage':
query = f"""
select c.connection_type, c.connection_details, c.password, '{{"stage_schema":"{metadata['stage_schema']}"}}' as storage_details
from connection_details c
where c.connection_name = '{metadata['stage_connection_name']}'
"""
else:
storageDetailsField = 'w.source_details'
connectionIdField = 'connection_id'
if purpose == 'target':
storageDetailsField = 'w.target_details'
connectionIdField = 'target_connection_id'
query = f"""
select c.connection_type, c.connection_details, c.password, {storageDetailsField} as storage_details
from connection_details c
left join workflow_details w on json_value({storageDetailsField} , '$.{connectionIdField}') = c.connection_id
where w.workflow_id = {metadata['workflow_id']}
"""
conn = pymysql.connect(host = metadata['host']
,port=metadata['port']
,user = metadata['user_name']
,password = metadata['password']
,db = metadata['database']
,charset='utf8mb4'
,cursorclass=pymysql.cursors.DictCursor)
cursor = conn.cursor()
cursor.execute(query)
conn_details=json.dumps(cursor.fetchone())
conn.commit()
cursor.close()
conn.close()
return json.loads(conn_details)
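For context, the dict returned by get_storage_info for the stage/source connection looks roughly like this (the keys are taken from how it is consumed below; all values are illustrative placeholders, not real data):

# illustrative only -- real values come from the connection_details table
example_storage_info = {
    "connection_type": "azure_blob",  # placeholder
    "connection_details": json.dumps({
        "Storage_Account_Name": "<account>",
        "Container_Name": "<container>",
        "sas_token": "<sas-token>",
    }),
    "password": "<password>",
    "storage_details": json.dumps({"stage_schema": "<schema>"}),
}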
metadata = metadata_info
start_time = datetime.now()
source = get_storage_info(metadata_info, purpose="source")
stage = get_storage_info(metadata_info, purpose="stage")
target = get_storage_info(metadata_info, purpose="target")
sourceAccount = json.loads(source["connection_details"])[
"Storage_Account_Name"
]
sourceContainer = json.loads(source["connection_details"])[
"Container_Name"
]
sourceType = json.loads(source["storage_details"])["file_type"]
sourcePass = json.loads(source["connection_details"])["sas_token"]
stageAccount = json.loads(stage["connection_details"])[
"Storage_Account_Name"
]
stageContainer = json.loads(stage["connection_details"])[
"Container_Name"
]
stageSchema = json.loads(stage["storage_details"])["stage_schema"]
stagePass = json.loads(stage["connection_details"])["sas_token"]
targetType = json.loads(target["storage_details"])["target_type"]
targetSchema = json.loads(target["storage_details"])["target_schema"]
targetDatabase = json.loads(target["storage_details"])[
"target_database"
]
targetServer = json.loads(target["connection_details"])["host"]
targetPort = json.loads(target["connection_details"])["port"]
conf = init_spark_config(
catalog_container=stageContainer,
catalog_account=stageAccount,
catalog_sas_token=stagePass,
catalog_name=stageSchema,
)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# paragraph 3
data = [("Raj", "10"), ("Mitch", "20")]
# Define schema
schema = StructType([
StructField("Name", StringType(), True),
StructField("Age", StringType(), True)
])
# Create DataFrame
df = spark.createDataFrame(data, schema=schema)
# Show DataFrame
df.show()
#paragraph 4
catalog_name = 'data_vault_lake.test'
def setup_spark_catalog(
container_name: str, account_name: str, sas_token: str, catalog_name: str
):
spark.conf.set(
f"fs.azure.sas.{container_name}.{account_name}.blob.core.windows.net", sas_token
)
spark.conf.set(
f"spark.sql.catalog.{catalog_name}", "org.apache.iceberg.spark.SparkCatalog"
)
spark.conf.set(f"spark.sql.catalog.{catalog_name}.type", "hadoop")
spark.conf.set(
f"spark.sql.catalog.{catalog_name}.warehouse",
f"wasbs://{container_name}@{account_name}.blob.core.windows.net/{catalog_name}",
)
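For completeness, the helper above would be invoked before the write roughly like this (a sketch; the call is not shown in the simplified notebook, and the bare catalog name "data_vault_lake" is an assumption):

setup_spark_catalog(
    container_name=stageContainer,
    account_name=stageAccount,
    sas_token=stagePass,
    catalog_name="data_vault_lake",  # catalog portion of 'data_vault_lake.test' (assumption)
)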
#paragraph 5
tableName = 'writeTest1'
df.writeTo(f"{catalog_name}.{tableName}").using("iceberg").create()
#paragraph 6
dfOut = spark.read.format("iceberg").load("data_vault_lake.test.writeTest1")
display(dfOut)
Error from paragraph 6:
UnsupportedOperationException: Byte-buffer read unsupported by org.apache.hadoop.fs.BufferedFSInputStream
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 11.0 failed 4 times, most recent failure: Lost task 0.3 in stage 11.0 (TID 47) (10.139.64.20 executor 1): java.lang.UnsupportedOperationException: Byte-buffer read unsupported by org.apache.hadoop.fs.BufferedFSInputStream
at org.apache.hadoop.fs.FSDataInputStream.read(FSDataInputStream.java:160)
at com.databricks.spark.metrics.FSInputStreamWithMetrics.$anonfun$read$1(FileSystemWithMetrics.scala:77)
at com.databricks.spark.metrics.FSInputStreamWithMetrics.withTimeAndBytesReadMetric(FileSystemWithMetrics.scala:67)
at com.databricks.spark.metrics.FSInputStreamWithMetrics.read(FileSystemWithMetrics.scala:77)
at org.apache.hadoop.fs.FSDataInputStream.read(FSDataInputStream.java:156)
at org.apache.iceberg.shaded.org.apache.parquet.hadoop.util.H2SeekableInputStream$H2Reader.read(H2SeekableInputStream.java:82)
at org.apache.iceberg.shaded.org.apache.parquet.hadoop.util.H2SeekableInputStream.readFully(H2SeekableInputStream.java:91)
at org.apache.iceberg.shaded.org.apache.parquet.hadoop.util.H2SeekableInputStream.readFully(H2SeekableInputStream.java:76)
at org.apache.iceberg.shaded.org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:584)
at org.apache.iceberg.shaded.org.apache.parquet.hadoop.ParquetFileReader.<init>(ParquetFileReader.java:799)
at org.apache.iceberg.shaded.org.apache.parquet.hadoop.ParquetFileReader.open(ParquetFileReader.java:666)
at org.apache.iceberg.parquet.ReadConf.newReader(ReadConf.java:238)
at org.apache.iceberg.parquet.ReadConf.<init>(ReadConf.java:81)
at org.apache.iceberg.parquet.VectorizedParquetReader.init(VectorizedParquetReader.java:90)
at org.apache.iceberg.parquet.VectorizedParquetReader.iterator(VectorizedParquetReader.java:99)
at org.apache.iceberg.spark.source.BatchDataReader.open(BatchDataReader.java:109)
at org.apache.iceberg.spark.source.BatchDataReader.open(BatchDataReader.java:41)
at org.apache.iceberg.spark.source.BaseReader.next(BaseReader.java:143)
at org.apache.spark.sql.execution.datasources.v2.PartitionIterator.hasNext(DataSourceRDD.scala:122)
at org.apache.spark.sql.execution.datasources.v2.MetricsIterator.hasNext(DataSourceRDD.scala:160)
at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.$anonfun$hasNext$1(DataSourceRDD.scala:64)
at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.$anonfun$hasNext$1$adapted(DataSourceRDD.scala:64)
at scala.Option.exists(Option.scala:376)
at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.hasNext(DataSourceRDD.scala:64)
at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.advanceToNextIter(DataSourceRDD.scala:99)
at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.hasNext(DataSourceRDD.scala:64)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
at org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.$anonfun$encodeUnsafeRows$5(UnsafeRowBatchUtils.scala:88)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.$anonfun$encodeUnsafeRows$3(UnsafeRowBatchUtils.scala:88)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.$anonfun$encodeUnsafeRows$1(UnsafeRowBatchUtils.scala:68)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.encodeUnsafeRows(UnsafeRowBatchUtils.scala:62)
at org.apache.spark.sql.execution.collect.Collector.$anonfun$processFunc$2(Collector.scala:214)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$3(ResultTask.scala:82)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$1(ResultTask.scala:82)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:62)
at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:201)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:186)
at org.apache.spark.scheduler.Task.$anonfun$run$5(Task.scala:151)
at com.databricks.unity.EmptyHandle$.runWithAndClose(UCSHandle.scala:129)
at org.apache.spark.scheduler.Task.$anonfun$run$1(Task.scala:145)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$9(Executor.scala:958)
at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:105)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:961)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:853)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:3908)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:3830)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:3817)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:3817)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1695)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1680)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1680)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:4154)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:4066)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:4054)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:54)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$runJob$1(DAGScheduler.scala:1357)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:94)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1345)
at org.apache.spark.SparkContext.runJobInternal(SparkContext.scala:3000)
at org.apache.spark.sql.execution.collect.Collector.$anonfun$runSparkJobs$1(Collector.scala:355)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:94)
at org.apache.spark.sql.execution.collect.Collector.runSparkJobs(Collector.scala:299)
at org.apache.spark.sql.execution.collect.Collector.$anonfun$collect$1(Collector.scala:384)
at com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:94)
at org.apache.spark.sql.execution.collect.Collector.collect(Collector.scala:381)
at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:122)
at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:131)
at org.apache.spark.sql.execution.qrc.InternalRowFormat$.collect(cachedSparkResults.scala:94)
at org.apache.spark.sql.execution.qrc.InternalRowFormat$.collect(cachedSparkResults.scala:90)
at org.apache.spark.sql.execution.qrc.InternalRowFormat$.collect(cachedSparkResults.scala:78)
at org.apache.spark.sql.execution.qrc.ResultCacheManager.$anonfun$computeResult$1(ResultCacheManager.scala:546)
at com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:94)
at org.apache.spark.sql.execution.qrc.ResultCacheManager.collectResult$1(ResultCacheManager.scala:540)
at org.apache.spark.sql.execution.qrc.ResultCacheManager.computeResult(ResultCacheManager.scala:557)
at org.apache.spark.sql.execution.qrc.ResultCacheManager.$anonfun$getOrComputeResultInternal$1(ResultCacheManager.scala:400)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.execution.qrc.ResultCacheManager.getOrComputeResultInternal(ResultCacheManager.scala:400)
at org.apache.spark.sql.execution.qrc.ResultCacheManager.getOrComputeResult(ResultCacheManager.scala:318)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeCollectResult$1(SparkPlan.scala:558)
at com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:94)
at org.apache.spark.sql.execution.SparkPlan.executeCollectResult(SparkPlan.scala:555)
at org.apache.spark.sql.Dataset.collectResult(Dataset.scala:3780)
at org.apache.spark.sql.Dataset.$anonfun$collectResult$1(Dataset.scala:3771)
at org.apache.spark.sql.Dataset.$anonfun$withAction$3(Dataset.scala:4727)
at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:1103)
at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4725)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$9(SQLExecution.scala:392)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:700)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$1(SQLExecution.scala:277)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:1175)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId0(SQLExecution.scala:164)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:637)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4725)
at org.apache.spark.sql.Dataset.collectResult(Dataset.scala:3770)
at com.databricks.backend.daemon.driver.OutputAggregator$.withOutputAggregation0(OutputAggregator.scala:322)
at com.databricks.backend.daemon.driver.OutputAggregator$.withOutputAggregation(OutputAggregator.scala:100)
at com.databricks.backend.daemon.driver.PythonDriverLocalBase.generateTableResult(PythonDriverLocalBase.scala:848)
at com.databricks.backend.daemon.driver.JupyterDriverLocal.computeListResultsItem(JupyterDriverLocal.scala:1491)
at com.databricks.backend.daemon.driver.JupyterDriverLocal$JupyterEntryPoint.addCustomDisplayData(JupyterDriverLocal.scala:286)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:397)
at py4j.Gateway.invoke(Gateway.java:306)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:199)
at py4j.ClientServerConnection.run(ClientServerConnection.java:119)
at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.UnsupportedOperationException: Byte-buffer read unsupported by org.apache.hadoop.fs.BufferedFSInputStream
at org.apache.hadoop.fs.FSDataInputStream.read(FSDataInputStream.java:160)
at com.databricks.spark.metrics.FSInputStreamWithMetrics.$anonfun$read$1(FileSystemWithMetrics.scala:77)
at com.databricks.spark.metrics.FSInputStreamWithMetrics.withTimeAndBytesReadMetric(FileSystemWithMetrics.scala:67)
at com.databricks.spark.metrics.FSInputStreamWithMetrics.read(FileSystemWithMetrics.scala:77)
at org.apache.hadoop.fs.FSDataInputStream.read(FSDataInputStream.java:156)
at org.apache.iceberg.shaded.org.apache.parquet.hadoop.util.H2SeekableInputStream$H2Reader.read(H2SeekableInputStream.java:82)
at org.apache.iceberg.shaded.org.apache.parquet.hadoop.util.H2SeekableInputStream.readFully(H2SeekableInputStream.java:91)
at org.apache.iceberg.shaded.org.apache.parquet.hadoop.util.H2SeekableInputStream.readFully(H2SeekableInputStream.java:76)
at org.apache.iceberg.shaded.org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:584)
at org.apache.iceberg.shaded.org.apache.parquet.hadoop.ParquetFileReader.<init>(ParquetFileReader.java:799)
at org.apache.iceberg.shaded.org.apache.parquet.hadoop.ParquetFileReader.open(ParquetFileReader.java:666)
at org.apache.iceberg.parquet.ReadConf.newReader(ReadConf.java:238)
at org.apache.iceberg.parquet.ReadConf.<init>(ReadConf.java:81)
at org.apache.iceberg.parquet.VectorizedParquetReader.init(VectorizedParquetReader.java:90)
at org.apache.iceberg.parquet.VectorizedParquetReader.iterator(VectorizedParquetReader.java:99)
at org.apache.iceberg.spark.source.BatchDataReader.open(BatchDataReader.java:109)
at org.apache.iceberg.spark.source.BatchDataReader.open(BatchDataReader.java:41)
at org.apache.iceberg.spark.source.BaseReader.next(BaseReader.java:143)
at org.apache.spark.sql.execution.datasources.v2.PartitionIterator.hasNext(DataSourceRDD.scala:122)
at org.apache.spark.sql.execution.datasources.v2.MetricsIterator.hasNext(DataSourceRDD.scala:160)
at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.$anonfun$hasNext$1(DataSourceRDD.scala:64)
at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.$anonfun$hasNext$1$adapted(DataSourceRDD.scala:64)
at scala.Option.exists(Option.scala:376)
at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.hasNext(DataSourceRDD.scala:64)
at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.advanceToNextIter(DataSourceRDD.scala:99)
at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.hasNext(DataSourceRDD.scala:64)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
at org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.$anonfun$encodeUnsafeRows$5(UnsafeRowBatchUtils.scala:88)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.$anonfun$encodeUnsafeRows$3(UnsafeRowBatchUtils.scala:88)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.$anonfun$encodeUnsafeRows$1(UnsafeRowBatchUtils.scala:68)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.encodeUnsafeRows(UnsafeRowBatchUtils.scala:62)
at org.apache.spark.sql.execution.collect.Collector.$anonfun$processFunc$2(Collector.scala:214)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$3(ResultTask.scala:82)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$1(ResultTask.scala:82)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:62)
at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:201)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:186)
at org.apache.spark.scheduler.Task.$anonfun$run$5(Task.scala:151)
at com.databricks.unity.EmptyHandle$.runWithAndClose(UCSHandle.scala:129)
at org.apache.spark.scheduler.Task.$anonfun$run$1(Task.scala:145)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$9(Executor.scala:958)
at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:105)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:961)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:853)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more