155 lines
4.3 KiB
Plaintext
155 lines
4.3 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "3307b886",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
"\n",
"import pyspark\n",
"from pyspark.sql import SparkSession\n",
"from pyspark.conf import SparkConf\n",
"from pyspark.context import SparkContext"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "9f0ddbff",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Location of the service-account key used for GCS access.\n",
"# Taken from GOOGLE_APPLICATION_CREDENTIALS when that variable is set, so the\n",
"# notebook can run on other machines; otherwise the original local path is used.\n",
"credentials_location = os.environ.get(\n",
"    'GOOGLE_APPLICATION_CREDENTIALS',\n",
"    '/home/alexey/.google/credentials/google_credentials.json',\n",
")\n",
"\n",
"# Local-mode Spark configuration: adds the GCS connector jar and turns on\n",
"# service-account authentication with the key file chosen above.\n",
"conf = SparkConf() \\\n",
"    .setMaster('local[*]') \\\n",
"    .setAppName('test') \\\n",
"    .set(\"spark.jars\", \"./lib/gcs-connector-hadoop3-2.2.5.jar\") \\\n",
"    .set(\"spark.hadoop.google.cloud.auth.service.account.enable\", \"true\") \\\n",
"    .set(\"spark.hadoop.google.cloud.auth.service.account.json.keyfile\", credentials_location)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "b83404e8",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"WARNING: An illegal reflective access operation has occurred\n",
|
|
"WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/alexey/spark/spark-3.0.3-bin-hadoop3.2/jars/spark-unsafe_2.12-3.0.3.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
|
|
"WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
|
|
"WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
|
|
"WARNING: All illegal access operations will be denied in a future release\n",
|
|
"22/03/30 12:25:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
|
|
"Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
|
|
"Setting default log level to \"WARN\".\n",
|
|
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# getOrCreate keeps this cell re-runnable: constructing a second SparkContext in\n",
"# the same kernel raises an error, whereas getOrCreate returns the running one.\n",
"# NOTE: when a context already exists, `conf` is not re-applied to it.\n",
"sc = SparkContext.getOrCreate(conf=conf)\n",
"\n",
"# Hadoop configuration object of the underlying Java context\n",
"# (reached through the private `_jsc` attribute).\n",
"hadoop_conf = sc._jsc.hadoopConfiguration()\n",
"\n",
"# Map the gs:// scheme to the GCS connector classes and enable\n",
"# service-account authentication with the same key file.\n",
"hadoop_conf.set(\"fs.AbstractFileSystem.gs.impl\", \"com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS\")\n",
"hadoop_conf.set(\"fs.gs.impl\", \"com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem\")\n",
"hadoop_conf.set(\"fs.gs.auth.service.account.json.keyfile\", credentials_location)\n",
"hadoop_conf.set(\"fs.gs.auth.service.account.enable\", \"true\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "c4713e2b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Build (or fetch) the SparkSession using the configuration of the existing context.\n",
"spark = (\n",
"    SparkSession.builder\n",
"    .config(conf=sc.getConf())\n",
"    .getOrCreate()\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "1ee1eb1d",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" \r"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Read all parquet files matching the two-level glob under pq/green in the GCS bucket.\n",
"df_green = spark.read.parquet(\n",
"    'gs://dtc_data_lake_de-zoomcamp-nytaxi/pq/green/*/*'\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "104b40ab",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" \r"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"2304517"
|
|
]
|
|
},
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Row count of the loaded DataFrame, displayed as the cell result.\n",
"df_green.count()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "f56a885d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.7"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|