268 lines
7.7 KiB
Plaintext
268 lines
7.7 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "72505747",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pyspark"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "bd55afbe",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'/home/alexey/spark/spark-3.0.3-bin-hadoop3.2/python/pyspark/__init__.py'"
|
||
]
|
||
},
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pyspark.__file__"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "29f1cf4c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from pyspark.sql import SparkSession"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "cf6d80ad",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"WARNING: An illegal reflective access operation has occurred\n",
|
||
"WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/alexey/spark/spark-3.0.3-bin-hadoop3.2/jars/spark-unsafe_2.12-3.0.3.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
|
||
"WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
|
||
"WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
|
||
"WARNING: All illegal access operations will be denied in a future release\n",
|
||
"22/02/15 22:22:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
|
||
"Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
|
||
"Setting default log level to \"WARN\".\n",
|
||
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"spark = SparkSession.builder \\\n",
|
||
" .master(\"local[*]\") \\\n",
|
||
" .appName('test') \\\n",
|
||
" .getOrCreate()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "3f604529",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"--2022-02-15 22:23:22-- https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv\n",
|
||
"Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.196.8\n",
|
||
"Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.196.8|:443... connected.\n",
|
||
"HTTP request sent, awaiting response... 200 OK\n",
|
||
"Length: 12322 (12K) [application/octet-stream]\n",
|
||
"Saving to: ‘taxi+_zone_lookup.csv’\n",
|
||
"\n",
|
||
"taxi+_zone_lookup.c 100%[===================>] 12.03K --.-KB/s in 0s \n",
|
||
"\n",
|
||
"2022-02-15 22:23:23 (114 MB/s) - ‘taxi+_zone_lookup.csv’ saved [12322/12322]\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"!wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "12342345",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\"LocationID\",\"Borough\",\"Zone\",\"service_zone\"\r",
|
||
"\r\n",
|
||
"1,\"EWR\",\"Newark Airport\",\"EWR\"\r",
|
||
"\r\n",
|
||
"2,\"Queens\",\"Jamaica Bay\",\"Boro Zone\"\r",
|
||
"\r\n",
|
||
"3,\"Bronx\",\"Allerton/Pelham Gardens\",\"Boro Zone\"\r",
|
||
"\r\n",
|
||
"4,\"Manhattan\",\"Alphabet City\",\"Yellow Zone\"\r",
|
||
"\r\n",
|
||
"5,\"Staten Island\",\"Arden Heights\",\"Boro Zone\"\r",
|
||
"\r\n",
|
||
"6,\"Staten Island\",\"Arrochar/Fort Wadsworth\",\"Boro Zone\"\r",
|
||
"\r\n",
|
||
"7,\"Queens\",\"Astoria\",\"Boro Zone\"\r",
|
||
"\r\n",
|
||
"8,\"Queens\",\"Astoria Park\",\"Boro Zone\"\r",
|
||
"\r\n",
|
||
"9,\"Queens\",\"Auburndale\",\"Boro Zone\"\r",
|
||
"\r\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"!head taxi+_zone_lookup.csv"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "809464d0",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df = spark.read \\\n",
|
||
" .option(\"header\", \"true\") \\\n",
|
||
" .csv('taxi+_zone_lookup.csv')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "e36dd996",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"+----------+-------------+--------------------+------------+\n",
|
||
"|LocationID| Borough| Zone|service_zone|\n",
|
||
"+----------+-------------+--------------------+------------+\n",
|
||
"| 1| EWR| Newark Airport| EWR|\n",
|
||
"| 2| Queens| Jamaica Bay| Boro Zone|\n",
|
||
"| 3| Bronx|Allerton/Pelham G...| Boro Zone|\n",
|
||
"| 4| Manhattan| Alphabet City| Yellow Zone|\n",
|
||
"| 5|Staten Island| Arden Heights| Boro Zone|\n",
|
||
"| 6|Staten Island|Arrochar/Fort Wad...| Boro Zone|\n",
|
||
"| 7| Queens| Astoria| Boro Zone|\n",
|
||
"| 8| Queens| Astoria Park| Boro Zone|\n",
|
||
"| 9| Queens| Auburndale| Boro Zone|\n",
|
||
"| 10| Queens| Baisley Park| Boro Zone|\n",
|
||
"| 11| Brooklyn| Bath Beach| Boro Zone|\n",
|
||
"| 12| Manhattan| Battery Park| Yellow Zone|\n",
|
||
"| 13| Manhattan| Battery Park City| Yellow Zone|\n",
|
||
"| 14| Brooklyn| Bay Ridge| Boro Zone|\n",
|
||
"| 15| Queens|Bay Terrace/Fort ...| Boro Zone|\n",
|
||
"| 16| Queens| Bayside| Boro Zone|\n",
|
||
"| 17| Brooklyn| Bedford| Boro Zone|\n",
|
||
"| 18| Bronx| Bedford Park| Boro Zone|\n",
|
||
"| 19| Queens| Bellerose| Boro Zone|\n",
|
||
"| 20| Bronx| Belmont| Boro Zone|\n",
|
||
"+----------+-------------+--------------------+------------+\n",
|
||
"only showing top 20 rows\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "cb547351",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\r",
|
||
"[Stage 4:> (0 + 1) / 1]\r",
|
||
"\r",
|
||
" \r"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"df.write.parquet('zones')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "02fe2bdb",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"total 28K\r\n",
|
||
"-rw-rw-r-- 1 alexey alexey 6.8K Feb 15 22:25 Untitled.ipynb\r\n",
|
||
"-rw-rw-r-- 1 alexey alexey 13K Aug 17 2016 taxi+_zone_lookup.csv\r\n",
|
||
"drwxr-xr-x 2 alexey alexey 4.0K Feb 15 22:25 zones\r\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"!ls -lh"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "659f0812",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.9.7"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|