Files
Alexey Grigorev 3eadd01037 code for week 5
2022-02-16 22:21:35 +00:00

268 lines
7.7 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "72505747",
"metadata": {},
"outputs": [],
"source": [
"import pyspark"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "bd55afbe",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/home/alexey/spark/spark-3.0.3-bin-hadoop3.2/python/pyspark/__init__.py'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pyspark.__file__"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "29f1cf4c",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import SparkSession"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "cf6d80ad",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING: An illegal reflective access operation has occurred\n",
"WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/alexey/spark/spark-3.0.3-bin-hadoop3.2/jars/spark-unsafe_2.12-3.0.3.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
"WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
"WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
"WARNING: All illegal access operations will be denied in a future release\n",
"22/02/15 22:22:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
"Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
"Setting default log level to \"WARN\".\n",
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
]
}
],
"source": [
"# Create (or reuse, via getOrCreate) the SparkSession for this notebook,\n",
"# pointed at the local[*] master and registered under the app name 'test'.\n",
"spark = (\n",
"    SparkSession.builder\n",
"    .master(\"local[*]\")\n",
"    .appName('test')\n",
"    .getOrCreate()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3f604529",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2022-02-15 22:23:22-- https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv\n",
"Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.196.8\n",
"Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.196.8|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 12322 (12K) [application/octet-stream]\n",
"Saving to: taxi+_zone_lookup.csv\n",
"\n",
"taxi+_zone_lookup.c 100%[===================>] 12.03K --.-KB/s in 0s \n",
"\n",
"2022-02-15 22:23:23 (114 MB/s) - taxi+_zone_lookup.csv saved [12322/12322]\n",
"\n"
]
}
],
"source": [
"# Download the taxi zone lookup CSV into the working directory.\n",
"# -nc (no-clobber) skips the download when the file is already present, so\n",
"# re-running this cell does not leave a duplicate taxi+_zone_lookup.csv.1 behind.\n",
"!wget -nc https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "12342345",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\"LocationID\",\"Borough\",\"Zone\",\"service_zone\"\r",
"\r\n",
"1,\"EWR\",\"Newark Airport\",\"EWR\"\r",
"\r\n",
"2,\"Queens\",\"Jamaica Bay\",\"Boro Zone\"\r",
"\r\n",
"3,\"Bronx\",\"Allerton/Pelham Gardens\",\"Boro Zone\"\r",
"\r\n",
"4,\"Manhattan\",\"Alphabet City\",\"Yellow Zone\"\r",
"\r\n",
"5,\"Staten Island\",\"Arden Heights\",\"Boro Zone\"\r",
"\r\n",
"6,\"Staten Island\",\"Arrochar/Fort Wadsworth\",\"Boro Zone\"\r",
"\r\n",
"7,\"Queens\",\"Astoria\",\"Boro Zone\"\r",
"\r\n",
"8,\"Queens\",\"Astoria Park\",\"Boro Zone\"\r",
"\r\n",
"9,\"Queens\",\"Auburndale\",\"Boro Zone\"\r",
"\r\n"
]
}
],
"source": [
"!head taxi+_zone_lookup.csv"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "809464d0",
"metadata": {},
"outputs": [],
"source": [
"# Load the zone lookup CSV, using its first line as the column header.\n",
"# NOTE(review): no schema or inferSchema is given, so columns such as LocationID\n",
"# presumably load as strings - confirm before relying on numeric types.\n",
"df = (\n",
"    spark.read\n",
"    .option(\"header\", \"true\")\n",
"    .csv('taxi+_zone_lookup.csv')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "e36dd996",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------+-------------+--------------------+------------+\n",
"|LocationID|      Borough|                Zone|service_zone|\n",
"+----------+-------------+--------------------+------------+\n",
"|         1|          EWR|      Newark Airport|         EWR|\n",
"|         2|       Queens|         Jamaica Bay|   Boro Zone|\n",
"|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|\n",
"|         4|    Manhattan|       Alphabet City| Yellow Zone|\n",
"|         5|Staten Island|       Arden Heights|   Boro Zone|\n",
"|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|\n",
"|         7|       Queens|             Astoria|   Boro Zone|\n",
"|         8|       Queens|        Astoria Park|   Boro Zone|\n",
"|         9|       Queens|          Auburndale|   Boro Zone|\n",
"|        10|       Queens|        Baisley Park|   Boro Zone|\n",
"|        11|     Brooklyn|          Bath Beach|   Boro Zone|\n",
"|        12|    Manhattan|        Battery Park| Yellow Zone|\n",
"|        13|    Manhattan|   Battery Park City| Yellow Zone|\n",
"|        14|     Brooklyn|           Bay Ridge|   Boro Zone|\n",
"|        15|       Queens|Bay Terrace/Fort ...|   Boro Zone|\n",
"|        16|       Queens|             Bayside|   Boro Zone|\n",
"|        17|     Brooklyn|             Bedford|   Boro Zone|\n",
"|        18|        Bronx|        Bedford Park|   Boro Zone|\n",
"|        19|       Queens|           Bellerose|   Boro Zone|\n",
"|        20|        Bronx|             Belmont|   Boro Zone|\n",
"+----------+-------------+--------------------+------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"df.show()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "cb547351",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
"[Stage 4:> (0 + 1) / 1]\r",
"\r",
" \r"
]
}
],
"source": [
"# Persist the zone lookup as Parquet under the local 'zones' directory.\n",
"# mode='overwrite' replaces any existing output; with the default save mode the\n",
"# write raises an error once 'zones' exists, which breaks re-running the notebook.\n",
"df.write.parquet('zones', mode='overwrite')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "02fe2bdb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total 28K\r\n",
"-rw-rw-r-- 1 alexey alexey 6.8K Feb 15 22:25 Untitled.ipynb\r\n",
"-rw-rw-r-- 1 alexey alexey 13K Aug 17 2016 taxi+_zone_lookup.csv\r\n",
"drwxr-xr-x 2 alexey alexey 4.0K Feb 15 22:25 zones\r\n"
]
}
],
"source": [
"!ls -lh"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "659f0812",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}