{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0a1b2c3d",
   "metadata": {},
   "source": [
    "# Taxi zone lookup: CSV to Parquet with PySpark\n",
    "\n",
    "Downloads `taxi+_zone_lookup.csv`, loads it into a Spark DataFrame and writes it to the `zones` directory in Parquet format.\n",
    "\n",
    "The notebook is meant to survive *Restart Kernel -> Run All*: the download is skipped when the file is already present, and the Parquet output is overwritten instead of failing on an existing directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "72505747",
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports live in this single cell so the notebook's dependencies are visible up front.\n",
    "import pyspark\n",
    "from pyspark.sql import SparkSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "bd55afbe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/home/alexey/spark/spark-3.0.3-bin-hadoop3.2/python/pyspark/__init__.py'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: shows which pyspark installation the kernel imported.\n",
    "pyspark.__file__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "cf6d80ad",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING: An illegal reflective access operation has occurred\n",
      "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/alexey/spark/spark-3.0.3-bin-hadoop3.2/jars/spark-unsafe_2.12-3.0.3.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
      "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
      "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
      "WARNING: All illegal access operations will be denied in a future release\n",
      "22/02/15 22:22:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
      "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
     ]
    }
   ],
   "source": [
    "# master(\"local[*]\") runs Spark locally with as many worker threads as logical cores.\n",
    "# getOrCreate() returns the already-running session if there is one, so re-running this cell is harmless.\n",
    "spark = (\n",
    "    SparkSession.builder\n",
    "    .master(\"local[*]\")\n",
    "    .appName('test')\n",
    "    .getOrCreate()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1b2c3d4e",
   "metadata": {},
   "source": [
    "## Download the zone lookup CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3f604529",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2022-02-15 22:23:22-- https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv\n",
      "Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.196.8\n",
      "Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.196.8|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 12322 (12K) [application/octet-stream]\n",
      "Saving to: ‘taxi+_zone_lookup.csv’\n",
      "\n",
      "taxi+_zone_lookup.c 100%[===================>] 12.03K --.-KB/s in 0s \n",
      "\n",
      "2022-02-15 22:23:23 (114 MB/s) - ‘taxi+_zone_lookup.csv’ saved [12322/12322]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# -nc (--no-clobber): skip the download when the file already exists, so a re-run\n",
    "# does not leave numbered duplicates such as taxi+_zone_lookup.csv.1 behind.\n",
    "!wget -nc https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "12342345",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\"LocationID\",\"Borough\",\"Zone\",\"service_zone\"\r",
      "\r\n",
      "1,\"EWR\",\"Newark Airport\",\"EWR\"\r",
      "\r\n",
      "2,\"Queens\",\"Jamaica Bay\",\"Boro Zone\"\r",
      "\r\n",
      "3,\"Bronx\",\"Allerton/Pelham Gardens\",\"Boro Zone\"\r",
      "\r\n",
      "4,\"Manhattan\",\"Alphabet City\",\"Yellow Zone\"\r",
      "\r\n",
      "5,\"Staten Island\",\"Arden Heights\",\"Boro Zone\"\r",
      "\r\n",
      "6,\"Staten Island\",\"Arrochar/Fort Wadsworth\",\"Boro Zone\"\r",
      "\r\n",
      "7,\"Queens\",\"Astoria\",\"Boro Zone\"\r",
      "\r\n",
      "8,\"Queens\",\"Astoria Park\",\"Boro Zone\"\r",
      "\r\n",
      "9,\"Queens\",\"Auburndale\",\"Boro Zone\"\r",
      "\r\n"
     ]
    }
   ],
   "source": [
    "# Peek at the raw file: the first line is a header row, which is why the reader below sets header=true.\n",
    "!head taxi+_zone_lookup.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2c3d4e5f",
   "metadata": {},
   "source": [
    "## Load the CSV into a DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "809464d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# header=true takes the column names from the first line of the file.\n",
    "# No schema is supplied and inferSchema is left at its default, so every column is read as a string.\n",
    "df = (\n",
    "    spark.read\n",
    "    .option(\"header\", \"true\")\n",
    "    .csv('taxi+_zone_lookup.csv')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "e36dd996",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------+-------------+--------------------+------------+\n",
      "|LocationID|      Borough|                Zone|service_zone|\n",
      "+----------+-------------+--------------------+------------+\n",
      "|         1|          EWR|      Newark Airport|         EWR|\n",
      "|         2|       Queens|         Jamaica Bay|   Boro Zone|\n",
      "|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|\n",
      "|         4|    Manhattan|       Alphabet City| Yellow Zone|\n",
      "|         5|Staten Island|       Arden Heights|   Boro Zone|\n",
      "|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|\n",
      "|         7|       Queens|             Astoria|   Boro Zone|\n",
      "|         8|       Queens|        Astoria Park|   Boro Zone|\n",
      "|         9|       Queens|          Auburndale|   Boro Zone|\n",
      "|        10|       Queens|        Baisley Park|   Boro Zone|\n",
      "|        11|     Brooklyn|          Bath Beach|   Boro Zone|\n",
      "|        12|    Manhattan|        Battery Park| Yellow Zone|\n",
      "|        13|    Manhattan|   Battery Park City| Yellow Zone|\n",
      "|        14|     Brooklyn|           Bay Ridge|   Boro Zone|\n",
      "|        15|       Queens|Bay Terrace/Fort ...|   Boro Zone|\n",
      "|        16|       Queens|             Bayside|   Boro Zone|\n",
      "|        17|     Brooklyn|             Bedford|   Boro Zone|\n",
      "|        18|        Bronx|        Bedford Park|   Boro Zone|\n",
      "|        19|       Queens|           Bellerose|   Boro Zone|\n",
      "|        20|        Bronx|             Belmont|   Boro Zone|\n",
      "+----------+-------------+--------------------+------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# show() prints only the first 20 rows by default and truncates long cell values.\n",
    "df.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3d4e5f6a",
   "metadata": {},
   "source": [
    "## Write the DataFrame as Parquet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "cb547351",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "[Stage 4:> (0 + 1) / 1]\r",
      "\r",
      " \r"
     ]
    }
   ],
   "source": [
    "# Spark's default save mode raises an error when the target path already exists,\n",
    "# which would break every re-run of this cell. mode('overwrite') replaces the\n",
    "# existing 'zones' directory instead.\n",
    "df.write.mode('overwrite').parquet('zones')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "02fe2bdb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 28K\r\n",
      "-rw-rw-r-- 1 alexey alexey 6.8K Feb 15 22:25 Untitled.ipynb\r\n",
      "-rw-rw-r-- 1 alexey alexey 13K Aug 17 2016 taxi+_zone_lookup.csv\r\n",
      "drwxr-xr-x 2 alexey alexey 4.0K Feb 15 22:25 zones\r\n"
     ]
    }
   ],
   "source": [
    "# Confirm that the 'zones' output directory now sits next to the downloaded CSV.\n",
    "!ls -lh"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}