!pip install pyspark==3.3.1 py4j==0.10.9.5

Collecting pyspark==3.3.1
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 281.4/281.4 MB 4.3 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 199.7/199.7 kB 10.0 MB/s eta 0:00:00
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845494 sha256=c19df499e9516fd46ec606aaf56c06a396aada6a7de75a905ee68ba89b2586c1
  Stored in directory: /root/.cache/pip/wheels/0f/f0/3d/517368b8ce80486e84f89f214e0a022554e4ee64969f46279b
Successfully built pyspark
Installing collected packages: py4j, pyspark
  Attempting uninstall: py4j
    Found existing installation: py4j 0.10.9.7
    Uninstalling py4j-0.10.9.7:
      Successfully uninstalled py4j-0.10.9.7
Successfully installed py4j-0.10.9.5 pyspark-3.3.1



from pyspark.sql import SparkSession
from pyspark import SparkConf

# SparkConf holds the key/value settings that configure this Spark application.
conf = SparkConf().setAll([
    ("spark.app.name", "PySpark DataFrame #3"),
    ("spark.master", "local[*]"),
])

# Build the session from that configuration.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)



!wget https://s3-geospatial.s3.us-west-2.amazonaws.com/transfer_cost.txt

--2024-01-23 04:53:37--  https://s3-geospatial.s3.us-west-2.amazonaws.com/transfer_cost.txt
Resolving s3-geospatial.s3.us-west-2.amazonaws.com (s3-geospatial.s3.us-west-2.amazonaws.com)... 52.92.233.10, 52.92.177.2, 52.92.138.18, ...
Connecting to s3-geospatial.s3.us-west-2.amazonaws.com (s3-geospatial.s3.us-west-2.amazonaws.com)|52.92.233.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 286779 (280K) [text/plain]
Saving to: ‘transfer_cost.txt’

transfer_cost.txt   100%[===================>] 280.06K  1.71MB/s    in 0.2s    

2024-01-23 04:53:37 (1.71 MB/s) - ‘transfer_cost.txt’ saved [286779/286779]



!ls -tl

total 288
drwxr-xr-x 1 root root   4096 Jan 19 14:20 sample_data
-rw-r--r-- 1 root root 286779 Apr 24  2022 transfer_cost.txt



!head -5 transfer_cost.txt

On 2021-01-04 the cost per ton from 85001 to 85002 is $28.32 at ABC Hauling
On 2021-01-04 the cost per ton from 85001 to 85004 is $25.68 at ABC Hauling
On 2021-01-04 the cost per ton from 85001 to 85007 is 19.86 at ABC Hauling
On 2021-01-04 the cost per ton from 85001 to 85007 is 20.52 at Haul Today
On 2021-01-04 the cost per ton from 85001 to 85010 is 20.72 at Haul Today



import pyspark.sql.functions as F
from pyspark.sql.types import *

# Single-column schema: one nullable string field named "text" that holds each raw line.
schema = StructType().add("text", StringType(), True)

# Load "transfer_cost.txt" through the text data source with that schema applied.
transfer_cost_df = spark.read.format("text").schema(schema).load("transfer_cost.txt")



# Display the first rows of transfer_cost_df at full width (truncate=False keeps long lines intact).
transfer_cost_df.show(truncate=False)

+---------------------------------------------------------------------------+
|text                                                                       |
+---------------------------------------------------------------------------+
|On 2021-01-04 the cost per ton from 85001 to 85002 is $28.32 at ABC Hauling|
|On 2021-01-04 the cost per ton from 85001 to 85004 is $25.68 at ABC Hauling|
|On 2021-01-04 the cost per ton from 85001 to 85007 is 19.86 at ABC Hauling |
|On 2021-01-04 the cost per ton from 85001 to 85007 is 20.52 at Haul Today  |
|On 2021-01-04 the cost per ton from 85001 to 85010 is 20.72 at Haul Today  |
|On 2021-01-04 the cost per ton from 85001 to 85012 is $18.98 at ABC Hauling|
|On 2021-01-04 the cost per ton from 85001 to 85013 is 26.64 at Haul Today  |
|On 2021-01-04 the cost per ton from 85001 to 85020 is 26.34 at ABC Hauling |
|On 2021-01-04 the cost per ton from 85001 to 85021 is $20.15 at ABC Hauling|
|On 2021-01-04 the cost per ton from 85002 to 85001 is 21.57 at Haul Today  |
|On 2021-01-04 the cost per ton from 85002 to 85004 is 21.40 at Haul Today  |
|On 2021-01-04 the cost per ton from 85002 to 85007 is 25.93 at Haul Today  |
|On 2021-01-04 the cost per ton from 85002 to 85010 is 19.80 at Haul Today  |
|On 2021-01-04 the cost per ton from 85002 to 85012 is 21.66 at Haul Today  |
|On 2021-01-04 the cost per ton from 85002 to 85013 is $25.90 at Haul Today |
|On 2021-01-04 the cost per ton from 85002 to 85020 is 19.15 at ABC Hauling |
|On 2021-01-04 the cost per ton from 85002 to 85021 is $27.13 at Haul Today |
|On 2021-01-04 the cost per ton from 85004 to 85001 is 23.88 at Haul Today  |
|On 2021-01-04 the cost per ton from 85004 to 85002 is 26.40 at ABC Hauling |
|On 2021-01-04 the cost per ton from 85004 to 85007 is 26.03 at ABC Hauling |
+---------------------------------------------------------------------------+
only showing top 20 rows



# PySpark의 함수와 모듈을 불러옵니다
from pyspark.sql.functions import *

# Regular expression describing one input line; each parenthesised group is one field.
regex_str = r'On (\S+) the cost per ton from (\d+) to (\d+) is (\S+) at (.*)'

# Keep the raw 'text' column and append one column per capture group:
#   group 1 -> 'week'               (the date after "On")
#   group 2 -> 'departure_zipcode'  (the digits after "from")
#   group 3 -> 'arrival_zipcode'    (the digits after "to")
#   group 4 -> 'cost'               (the token after "is")
#   group 5 -> 'vendor'             (everything after "at")
df_with_new_columns = transfer_cost_df.select(
    'text',
    *(
        regexp_extract('text', regex_str, group_no).alias(field_name)
        for group_no, field_name in enumerate(
            ('week', 'departure_zipcode', 'arrival_zipcode', 'cost', 'vendor'),
            start=1,
        )
    ),
)



# Print the schema to confirm the five extracted string columns now sit next to "text".
df_with_new_columns.printSchema()

root
 |-- text: string (nullable = true)
 |-- week: string (nullable = true)
 |-- departure_zipcode: string (nullable = true)
 |-- arrival_zipcode: string (nullable = true)
 |-- cost: string (nullable = true)
 |-- vendor: string (nullable = true)



# New DataFrame with the raw "text" column removed, leaving only the extracted fields
# (week, departure_zipcode, arrival_zipcode, cost, vendor).
final_df = df_with_new_columns.drop("text")



# Write the extracted fields as CSV. Spark creates a directory named "extracted.csv"
# that holds the part file(s). Overwrite mode keeps this cell re-runnable: with the
# default save mode the write raises AnalysisException once the path already exists.
final_df.write.mode("overwrite").csv("extracted.csv")



!ls -tl

total 292
drwxr-xr-x 2 root root   4096 Jan 23 04:55 extracted.csv
drwxr-xr-x 1 root root   4096 Jan 19 14:20 sample_data
-rw-r--r-- 1 root root 286779 Apr 24  2022 transfer_cost.txt



!ls -tl extracted.csv/

total 156
-rw-r--r-- 1 root root      0 Jan 23 04:55 _SUCCESS
-rw-r--r-- 1 root root 156423 Jan 23 04:55 part-00000-c19163dc-d8a7-488f-ac25-58c2b14105f9-c000.csv



!head -5 extracted.csv/part-00000-c19163dc-d8a7-488f-ac25-58c2b14105f9-c000.csv

2021-01-04,85001,85002,$28.32,ABC Hauling
2021-01-04,85001,85004,$25.68,ABC Hauling
2021-01-04,85001,85007,19.86,ABC Hauling
2021-01-04,85001,85007,20.52,Haul Today
2021-01-04,85001,85010,20.72,Haul Today



# format() selects the output data source (JSON here) and save() writes to the given path.
# Overwrite mode replaces an existing "extracted.json" directory instead of failing with
# "AnalysisException: path ... already exists" when the cell is run again.
final_df.write.format("json").mode("overwrite").save("extracted.json")

---------------------------------------------------------------------------
AnalysisException                         Traceback (most recent call last)
<ipython-input-22-ba2b7f373793> in <cell line: 2>()
      1 # DataFrame의 데이터를 어떻게 저장할지를 지정하는 메서드
----> 2 final_df.write.format("json").save("extracted.json")

/usr/local/lib/python3.10/dist-packages/pyspark/sql/readwriter.py in save(self, path, format, mode, partitionBy, **options)
    966             self._jwrite.save()
    967         else:
--> 968             self._jwrite.save(path)
    969 
    970     @since(1.4)

/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py in __call__(self, *args)
   1319 
   1320         answer = self.gateway_client.send_command(command)
-> 1321         return_value = get_return_value(
   1322             answer, self.gateway_client, self.target_id, self.name)
   1323 

/usr/local/lib/python3.10/dist-packages/pyspark/sql/utils.py in deco(*a, **kw)
    194                 # Hide where the exception came from that shows a non-Pythonic
    195                 # JVM exception message.
--> 196                 raise converted from None
    197             else:
    198                 raise

AnalysisException: path file:/content/extracted.json already exists.



!ls -tl extracted.json/

total 428
-rw-r--r-- 1 root root      0 Jan 23 05:25 _SUCCESS
-rw-r--r-- 1 root root 436305 Jan 23 05:25 part-00000-ae03f8cc-5b87-46ca-8541-e3fbcc533bd1-c000.json



!head -1 extracted.json/part-00000-ae03f8cc-5b87-46ca-8541-e3fbcc533bd1-c000.json

{"week":"2021-01-04","departure_zipcode":"85001","arrival_zipcode":"85002","cost":"$28.32","vendor":"ABC Hauling"}


!pip install pyspark==3.3.1 py4j==0.10.9.5

Collecting pyspark==3.3.1
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 281.4/281.4 MB 3.8 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 199.7/199.7 kB 14.7 MB/s eta 0:00:00
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845494 sha256=5a8a95a4b64eb993cbb5caab35f11bf06f0a79a54ae99e16d71578465d2ec74f
  Stored in directory: /root/.cache/pip/wheels/0f/f0/3d/517368b8ce80486e84f89f214e0a022554e4ee64969f46279b
Successfully built pyspark
Installing collected packages: py4j, pyspark
  Attempting uninstall: py4j
    Found existing installation: py4j 0.10.9.7
    Uninstalling py4j-0.10.9.7:
      Successfully uninstalled py4j-0.10.9.7
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


!wget https://s3-geospatial.s3-us-west-2.amazonaws.com/1800.csv

--2024-01-22 13:52:02--  https://s3-geospatial.s3-us-west-2.amazonaws.com/1800.csv
Resolving s3-geospatial.s3-us-west-2.amazonaws.com (s3-geospatial.s3-us-west-2.amazonaws.com)... 3.5.77.102, 52.218.180.65, 52.92.161.162, ...
Connecting to s3-geospatial.s3-us-west-2.amazonaws.com (s3-geospatial.s3-us-west-2.amazonaws.com)|3.5.77.102|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 62728 (61K) [text/csv]
Saving to: ‘1800.csv’

1800.csv            100%[===================>]  61.26K  --.-KB/s    in 0.1s    

2024-01-22 13:52:03 (442 KB/s) - ‘1800.csv’ saved [62728/62728]


!ls -tl

total 68
drwxr-xr-x 1 root root  4096 Jan 18 14:21 sample_data
-rw-r--r-- 1 root root 62728 Apr 10  2022 1800.csv


!head -5 1800.csv

ITE00100554,18000101,TMAX,-75,,,E,
ITE00100554,18000101,TMIN,-148,,,E,
GM000010962,18000101,PRCP,0,,,E,
EZE00100082,18000101,TMAX,-86,,,E,
EZE00100082,18000101,TMIN,-135,,,E,


import pandas as pd

# The file has no header row, so column names are supplied explicitly and only the
# first four fields of every record are loaded.
column_names = ["stationID", "date", "measure_type", "temperature"]
pd_df = pd.read_csv(
    "1800.csv",
    header=None,
    names=column_names,
    usecols=list(range(len(column_names))),
)


# Preview the first rows.
pd_df.head()


# Keep only the rows whose 'measure_type' equals "TMIN"; the result is a new DataFrame.
pd_minTemps = pd_df.loc[pd_df["measure_type"].eq("TMIN")]


# Preview the filtered rows.
pd_minTemps.head()


# Select only stationID and temperature.
pd_stationTemps = pd_minTemps[["stationID", "temperature"]]


# Preview the narrowed frame that was just built.
pd_stationTemps.head()


# Aggregate to find the minimum temperature for every station.
# pandas' GroupBy.min() does not take a column name (its first positional parameter is
# `numeric_only`), so the column is selected explicitly before aggregating. The result is
# a DataFrame indexed by stationID with a single "temperature" column.
pd_minTempsByStation = pd_stationTemps.groupby("stationID")[["temperature"]].min()
pd_minTempsByStation.head()


from pyspark.sql import SparkSession
from pyspark import SparkConf

# Configuration for the Spark application:
#   spark.app.name - the application name
#   spark.master   - "local[*]" runs Spark locally using all available cores
conf = (
    SparkConf()
    .set("spark.app.name", "PySpark DataFrame #1")
    .set("spark.master", "local[*]")
)

# Create the SparkSession from that configuration.
# getOrCreate() returns the existing session if there is one, otherwise it creates a new one.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)


# Read the CSV with default options (long form: spark.read.format("csv").load("1800.csv")).
df = spark.read.csv("1800.csv")


# Print the DataFrame schema: with no header or schema supplied, the columns come back
# as strings named _c0 ... _c7.
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)


# Read the CSV into a DataFrame and name the first four columns; the remaining four
# keep their positional _cN names.
df = (
    spark.read.format("csv")
    .load("1800.csv")
    .toDF("stationID", "date", "measure_type", "temperature", "_c4", "_c5", "_c6", "_c7")
)


# Column names change; the types are still all string.
df.printSchema()

root
 |-- stationID: string (nullable = true)
 |-- date: string (nullable = true)
 |-- measure_type: string (nullable = true)
 |-- temperature: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)


# Same read, but with inferSchema enabled so Spark derives column types from the data.
df = (
    spark.read
    .option("inferSchema", "true")
    .csv("1800.csv")
    .toDF("stationID", "date", "measure_type", "temperature", "_c4", "_c5", "_c6", "_c7")
)


# 'date' and 'temperature' are now reported as integer instead of string.
df.printSchema()

root
 |-- stationID: string (nullable = true)
 |-- date: integer (nullable = true)
 |-- measure_type: string (nullable = true)
 |-- temperature: integer (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)


from pyspark.sql.types import StringType, IntegerType, FloatType
from pyspark.sql.types import StructType, StructField

# Schema definition: four nullable fields, given as (name, type) pairs in column order.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("stationID", StringType()),
        ("date", IntegerType()),
        ("measure_type", StringType()),
        ("temperature", FloatType()),
    )
])


# Read the CSV using the predefined schema rather than default or inferred column types.
# Shorthand for the same read: spark.read.schema(schema).csv("1800.csv")
df = spark.read.schema(schema).format("csv").load("1800.csv")


# Only the four schema fields appear, with 'temperature' typed as float.
df.printSchema()

root
 |-- stationID: string (nullable = true)
 |-- date: integer (nullable = true)
 |-- measure_type: string (nullable = true)
 |-- temperature: float (nullable = true)


# Filter approach 1: DataFrame.filter() with a column comparison keeps only the TMIN rows.
minTemps = df.filter(df.measure_type == "TMIN")


# Count the rows that passed the filter.
minTemps.count()

730


# Filter approach 2: DataFrame.where() applied with a column expression.
minTemps = df.where(df.measure_type == "TMIN")


# Count the rows that passed the filter.
minTemps.count()

730


# Group the TMIN rows by station and take the minimum temperature within each group.
minTempsByStation = minTemps.groupBy("stationID").agg({"temperature": "min"})
# Display the per-station result.
minTempsByStation.show()

+-----------+----------------+
|  stationID|min(temperature)|
+-----------+----------------+
|ITE00100554|          -148.0|
|EZE00100082|          -135.0|
+-----------+----------------+


# Select only the "stationID" and "temperature" columns, using list-style indexing.
stationTemps = minTemps[["stationID", "temperature"]]


# Show the first five rows of the selection.
stationTemps.show(5)

+-----------+-----------+
|  stationID|temperature|
+-----------+-----------+
|ITE00100554|     -148.0|
|EZE00100082|     -135.0|
|ITE00100554|     -125.0|
|EZE00100082|     -130.0|
|ITE00100554|      -46.0|
+-----------+-----------+
only showing top 5 rows


# Select the "stationID" and "temperature" columns again, this time through select().
stationTemps = minTemps.select("stationID", "temperature")


# collect() gathers the aggregated rows and returns them as a local Python list.
results = minTempsByStation.collect()


# Each row unpacks into (stationID, minimum temperature); print one tab-separated line per station.
for station_id, min_temp in results:
    print(station_id + "\t{:.2f}F".format(min_temp))

ITE00100554	-148.00F
EZE00100082	-135.00F


# Register the DataFrame under the name "station1800" so it can be queried with SQL.
df.createOrReplaceTempView("station1800")


# Minimum temperature per station over the TMIN rows; "GROUP BY 1" groups by the
# first selected column (stationID).
min_temp_query = """SELECT stationID, MIN(temperature)
FROM station1800
WHERE measure_type = 'TMIN'
GROUP BY 1"""
results = spark.sql(min_temp_query).collect()


# A pyspark.sql.Row corresponds to one DataFrame record, and each of its fields has a name.
for r in results:
    print(r)

Row(stationID='ITE00100554', min(temperature)=-148.0)
Row(stationID='EZE00100082', min(temperature)=-135.0)


# Four ways to refer to a DataFrame column, all pointing at "stationID" here:
# a plain name string, col(), column(), and attribute access on the DataFrame.
from pyspark.sql.functions import col, column
stationTemps = minTemps.select(
 "stationID",
 col("stationID"),
 column("stationID"),
 minTemps.stationID
)

UDF(User Defined Function) 사용해보기. (1)	2024.01.25
spark sql에서 join (0)	2024.01.25
spark 데이터프레임 실습5 (0)	2024.01.24
spark 데이터프레임 실습4 (0)	2024.01.24
spark 데이터프레임 실습3 (0)	2024.01.24

spark 데이터프레임 실습5 (0)	2024.01.24
spark 데이터프레임 실습4 (0)	2024.01.24
spark 데이터프레임 실습2 (0)	2024.01.23
spark 데이터프레임 실습1 (0)	2024.01.23
spark 기초 실습(colab) (0)	2024.01.19

spark 데이터프레임 실습3 (0)	2024.01.24
spark 데이터프레임 실습2 (0)	2024.01.23
spark 기초 실습(colab) (0)	2024.01.19
spark 데이터 구조 (0)	2024.01.19
spark 데이터 처리 (0)	2024.01.18

세상의 모든 데이터

DataFrame

spark SQL

'하둡,spark' 카테고리의 다른 글

spark 데이터프레임 실습3

'하둡,spark' 카테고리의 다른 글

spark 데이터프레임 실습1

'하둡,spark' 카테고리의 다른 글

+ Recent posts

티스토리툴바

	stationID	date	measure_type	temperature
0	ITE00100554	18000101	TMAX	-75
1	ITE00100554	18000101	TMIN	-148
2	GM000010962	18000101	PRCP	0
3	EZE00100082	18000101	TMAX	-86
4	EZE00100082	18000101	TMIN	-135