This report walks through Exploratory Data Analysis (EDA) on the January 2015 NYC Yellow Taxi dataset (Parquet) using DuckDB in R. All code is included and executed inline.
# Sample Rowssample_data <-dbGetQuery(con, "SELECT * FROM taxi LIMIT 5")knitr::kable(sample_data, caption ="First 5 Rows of Taxi Table")
First 5 Rows of Taxi Table
VendorID
tpep_pickup_datetime
tpep_dropoff_datetime
passenger_count
trip_distance
RatecodeID
store_and_fwd_flag
PULocationID
DOLocationID
payment_type
fare_amount
extra
mta_tax
tip_amount
tolls_amount
improvement_surcharge
total_amount
congestion_surcharge
airport_fee
1
2015-01-01 00:11:33
2015-01-01 00:16:48
1
1.0
1
N
41
166
1
5.7
0.5
0.5
1.40
0
0
8.40
NA
NA
1
2015-01-01 00:18:24
2015-01-01 00:24:20
1
0.9
1
N
166
238
3
6.0
0.5
0.5
0.00
0
0
7.30
NA
NA
1
2015-01-01 00:26:19
2015-01-01 00:41:06
1
3.5
1
N
238
162
1
13.2
0.5
0.5
2.90
0
0
17.40
NA
NA
1
2015-01-01 00:45:26
2015-01-01 00:53:20
1
2.1
1
N
162
263
1
8.2
0.5
0.5
2.37
0
0
11.87
NA
NA
1
2015-01-01 00:59:21
2015-01-01 01:05:24
1
1.0
1
N
236
141
3
6.0
0.5
0.5
0.00
0
0
7.30
NA
NA
Clean Data
dbExecute(con, "DROP TABLE IF EXISTS taxi_clean;")
[1] 0
dbExecute(con, " CREATE TABLE taxi_clean AS SELECT *, CAST(tpep_pickup_datetime AS DATE) AS trip_date FROM taxi WHERE fare_amount > 0 AND trip_distance > 0 AND passenger_count > 0;")
[1] 12656488
Aggregate & Visualize
daily_stats <-dbGetQuery(con, " SELECT trip_date, COUNT(*) AS n_trips, AVG(trip_distance) AS avg_dist, AVG(fare_amount) AS avg_fare, AVG(tip_amount) AS avg_tip FROM taxi_clean GROUP BY trip_date ORDER BY trip_date;")# Line plotggplot(daily_stats, aes(x = trip_date, y = avg_fare)) +geom_line() +labs(title ="NYC Yellow Taxi: Avg Fare per Day (Jan 2015)",x ="Date",y ="Average Fare (USD)" ) +theme_minimal()
Zone Summary
zones <-read.csv(zones_file)dbWriteTable(con, "zones", zones, overwrite =TRUE)zone_summary <-dbGetQuery(con, " SELECT z.Borough, COUNT(*) AS trips FROM taxi_clean t JOIN zones z ON t.PULocationID = z.LocationID GROUP BY z.Borough ORDER BY trips DESC;")knitr::kable(zone_summary, caption ="Trips by Borough")