NYC Taxi EDA with DuckDB

Author

Lindsay Alston June 9, 2025

Introduction

This report walks through Exploratory Data Analysis (EDA) on the January 2015 NYC Yellow Taxi dataset (Parquet) using DuckDB in R. All code is included and executed inline.

Setup

# Load required packages
library(DBI)
library(duckdb)
library(dplyr)
library(ggplot2)

Download & Load Data

parquet_url  <- "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2015-01.parquet"
parquet_file <- "yellow_tripdata_2015-01.parquet"
if (!file.exists(parquet_file)) download.file(parquet_url, parquet_file, mode = "wb")

zones_url  <- "https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv"
zones_file <- "taxi_zone_lookup.csv"
if (!file.exists(zones_file)) download.file(zones_url, zones_file, mode = "wb")

con <- dbConnect(duckdb::duckdb(), dbdir=":memory:")
dbExecute(con, "DROP TABLE IF EXISTS taxi;")
[1] 0
dbExecute(con, sprintf("CREATE TABLE taxi AS SELECT * FROM '%s';", parquet_file))
[1] 12741035

Table Overview

cat("Rows in raw taxi table: ", dbGetQuery(con, "SELECT COUNT(*) AS n FROM taxi")$n, "\n")
Rows in raw taxi table:  12741035 
# Schema
schema <- dbGetQuery(con, "PRAGMA table_info(taxi)")
knitr::kable(schema, caption = "Taxi Table Schema")
Taxi Table Schema
cid name type notnull dflt_value pk
0 VendorID BIGINT FALSE NA FALSE
1 tpep_pickup_datetime TIMESTAMP FALSE NA FALSE
2 tpep_dropoff_datetime TIMESTAMP FALSE NA FALSE
3 passenger_count BIGINT FALSE NA FALSE
4 trip_distance DOUBLE FALSE NA FALSE
5 RatecodeID BIGINT FALSE NA FALSE
6 store_and_fwd_flag VARCHAR FALSE NA FALSE
7 PULocationID BIGINT FALSE NA FALSE
8 DOLocationID BIGINT FALSE NA FALSE
9 payment_type BIGINT FALSE NA FALSE
10 fare_amount DOUBLE FALSE NA FALSE
11 extra DOUBLE FALSE NA FALSE
12 mta_tax DOUBLE FALSE NA FALSE
13 tip_amount DOUBLE FALSE NA FALSE
14 tolls_amount DOUBLE FALSE NA FALSE
15 improvement_surcharge DOUBLE FALSE NA FALSE
16 total_amount DOUBLE FALSE NA FALSE
17 congestion_surcharge INTEGER FALSE NA FALSE
18 airport_fee INTEGER FALSE NA FALSE
# Sample Rows
sample_data <- dbGetQuery(con, "SELECT * FROM taxi LIMIT 5")
knitr::kable(sample_data, caption = "First 5 Rows of Taxi Table")
First 5 Rows of Taxi Table
VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance RatecodeID store_and_fwd_flag PULocationID DOLocationID payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount congestion_surcharge airport_fee
1 2015-01-01 00:11:33 2015-01-01 00:16:48 1 1.0 1 N 41 166 1 5.7 0.5 0.5 1.40 0 0 8.40 NA NA
1 2015-01-01 00:18:24 2015-01-01 00:24:20 1 0.9 1 N 166 238 3 6.0 0.5 0.5 0.00 0 0 7.30 NA NA
1 2015-01-01 00:26:19 2015-01-01 00:41:06 1 3.5 1 N 238 162 1 13.2 0.5 0.5 2.90 0 0 17.40 NA NA
1 2015-01-01 00:45:26 2015-01-01 00:53:20 1 2.1 1 N 162 263 1 8.2 0.5 0.5 2.37 0 0 11.87 NA NA
1 2015-01-01 00:59:21 2015-01-01 01:05:24 1 1.0 1 N 236 141 3 6.0 0.5 0.5 0.00 0 0 7.30 NA NA

Clean Data

dbExecute(con, "DROP TABLE IF EXISTS taxi_clean;")
[1] 0
dbExecute(con, "
  CREATE TABLE taxi_clean AS
  SELECT *,
    CAST(tpep_pickup_datetime AS DATE) AS trip_date
  FROM taxi
  WHERE fare_amount > 0
    AND trip_distance > 0
    AND passenger_count > 0;")
[1] 12656488

Aggregate & Visualize

daily_stats <- dbGetQuery(con, "
  SELECT
    trip_date,
    COUNT(*)           AS n_trips,
    AVG(trip_distance) AS avg_dist,
    AVG(fare_amount)   AS avg_fare,
    AVG(tip_amount)    AS avg_tip
  FROM taxi_clean
  GROUP BY trip_date
  ORDER BY trip_date;")

# Line plot
ggplot(daily_stats, aes(x = trip_date, y = avg_fare)) +
  geom_line() +
  labs(
    title = "NYC Yellow Taxi: Avg Fare per Day (Jan 2015)",
    x     = "Date",
    y     = "Average Fare (USD)"
  ) +
  theme_minimal()

Zone Summary

zones <- read.csv(zones_file)
dbWriteTable(con, "zones", zones, overwrite = TRUE)

zone_summary <- dbGetQuery(con, "
  SELECT
    z.Borough,
    COUNT(*) AS trips
  FROM taxi_clean t
  JOIN zones z
    ON t.PULocationID = z.LocationID
  GROUP BY z.Borough
  ORDER BY trips DESC;")

knitr::kable(zone_summary, caption = "Trips by Borough")
Trips by Borough
Borough trips
Manhattan 11563086
Queens 625394
Unknown 227053
Brooklyn 226854
Bronx 8979
N/A 4755
EWR 213
Staten Island 154

Cleanup

dbDisconnect(con, shutdown = TRUE)