Skip to content

Commit 1fac831

Browse files
authored
Update data, and raw option, add docs + tests (#1)
1 parent c991627 commit 1fac831

File tree

4 files changed

+205
-71
lines changed

4 files changed

+205
-71
lines changed

Project.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "PalmerPenguins"
22
uuid = "8b842266-38fa-440a-9b57-31493939ab85"
33
authors = ["David Widmann <[email protected]>"]
4-
version = "0.1.0"
4+
version = "0.1.1"
55

66
[deps]
77
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
@@ -14,8 +14,9 @@ julia = "1"
1414

1515
[extras]
1616
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
17+
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
1718
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
1819
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
1920

2021
[targets]
21-
test = ["DataFrames", "Tables", "Test"]
22+
test = ["DataFrames", "Dates", "Tables", "Test"]

README.md

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,28 +33,32 @@ More information about the dataset is available in
3333

3434
## Usage
3535

36-
The dataset can be loaded by
37-
36+
The simplified version of the raw data can be loaded by
3837
```julia
3938
julia> using PalmerPenguins
4039

4140
julia> table = PalmerPenguins.load()
4241
```
43-
44-
This will download the dataset once using [DataDeps.jl](https://github.com/oxinabox/DataDeps.jl),
42+
This will download the data (both the simplified and the raw version) once using
43+
[DataDeps.jl](https://github.com/oxinabox/DataDeps.jl),
4544
displaying the information above and making it available for future use. The function
4645
`PalmerPenguins.load()` returns a
4746
[`CSV.File`](https://juliadata.github.io/CSV.jl/stable/#CSV.File) object that supports the
4847
[Tables.jl](https://github.com/JuliaData/Tables.jl) interface. For instance, it can be
4948
converted to a
5049
[`DataFrame`](https://juliadata.github.io/DataFrames.jl/stable/man/getting_started/#The-DataFrame-Type-1)
5150
by executing
52-
5351
```julia
5452
julia> using DataFrames
5553

56-
julia> df = DataFrame(table)
54+
julia> df = DataFrame(table; copycols = false) # or: df = DataFrame!(table)
55+
```
56+
57+
The raw data can be loaded with
58+
```julia
59+
julia> tableraw = PalmerPenguins.load(; raw = true)
5760
```
61+
If `raw = false` (the default), then the simplified version is returned.
5862

5963
## Bibliography
6064

src/PalmerPenguins.jl

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ import CSV
44
import DataDeps
55

66
const DEPNAME = "PalmerPenguins"
7-
const LINK = "https://cdn.jsdelivr.net/gh/allisonhorst/palmerpenguins@f13c212d42c1341167368dc225df191c5a6029b9/data-raw/"
8-
const DATA = "penguins.csv"
7+
const LINK = "https://cdn.jsdelivr.net/gh/allisonhorst/palmerpenguins@433439c8b013eff3d36c847bb7a27fa0d7e353d8/inst/extdata/"
8+
const DATA = ["penguins.csv", "penguins_raw.csv"]
99
const INFO =
1010
"""
1111
Dataset: The Palmer penguins dataset
@@ -14,7 +14,7 @@ Website: https://allisonhorst.github.io/palmerpenguins/index.html
1414
1515
The Palmer penguins dataset is a dataset for data exploration & visualization, as an
1616
alternative to the Iris dataset.
17-
17+
1818
The dataset contains data for 344 penguins. There are 3 different species of penguins in
1919
this dataset, collected from 3 islands in the Palmer Archipelago, Antarctica.
2020
@@ -36,17 +36,35 @@ Please include this citation if you plan to use this database:
3636
e90081. https://doi.org/10.1371/journal.pone.0090081
3737
"""
3838

39-
function load()
40-
file = DataDeps.@datadep_str DEPNAME * "/" * DATA
41-
return CSV.File(file; missingstring="NA")
39+
"""
40+
load([; raw = false])
41+
42+
Load the Palmer penguins dataset.
43+
44+
If `raw` is `true`, then the raw data is returned. Otherwise the simplified version of
45+
the data is loaded.
46+
47+
# References
48+
49+
[Gorman et al., 2014]
50+
Gorman KB, Williams TD, Fraser WR (2014) Ecological Sexual Dimorphism and Environmental
51+
Variability within a Community of Antarctic Penguins (Genus Pygoscelis). PLoS ONE 9(3):
52+
e90081. https://doi.org/10.1371/journal.pone.0090081
53+
"""
54+
function load(; raw::Bool = false)
55+
file = DataDeps.@datadep_str DEPNAME * "/" * DATA[1 + Int(raw)]
56+
return CSV.File(
57+
file;
58+
missingstring="NA", truestrings=["Yes"], falsestrings=["No"], dateformat="y-m-d",
59+
)
4260
end
4361

4462
function __init__()
4563
DataDeps.register(DataDeps.DataDep(
4664
DEPNAME,
4765
INFO,
48-
LINK * DATA,
49-
"97d467baa3522040aa892fa7f2ff57b5195be5fef3cceca3f78a6b1a6e32d7a2",
66+
LINK .* DATA,
67+
"839b058be09b164f7dfa0d030a959cb3c5426dea281e8a34d34c557e398bd01f",
5068
))
5169
end
5270

test/runtests.jl

Lines changed: 166 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -2,70 +2,181 @@ using PalmerPenguins
22
using CSV
33
using DataFrames
44
using Tables
5+
6+
using Dates
57
using Test
68

79
# Always accept download in CI tests
810
ENV["DATADEPS_ALWAYS_ACCEPT"] = true
911

1012
@testset "PalmerPenguins.jl" begin
13+
# column names
14+
colnames_simplified = [
15+
"species",
16+
"island",
17+
"bill_length_mm",
18+
"bill_depth_mm",
19+
"flipper_length_mm",
20+
"body_mass_g",
21+
"sex",
22+
]
23+
coleltypes_simplified = [
24+
String,
25+
String,
26+
Union{Missing,Float64},
27+
Union{Missing,Float64},
28+
Union{Missing,Int64},
29+
Union{Missing,Int64},
30+
Union{Missing,String},
31+
]
32+
firstrow_simplified = [
33+
"Adelie",
34+
"Torgersen",
35+
39.1,
36+
18.7,
37+
Int64(181),
38+
Int64(3750),
39+
"male"
40+
]
41+
42+
colnames_raw = [
43+
"studyName",
44+
"Sample Number",
45+
"Species",
46+
"Region",
47+
"Island",
48+
"Stage",
49+
"Individual ID",
50+
"Clutch Completion",
51+
"Date Egg",
52+
"Culmen Length (mm)",
53+
"Culmen Depth (mm)",
54+
"Flipper Length (mm)",
55+
"Body Mass (g)",
56+
"Sex",
57+
"Delta 15 N (o/oo)",
58+
"Delta 13 C (o/oo)",
59+
"Comments",
60+
]
61+
coleltypes_raw = [
62+
String,
63+
Int64,
64+
String,
65+
String,
66+
String,
67+
String,
68+
Bool,
69+
Date,
70+
Union{Missing,Float64},
71+
Union{Missing,Float64},
72+
Union{Missing,Int64},
73+
Union{Missing,Int64},
74+
Union{Missing,String},
75+
Float64,
76+
Float64,
77+
String,
78+
]
79+
firstrow_raw = [
80+
"PAL0708",
81+
Int64(1),
82+
"Adelie Penguin (Pygoscelis adeliae)",
83+
"Anvers",
84+
"Torgersen",
85+
"Adult, 1 Egg Stage",
86+
"N1A1",
87+
true,
88+
Date(2007, 11, 11),
89+
39.1,
90+
18.7,
91+
Int64(181),
92+
Int64(3750),
93+
"MALE",
94+
missing,
95+
missing,
96+
"Not enough blood for isotopes.",
97+
]
98+
1199
@testset "load" begin
12-
table = PalmerPenguins.load()
13-
14-
# Check some properties
15-
@test table isa CSV.File
16-
@test length(table) == 344
17-
@test Tables.columnnames(Tables.columns(table)) == [
18-
:species,
19-
:island,
20-
:bill_length_mm,
21-
:bill_depth_mm,
22-
:flipper_length_mm,
23-
:body_mass_g,
24-
:sex,
25-
]
26-
27-
# Check first row
28-
firstrow = first(table)
29-
@test firstrow.species == "Adelie"
30-
@test firstrow.island == "Torgersen"
31-
@test firstrow.bill_length_mm == 39.1
32-
@test firstrow.bill_depth_mm == 18.7
33-
@test firstrow.flipper_length_mm == 181
34-
@test firstrow.body_mass_g == 3750
35-
@test firstrow.sex == "male"
100+
@testset "simplified" begin
101+
table = PalmerPenguins.load()
102+
table2 = PalmerPenguins.load(; raw = false)
103+
for i in 1:length(table)
104+
@test all(table[i] .=== table2[i])
105+
end
106+
107+
# Check some properties
108+
@test table isa CSV.File
109+
@test length(table) == 344
110+
@test Tables.columnnames(Tables.columns(table)) == Symbol.(colnames_simplified)
111+
for (name, T) in zip(colnames_simplified, coleltypes_simplified)
112+
# Assert the column element type; a bare `===` expression is evaluated and discarded.
@test eltype(Tables.getcolumn(Tables.columns(table), Symbol(name))) === T
113+
end
114+
115+
# Check first row
116+
firstrow = first(table)
117+
for i in 1:length(firstrow)
118+
@test firstrow[i] === firstrow_simplified[i]
119+
end
120+
end
121+
122+
@testset "raw" begin
123+
table = PalmerPenguins.load(; raw = true)
124+
125+
# Check some properties
126+
@test table isa CSV.File
127+
@test length(table) == 344
128+
@test Tables.columnnames(Tables.columns(table)) == Symbol.(colnames_raw)
129+
for (name, T) in zip(colnames_raw, coleltypes_raw)
130+
eltype(Tables.getcolumn(Tables.columns(table), Symbol(name))) === T
131+
end
132+
133+
# Check first row
134+
firstrow = first(table)
135+
for i in 1:length(firstrow)
136+
@test firstrow[i] === firstrow_raw[i]
137+
end
138+
end
36139
end
37140

38141
@testset "DataFrames" begin
39-
df = DataFrame(PalmerPenguins.load())
40-
41-
# Check some properties
42-
@test df isa DataFrame
43-
@test nrow(df) == 344
44-
@test names(df) == [
45-
"species",
46-
"island",
47-
"bill_length_mm",
48-
"bill_depth_mm",
49-
"flipper_length_mm",
50-
"body_mass_g",
51-
"sex",
52-
]
53-
@test eltype(df[!, :species]) === String
54-
@test eltype(df[!, :island]) === String
55-
@test eltype(df[!, :bill_length_mm]) === Union{Missing,Float64}
56-
@test eltype(df[!, :bill_depth_mm]) === Union{Missing,Float64}
57-
@test eltype(df[!, :flipper_length_mm]) === Union{Missing,Int64}
58-
@test eltype(df[!, :body_mass_g]) === Union{Missing,Int64}
59-
@test eltype(df[!, :sex]) === Union{Missing,String}
60-
61-
# Check first row
62-
firstrow = df[1, :]
63-
@test firstrow.species == "Adelie"
64-
@test firstrow.island == "Torgersen"
65-
@test firstrow.bill_length_mm == 39.1
66-
@test firstrow.bill_depth_mm == 18.7
67-
@test firstrow.flipper_length_mm == 181
68-
@test firstrow.body_mass_g == 3750
69-
@test firstrow.sex == "male"
142+
@testset "simplified" begin
143+
df = DataFrame!(PalmerPenguins.load())
144+
df2 = DataFrame!(PalmerPenguins.load(; raw = false))
145+
for i in 1:size(df, 2)
146+
@test all(df[!, i] .=== df2[!, i])
147+
end
148+
149+
# Check some properties
150+
@test df isa DataFrame
151+
@test nrow(df) == 344
152+
@test names(df) == colnames_simplified
153+
for (name, T) in zip(colnames_simplified, coleltypes_simplified)
154+
# Assert the column element type; a bare `===` expression is evaluated and discarded.
@test eltype(df[!, name]) === T
155+
end
156+
157+
# Check first row
158+
firstrow = df[1, :]
159+
for i in 1:length(firstrow)
160+
@test firstrow[i] === firstrow_simplified[i]
161+
end
162+
end
163+
164+
@testset "raw" begin
165+
df = DataFrame!(PalmerPenguins.load(; raw = true))
166+
167+
# Check some properties
168+
@test df isa DataFrame
169+
@test nrow(df) == 344
170+
@test names(df) == colnames_raw
171+
for (name, T) in zip(colnames_raw, coleltypes_raw)
172+
eltype(df[!, name]) === T
173+
end
174+
175+
# Check first row
176+
firstrow = df[1, :]
177+
for i in 1:length(firstrow)
178+
@test firstrow[i] === firstrow_raw[i]
179+
end
180+
end
70181
end
71182
end

0 commit comments

Comments
 (0)