Skip to content

Commit f114943

Browse files
authored
Patch release (#60)
1 parent 2339424 commit f114943

File tree

11 files changed

+629
-547
lines changed

11 files changed

+629
-547
lines changed

Project.toml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,24 @@
11
name = "ScientificTypes"
22
uuid = "321657f4-b219-11e9-178b-2701a2544e81"
33
authors = ["Anthony D. Blaom <[email protected]>"]
4-
version = "0.2.5"
4+
version = "0.2.6"
55

66
[deps]
77
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
88
ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
99
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
1010

1111
[compat]
12-
CategoricalArrays = "^0.7"
12+
CategoricalArrays = "^0.7.3"
1313
ColorTypes = "^0.8"
1414
Tables = "^0.2"
1515
julia = "1"
1616

1717
[extras]
18+
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
19+
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
1820
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
1921
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2022

2123
[targets]
22-
test = ["Random", "Test"]
24+
test = ["Random", "Test", "CSV", "DataFrames"]

docs/src/index.md

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -100,19 +100,15 @@ schema(Xfixed).scitypes
100100

101101
Note that, as it encountered missing values in `height` it coerced the type to `Union{Missing,Continuous}`.
102102

103+
Finally there is a `coerce!` method that does in-place coercion provided the data structure allows it (at the moment only `DataFrames.DataFrame` is supported).
103104

104105
## Notes
105106

106-
- We regard the built-in julia type `Missing` as a scientific type. The new scientific types introduced in the current package are rooted in the abstract type `Found` (see tree above) and you export the alias `Scientific = Union{Missing, Found}`.
107-
108-
- `Finite{N}`, `Muliticlass{N}` and `OrderedFactor{N}` are all parameterized by the number of levels `N`. We export the alias `Binary = Finite{2}`.
109-
110-
- `Image{W,H}`, `GrayImage{W,H}` and `ColorImage{W,H}` are all parameterized by the image width and height dimensions, `(W, H)`.
111-
107+
- We regard the built-in Julia type `Missing` as a scientific type. The new scientific types introduced in the current package are rooted in the abstract type `Found` (see tree above) and we export the alias `Scientific = Union{Missing, Found}`.
108+
- `Finite{N}`, `Multiclass{N}` and `OrderedFactor{N}` are all parametrised by the number of levels `N`. We export the alias `Binary = Finite{2}`.
109+
- `Image{W,H}`, `GrayImage{W,H}` and `ColorImage{W,H}` are all parametrised by the image width and height dimensions, `(W, H)`.
112110
- The function `scitype` has the fallback value `Unknown`.
113-
114111
- Since Tables is an optional dependency, the `scitype` of a [`Tables.jl`](https://github.com/JuliaData/Tables.jl) supported table is `Unknown` unless Tables has been imported.
115-
116112
- Developers can define their own conventions using the code in `src/conventions/mlj/` as a template. The active convention is controlled by the value of `ScientificTypes.CONVENTION[1]`.
117113

118114

@@ -282,6 +278,13 @@ It is important to note that the order in which the rules are specified matters;
282278
autotype(X; rules=(:few_to_finite,))
283279
```
284280

281+
Finally, you can also use the following shorthands:
282+
283+
```julia
284+
autotype(X, :few_to_finite)
285+
autotype(X, (:few_to_finite, :discrete_to_continuous))
286+
```
287+
285288
### Available rules
286289

287290
Rule symbol | scitype suggestion

src/ScientificTypes.jl

Lines changed: 19 additions & 171 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ export Scientific, Found, Unknown, Finite, Infinite
44
export OrderedFactor, Multiclass, Count, Continuous
55
export Binary, Table
66
export ColorImage, GrayImage
7-
export scitype, scitype_union, scitypes, coerce, schema
7+
export scitype, scitype_union, scitypes, elscitype, coerce, coerce!, schema
88
export mlj
99
export autotype
1010

@@ -107,183 +107,31 @@ function Table(Ts...)
107107
return Table{<:Union{[AbstractVector{<:T} for T in Ts]...}}
108108
end
109109

110-
111-
# ## THE SCITYPE FUNCTION
112-
113-
"""
114-
scitype(X)
115-
116-
The scientific type that `x` may represent.
117-
"""
118-
scitype(X) = scitype(X, Val(convention()))
119-
scitype(X, C) = scitype(X, C, Val(trait(X)))
120-
scitype(X, C, ::Val{:other}) = Unknown
121-
122-
scitype(::Missing) = Missing
123-
124-
# ## CONVENIENCE METHOD FOR UNIONS OVER ELEMENTS
125-
126-
"""
127-
scitype_union(A)
128-
129-
Return the type union, over all elements `x` generated by the iterable
130-
`A`, of `scitype(x)`.
131-
132-
See also `scitype`.
133-
"""
134-
scitype_union(A) = reduce((a,b)->Union{a,b}, (scitype(el) for el in A))
135-
136-
137-
# ## SCITYPES OF TUPLES
138-
139-
scitype(t::Tuple, ::Val) = Tuple{scitype.(t)...}
140-
141-
142-
# ## SCITYPES OF ARRAYS
143-
144-
"""
145-
ScientificTypes.Scitype(::Type, C::Val)
146-
147-
Method for implementers of a conventions to enable speed-up of scitype
148-
evaluations for large arrays.
149-
150-
In general, one cannot infer the scitype of an object of type
151-
`AbstractArray{T, N}` from the machine type alone. For, example, this
152-
never holds in the *mlj* convention for a categorical array, or in the
153-
following examples: `X=Any[1, 2, 3]` and `X=Union{Missing,Int64}[1, 2,
154-
3]`.
155-
156-
Nevertheless, for some *restricted* machine types `U`, the statement
157-
`type(X) == AbstractArray{T, N}` for some `T<:U` already allows one
158-
deduce that `scitype(X) = AbstractArray{S,N}`, where `S` is determined
159-
by `U` alone. This is the case in the *mlj* convention, for example,
160-
if `U = Integer`, in which case `S = Count`. If one explicitly declares
161-
162-
ScientificTypes.Scitype(::Type{<:U}, ::Val{:convention}) = S
163-
164-
in such cases, then ScientificTypes ensures a considerable speed-up in
165-
the computation of `scitype(X)`. There is also a partial speed-up for
166-
the case that `T <: Union{U, Missing}`.
167-
168-
For example, in *mlj* one has `Scitype(::Type{<:Integer}) = Count`.
169-
170-
"""
171-
Scitype(::Type, C::Val) = nothing
172-
Scitype(::Type{Any}, C::Val) = nothing # b/s `Any` isa `Union{<:Any, Missing}`
173-
174-
# For all such `T` we can also get almost the same speed-up in the case that
175-
# `T` is replaced by `Union{T, Missing}`, which we detect by wrapping
176-
# the answer:
177-
178-
Scitype(MT::Type{Union{T, Missing}}, C::Val) where T = Val(Scitype(T, C))
179-
180-
# For example, in *mlj* convention, Scitype(::Integer) = Count
181-
182-
const Arr{T,N} = AbstractArray{T,N}
183-
184-
# the dispatcher:
185-
scitype(A::Arr{T}, C) where T = scitype(A, C, Scitype(T, C))
186-
187-
# the slow fallback:
188-
scitype(A::Arr{<:Any,N}, ::Val, ::Nothing) where N =
189-
AbstractArray{scitype_union(A),N}
190-
191-
# the speed-up:
192-
scitype(::Arr{<:Any,N}, ::Val, S) where N = Arr{S,N}
193-
194-
# partial speed-up for missing types, because broadcast is faster than
195-
# computing scitype_union:
196-
function scitype(A::Arr{<:Any,N}, C::Val, ::Val{S}) where {N,S}
197-
if S == nothing
198-
return scitype(A, C, S)
199-
else
200-
Atight = broadcast(identity, A)
201-
if typeof(A) == typeof(Atight)
202-
return Arr{Union{S,Missing},N}
203-
else
204-
return Arr{S,N}
205-
end
206-
end
207-
end
208-
209-
210-
# ## STUB FOR COERCE METHOD
211-
212110
"""
213-
coerce(A::AbstractArray, T; verbosity=1)
214-
215-
Coerce the julia types of elements of `A` to ensure the returned array
216-
has `T` or `Union{Missing,T}` as the union of its element scitypes,
217-
according to the active convention.
218-
219-
A warning is issued if missing values are encountered, unless
220-
`verbosity` is `0` or less.
111+
is_type(obj, spkg, stype)
221112
222-
julia> mlj()
223-
julia> v = coerce([1, missing, 5], Continuous)
224-
3-element Array{Union{Missing, Float64},1}:
225-
1.0
226-
missing
227-
5.0
228-
229-
julia> scitype(v)
230-
AbstractArray{Union{Missing,Continuous}, 1}
231-
232-
See also [`scitype`](@ref), [`scitype_union`](@ref).
113+
This is a way to check that an object `obj` is of a given type that may come
114+
from a package that is not loaded in the current environment.
115+
For instance, say `DataFrames` is not loaded in the current environment, a
116+
function from some package could still return a DataFrame in which case you
117+
can check this with
233118
119+
```
120+
is_type(obj, :DataFrames, :DataFrame)
121+
```
234122
"""
235-
function coerce end
236-
237-
238-
# ## TABLE SCHEMA
239-
240-
struct Schema{names, types, scitypes, nrows} end
241-
242-
Schema(names::Tuple{Vararg{Symbol}}, types::Type{T}, scitypes::Type{S}, nrows::Integer) where {T<:Tuple,S<:Tuple} = Schema{names, T, S, nrows}()
243-
Schema(names, types, scitypes, nrows) = Schema{Tuple(Base.map(Symbol, names)), Tuple{types...}, Tuple{scitypes...}, nrows}()
244-
245-
function Base.getproperty(sch::Schema{names, types, scitypes, nrows}, field::Symbol) where {names, types, scitypes, nrows}
246-
if field === :names
247-
return names
248-
elseif field === :types
249-
return types === nothing ? nothing : Tuple(fieldtype(types, i) for i = 1:fieldcount(types))
250-
elseif field === :scitypes
251-
return scitypes === nothing ? nothing : Tuple(fieldtype(scitypes, i) for i = 1:fieldcount(scitypes))
252-
elseif field === :nrows
253-
return nrows === nothing ? nothing : nrows
254-
else
255-
throw(ArgumentError("unsupported property for ScientificTypes.Schema"))
256-
end
123+
function is_type(obj, spkg::Symbol, stype::Symbol)
    # The printed name of `typeof(obj)` is `stype` when the defining package's names
    # are in scope and `spkg.stype` otherwise, optionally followed by `{...}` type
    # parameters. The `(?!\w)` look-ahead requires the identifier to end right
    # after `stype`, so that e.g. `DataFrameRow` is not mistaken for `DataFrame`.
    rx = Regex("^($(spkg)\\.)?$(stype)(?!\\w)")
    return occursin(rx, string(typeof(obj)))
end
258130

259-
Base.propertynames(sch::Schema) = (:names, :types, :scitypes, :nrows)
260-
261-
_as_named_tuple(s::Schema) = NamedTuple{(:names, :types, :scitypes, :nrows)}((s.names, s.types, s.scitypes, s.nrows))
262-
263-
function Base.show(io::IO, ::MIME"text/plain", s::Schema)
264-
show(io, MIME("text/plain"), _as_named_tuple(s))
265-
end
266-
267-
268-
"""
269-
schema(X)
270-
271-
Inspect the column types and scitypes of a table.
272-
273-
julia> X = (ncalls=[1, 2, 4], mean_delay=[2.0, 5.7, 6.0])
274-
julia> schema(X)
275-
(names = (:ncalls, :mean_delay),
276-
types = (Int64, Float64),
277-
scitypes = (Count, Continuous))
278-
279-
"""
280-
schema(X) = schema(X, Val(trait(X)))
281-
schema(X, ::Val{:other}) =
282-
throw(ArgumentError("Cannot inspect the internal scitypes of "*
283-
"an object with trait `:other`\n"*
284-
"Perhaps you meant to import Tables first?"))
285131

286-
include("tables.jl")
132+
include("scitype.jl")
133+
include("schema.jl")
134+
include("coerce.jl")
287135
include("autotype.jl")
288136

289137
## ACTIVATE DEFAULT CONVENTION

src/coerce.jl

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Fetch column `name` from `X` and, when `types` has an entry for that name, return
# the column coerced to `types[name]`; otherwise return the column itself unchanged.
# Keyword arguments are passed straight through to `coerce`.
function _coerce_col(X, name, types; args...)
    col = getproperty(X, name)
    # No scitype requested for this column: hand it back untouched.
    haskey(types, name) || return col
    # HACK isa LazyArrays.ApplyArray, see issue #49
    src = is_type(col, :LazyArrays, :ApplyArray) ? convert(Vector, col) : col
    return coerce(src, types[name]; args...)
end
13+
14+
"""
15+
coerce(X, col1=>scitype1, col2=>scitype2, ... ; verbosity=1)
16+
coerce(X, d::Dict; verbosity=1)
17+
18+
Return a copy of the table `X` with the scitypes of the specified
19+
columns coerced to those specified, or to missing-value versions of
20+
these scitypes, with warnings issued (for positive `verbosity`).
21+
Alternatively, the specifications can be wrapped in a dictionary.
22+
23+
24+
### Example
25+
26+
```julia
27+
using CategoricalArrays, DataFrames, Tables
28+
X = DataFrame(name=["Siri", "Robo", "Alexa", "Cortana"],
29+
height=[152, missing, 148, 163],
30+
rating=[1, 5, 2, 1])
31+
coerce(X, :name=>Multiclass, :height=>Continuous, :rating=>OrderedFactor)
32+
33+
```
34+
See also [`scitype`](@ref), [`schema`](@ref).
35+
36+
"""
37+
function coerce(X, pairs::Pair{Symbol}...; verbosity=1)
    # Only objects whose trait is `:table` can be coerced column by column.
    if trait(X) != :table
        error("Non-tabular data encountered or Tables pkg not loaded.")
    end
    colnames = Tables.schema(X).names
    requested = Dict(pairs)
    columns = Tables.columntable(X)
    # Lazily coerce the requested columns; all other columns pass through as they are.
    coerced = (
        _coerce_col(columns, c, requested; verbosity=verbosity) for c in colnames
    )
    # Rebuild a table of the same kind as `X` from the new named columns.
    return Tables.materializer(X)(NamedTuple{colnames}(coerced))
end
# Dictionary form: each `name => scitype` entry becomes one positional pair.
coerce(X, types::Dict; kw_args...) = coerce(X, types...; kw_args...)
47+
48+
49+
"""
50+
coerce!(X, ...)
51+
52+
Same as [`coerce`](@ref) except it does the modification in place provided `X`
53+
supports in-place modification (at the moment, only `DataFrames.DataFrame` does).
54+
An error is thrown otherwise. The arguments are the same as `coerce`.
55+
"""
56+
function coerce!(X, args...; kwargs...)
    # DataFrames is not a dependency of this package, so a DataFrame is recognised
    # by its type name (via `is_type`) and handed to `coerce_df!`.
    if is_type(X, :DataFrames, :DataFrame)
        return coerce_df!(X, args...; kwargs...)
    end
    # Everything else cannot be modified in place.
    throw(ArgumentError("In place coercion not supported for $(typeof(X)). " *
                        "Try `coerce` instead."))
end
# Dictionary form: each `name => scitype` entry becomes one positional pair. The `;`
# before `kwargs...` is essential: splatting keywords after a `,` would pass them as
# positional pairs (e.g. `:verbosity => 0`) and they would be treated as column specs.
coerce!(X, types::Dict; kwargs...) = coerce!(X, (p for p in types)...; kwargs...)
63+
64+
# In-place worker behind `coerce!`: for every column of `df` that has an entry in
# `pairs`, replace the column with its coerced version and return `df` itself.
#
# - `df`: a table supporting `Tables.schema` and property assignment `df.name = col`.
# - `pairs`: `column_name => scitype` specifications.
# - `verbosity`: forwarded to `coerce` for each coerced column.
function coerce_df!(df, pairs::Pair{Symbol}...; verbosity=1)
    types = Dict(pairs)
    for name in Tables.schema(df).names
        haskey(types, name) || continue
        # `df.name = col` is syntax for `setproperty!(df, :name, col)`, so calling
        # `setproperty!` directly gives the same assignment (avoiding `setindex!`,
        # which throws a deprecation warning on older DataFrames) without building
        # and `eval`-ing an expression at global scope for every column.
        col = coerce(getproperty(df, name), types[name]; verbosity=verbosity)
        setproperty!(df, name, col)
    end
    return df
end

0 commit comments

Comments
 (0)