Skip to content

read_csv2_chunked() needs to adjust locale like read_csv2() #1468

Open
@dpprdan

Description

@dpprdan

read_csv2_chunked() does not parse decimals correctly with the default_locale().

library(readr)
tf <- tempfile()
write_csv2(head(mtcars), tf)

Just to make sure: this really is a CSV2 file, and read_csv2() reads the data as expected.

read_lines(tf) |> head(2)
#> [1] "mpg;cyl;disp;hp;drat;wt;qsec;vs;am;gear;carb"
#> [2] "21,0;6;160;110;3,90;2,620;16,46;0;1;4;4"
read_csv2(tf)
#> ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
#> Rows: 6 Columns: 11
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ";"
#> dbl (11): mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 6 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> 4  21.4     6   258   110  3.08  3.22  19.4     1     0     3     1
#> 5  18.7     8   360   175  3.15  3.44  17.0     0     0     3     2
#> 6  18.1     6   225   105  2.76  3.46  20.2     1     0     3     1

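With default_locale(), i.e. decimal_mark = "." and grouping_mark = ",", the comma is treated as a grouping mark and dropped: the affected columns are guessed as col_number() and the decimals come out as whole numbers (e.g. "21,0" becomes 210).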

read_csv2_chunked(tf, DataFrameCallback$new(data.frame), chunk_size = 3)
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   mpg = col_number(),
#>   cyl = col_double(),
#>   disp = col_double(),
#>   hp = col_double(),
#>   drat = col_number(),
#>   wt = col_number(),
#>   qsec = col_number(),
#>   vs = col_double(),
#>   am = col_double(),
#>   gear = col_double(),
#>   carb = col_double()
#> )
#>   mpg cyl disp  hp drat   wt qsec vs am gear carb index
#> 1 210   6  160 110  390 2620 1646  0  1    4    4     1
#> 2 210   6  160 110  390 2875 1702  0  1    4    4     1
#> 3 228   4  108  93  385 2320 1861  1  1    4    1     1
#> 4 214   6  258 110  308 3215 1944  1  0    3    1     4
#> 5 187   8  360 175  315 3440 1702  0  0    3    2     4
#> 6 181   6  225 105  276 3460 2022  1  0    3    1     4

Everything works as expected when the appropriate locale is set explicitly.

read_csv2_chunked(tf, DataFrameCallback$new(data.frame), chunk_size = 3, locale = locale(decimal_mark = ",", grouping_mark = "."))
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   mpg = col_double(),
#>   cyl = col_double(),
#>   disp = col_double(),
#>   hp = col_double(),
#>   drat = col_double(),
#>   wt = col_double(),
#>   qsec = col_double(),
#>   vs = col_double(),
#>   am = col_double(),
#>   gear = col_double(),
#>   carb = col_double()
#> )
#>    mpg cyl disp  hp drat    wt  qsec vs am gear carb index
#> 1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4     1
#> 2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4     1
#> 3 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1     1
#> 4 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1     4
#> 5 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2     4
#> 6 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1     4

Long story short, the default locale setting of read_csv2_chunked() is wrong.
This could be fixed easily by changing its default `locale` argument to a default_locale2() or locale(decimal_mark = ",", grouping_mark = ".") (cf. #1445).

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions