11
11
import numpy as np
12
12
import pytest
13
13
14
- from pandas ._config import using_string_dtype
15
-
16
14
import pandas .util ._test_decorators as td
17
15
18
16
import pandas as pd
@@ -435,9 +433,8 @@ def test_write_dta6(self, datapath, temp_file):
435
433
check_index_type = False ,
436
434
)
437
435
438
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" )
439
436
@pytest .mark .parametrize ("version" , [114 , 117 , 118 , 119 , None ])
440
- def test_read_write_dta10 (self , version , temp_file ):
437
+ def test_read_write_dta10 (self , version , temp_file , using_infer_string ):
441
438
original = DataFrame (
442
439
data = [["string" , "object" , 1 , 1.1 , np .datetime64 ("2003-12-25" )]],
443
440
columns = ["string" , "object" , "integer" , "floating" , "datetime" ],
@@ -451,9 +448,11 @@ def test_read_write_dta10(self, version, temp_file):
451
448
original .to_stata (path , convert_dates = {"datetime" : "tc" }, version = version )
452
449
written_and_read_again = self .read_dta (path )
453
450
454
- expected = original [:]
451
+ expected = original . copy ()
455
452
# the "tc" convert_dates format stores datetimes with millisecond ("ms") resolution
456
453
expected ["datetime" ] = expected ["datetime" ].astype ("M8[ms]" )
454
+ if using_infer_string :
455
+ expected ["object" ] = expected ["object" ].astype ("str" )
457
456
458
457
tm .assert_frame_equal (
459
458
written_and_read_again .set_index ("index" ),
@@ -1276,7 +1275,6 @@ def test_categorical_ordering(self, file, datapath):
1276
1275
assert parsed [col ].cat .ordered
1277
1276
assert not parsed_unordered [col ].cat .ordered
1278
1277
1279
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" , strict = False )
1280
1278
@pytest .mark .filterwarnings ("ignore::UserWarning" )
1281
1279
@pytest .mark .parametrize (
1282
1280
"file" ,
@@ -1340,6 +1338,10 @@ def _convert_categorical(from_frame: DataFrame) -> DataFrame:
1340
1338
if cat .categories .dtype == object :
1341
1339
categories = pd .Index ._with_infer (cat .categories ._values )
1342
1340
cat = cat .set_categories (categories )
1341
+ elif cat .categories .dtype == "string" and len (cat .categories ) == 0 :
1342
+ # empty categories are read back with object dtype, so cast the (empty) string-dtype categories to object to match
1343
+ categories = cat .categories .astype (object )
1344
+ cat = cat .set_categories (categories )
1343
1345
from_frame [col ] = cat
1344
1346
return from_frame
1345
1347
@@ -1369,7 +1371,6 @@ def test_iterator(self, datapath):
1369
1371
from_chunks = pd .concat (itr )
1370
1372
tm .assert_frame_equal (parsed , from_chunks )
1371
1373
1372
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" , strict = False )
1373
1374
@pytest .mark .filterwarnings ("ignore::UserWarning" )
1374
1375
@pytest .mark .parametrize (
1375
1376
"file" ,
@@ -1674,12 +1675,11 @@ def test_inf(self, infval, temp_file):
1674
1675
path = temp_file
1675
1676
df .to_stata (path )
1676
1677
1677
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" )
1678
1678
def test_path_pathlib (self ):
1679
1679
df = DataFrame (
1680
1680
1.1 * np .arange (120 ).reshape ((30 , 4 )),
1681
- columns = pd .Index (list ("ABCD" ), dtype = object ),
1682
- index = pd .Index ([f"i-{ i } " for i in range (30 )], dtype = object ),
1681
+ columns = pd .Index (list ("ABCD" )),
1682
+ index = pd .Index ([f"i-{ i } " for i in range (30 )]),
1683
1683
)
1684
1684
df .index .name = "index"
1685
1685
reader = lambda x : read_stata (x ).set_index ("index" )
@@ -1699,13 +1699,12 @@ def test_value_labels_iterator(self, write_index, temp_file):
1699
1699
value_labels = dta_iter .value_labels ()
1700
1700
assert value_labels == {"A" : {0 : "A" , 1 : "B" , 2 : "C" , 3 : "E" }}
1701
1701
1702
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" )
1703
1702
def test_set_index (self , temp_file ):
1704
1703
# GH 17328
1705
1704
df = DataFrame (
1706
1705
1.1 * np .arange (120 ).reshape ((30 , 4 )),
1707
- columns = pd .Index (list ("ABCD" ), dtype = object ),
1708
- index = pd .Index ([f"i-{ i } " for i in range (30 )], dtype = object ),
1706
+ columns = pd .Index (list ("ABCD" )),
1707
+ index = pd .Index ([f"i-{ i } " for i in range (30 )]),
1709
1708
)
1710
1709
df .index .name = "index"
1711
1710
path = temp_file
@@ -1733,9 +1732,9 @@ def test_date_parsing_ignores_format_details(self, column, datapath):
1733
1732
formatted = df .loc [0 , column + "_fmt" ]
1734
1733
assert unformatted == formatted
1735
1734
1736
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" )
1735
+ # no xfail needed with infer_string: the expected "object" column is cast to "str" before comparing
1737
1736
@pytest .mark .parametrize ("byteorder" , ["little" , "big" ])
1738
- def test_writer_117 (self , byteorder , temp_file ):
1737
+ def test_writer_117 (self , byteorder , temp_file , using_infer_string ):
1739
1738
original = DataFrame (
1740
1739
data = [
1741
1740
[
@@ -1802,6 +1801,9 @@ def test_writer_117(self, byteorder, temp_file):
1802
1801
expected = original [:]
1803
1802
# "tc" for convert_dates means we store with "ms" resolution
1804
1803
expected ["datetime" ] = expected ["datetime" ].astype ("M8[ms]" )
1804
+ if using_infer_string :
1805
+ # object dtype (with only strings/None) comes back as string dtype
1806
+ expected ["object" ] = expected ["object" ].astype ("str" )
1805
1807
1806
1808
tm .assert_frame_equal (
1807
1809
written_and_read_again .set_index ("index" ),
@@ -1845,15 +1847,14 @@ def test_invalid_date_conversion(self, temp_file):
1845
1847
with pytest .raises (ValueError , match = msg ):
1846
1848
original .to_stata (path , convert_dates = {"wrong_name" : "tc" })
1847
1849
1848
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" )
1849
1850
@pytest .mark .parametrize ("version" , [114 , 117 , 118 , 119 , None ])
1850
1851
def test_nonfile_writing (self , version , temp_file ):
1851
1852
# GH 21041
1852
1853
bio = io .BytesIO ()
1853
1854
df = DataFrame (
1854
1855
1.1 * np .arange (120 ).reshape ((30 , 4 )),
1855
- columns = pd .Index (list ("ABCD" ), dtype = object ),
1856
- index = pd .Index ([f"i-{ i } " for i in range (30 )], dtype = object ),
1856
+ columns = pd .Index (list ("ABCD" )),
1857
+ index = pd .Index ([f"i-{ i } " for i in range (30 )]),
1857
1858
)
1858
1859
df .index .name = "index"
1859
1860
path = temp_file
@@ -1864,13 +1865,12 @@ def test_nonfile_writing(self, version, temp_file):
1864
1865
reread = read_stata (path , index_col = "index" )
1865
1866
tm .assert_frame_equal (df , reread )
1866
1867
1867
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" )
1868
1868
def test_gzip_writing (self , temp_file ):
1869
1869
# writing version 117 requires seek and cannot be used with gzip
1870
1870
df = DataFrame (
1871
1871
1.1 * np .arange (120 ).reshape ((30 , 4 )),
1872
- columns = pd .Index (list ("ABCD" ), dtype = object ),
1873
- index = pd .Index ([f"i-{ i } " for i in range (30 )], dtype = object ),
1872
+ columns = pd .Index (list ("ABCD" )),
1873
+ index = pd .Index ([f"i-{ i } " for i in range (30 )]),
1874
1874
)
1875
1875
df .index .name = "index"
1876
1876
path = temp_file
@@ -1907,8 +1907,7 @@ def test_unicode_dta_118_119(self, file, datapath):
1907
1907
1908
1908
tm .assert_frame_equal (unicode_df , expected )
1909
1909
1910
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" )
1911
- def test_mixed_string_strl (self , temp_file ):
1910
+ def test_mixed_string_strl (self , temp_file , using_infer_string ):
1912
1911
# GH 23633
1913
1912
output = [{"mixed" : "string" * 500 , "number" : 0 }, {"mixed" : None , "number" : 1 }]
1914
1913
output = DataFrame (output )
@@ -1925,6 +1924,8 @@ def test_mixed_string_strl(self, temp_file):
1925
1924
output .to_stata (path , write_index = False , convert_strl = ["mixed" ], version = 117 )
1926
1925
reread = read_stata (path )
1927
1926
expected = output .fillna ("" )
1927
+ if using_infer_string :
1928
+ expected ["mixed" ] = expected ["mixed" ].astype ("str" )
1928
1929
tm .assert_frame_equal (reread , expected )
1929
1930
1930
1931
@pytest .mark .parametrize ("version" , [114 , 117 , 118 , 119 , None ])
@@ -2000,7 +2001,6 @@ def test_stata_119(self, datapath):
2000
2001
reader ._ensure_open ()
2001
2002
assert reader ._nvar == 32999
2002
2003
2003
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" )
2004
2004
@pytest .mark .parametrize ("version" , [118 , 119 , None ])
2005
2005
@pytest .mark .parametrize ("byteorder" , ["little" , "big" ])
2006
2006
def test_utf8_writer (self , version , byteorder , temp_file ):
@@ -2348,13 +2348,12 @@ def test_iterator_errors(datapath, chunksize):
2348
2348
pass
2349
2349
2350
2350
2351
- @pytest .mark .xfail (using_string_dtype (), reason = "TODO(infer_string)" )
2352
2351
def test_iterator_value_labels (temp_file ):
2353
2352
# GH 31544
2354
2353
values = ["c_label" , "b_label" ] + ["a_label" ] * 500
2355
2354
df = DataFrame ({f"col{ k } " : pd .Categorical (values , ordered = True ) for k in range (2 )})
2356
2355
df .to_stata (temp_file , write_index = False )
2357
- expected = pd .Index (["a_label" , "b_label" , "c_label" ], dtype = "object" )
2356
+ expected = pd .Index (["a_label" , "b_label" , "c_label" ])
2358
2357
with read_stata (temp_file , chunksize = 100 ) as reader :
2359
2358
for j , chunk in enumerate (reader ):
2360
2359
for i in range (2 ):
0 commit comments