@@ -11,42 +11,46 @@ class HDFTableCopy(BaseCopy):
 
     def __init__(
         self,
+        file_name,
         hdf_tables,
-        hdf_meta,
         defer_sql_objs=False,
         conn=None,
         table_obj=None,
         sql_table=None,
         csv_chunksize=10 ** 6,
+        hdf_chunksize=10 ** 7,
+        hdf_metadata=None,
     ):
         """
         Parameters
         ----------
+        file_name
         hdf_tables: list of strings
             HDF keys with data corresponding to destination SQL table
             (assumption being that HDF tables:SQL tables is many:one)
-        hdf_meta: HDFMetadata object
-            Information from the HDF file for use in building copy objects
         defer_sql_objs: bool
             multiprocessing has issue with passing SQLALchemy objects, so if
             True, defer attributing these to the object until after pickled by Pool
-        conn: SQLAlchemy connection
+        conn: SQLAlchemy connection or None
             Managed outside of the object
-        table_obj: SQLAlchemy model object
+        table_obj: SQLAlchemy model object or None
             Destination SQL Table
-        sql_table: string
+        sql_table: string or None
             SQL table name
         csv_chunksize: int
             Max rows to keep in memory when generating CSV for COPY
+        hdf_chunksize: int
+            Max rows to keep in memory when reading HDF file
+        hdf_metadata: dict or None
+            Dict of HDF table keys to dict of constant:value pairs. Not actively used by
+            any pre-defined function, but available to data_formatting method
         """
         super().__init__(defer_sql_objs, conn, table_obj, sql_table, csv_chunksize)
 
         self.hdf_tables = hdf_tables
-
-        # Info from the HDFMetadata object
-        self.hdf_metadata = hdf_meta.metadata_vars
-        self.file_name = hdf_meta.file_name
-        self.hdf_chunksize = hdf_meta.chunksize
+        self.hdf_metadata = hdf_metadata
+        self.file_name = file_name
+        self.hdf_chunksize = hdf_chunksize
 
     def copy(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
         """
@@ -121,46 +125,6 @@ class SmallHDFTableCopy(HDFTableCopy):
     in-memory for both reading from the HDF as well as COPYing using StringIO.
     """
 
-    def __init__(
-        self,
-        hdf_tables,
-        hdf_meta,
-        defer_sql_objs=False,
-        conn=None,
-        table_obj=None,
-        sql_table=None,
-        csv_chunksize=10 ** 6,
-    ):
-        """
-        Parameters
-        ----------
-        hdf_tables: list of strings
-            HDF keys with data corresponding to destination SQL table
-            (assumption being that HDF tables:SQL tables is many:one)
-        hdf_meta: HDFMetadata object
-            Information from the HDF file for use in building copy objects
-        defer_sql_objs: bool
-            multiprocessing has issue with passing SQLALchemy objects, so if
-            True, defer attributing these to the object until after pickled by Pool
-        conn: SQLAlchemy connection
-            Managed outside of the object
-        table_obj: SQLAlchemy model object
-            Destination SQL Table
-        sql_table: string
-            SQL table name
-        csv_chunksize: int
-            Max rows to keep in memory when generating CSV for COPY
-        """
-        super().__init__(
-            hdf_tables,
-            hdf_meta,
-            defer_sql_objs,
-            conn,
-            table_obj,
-            sql_table,
-            csv_chunksize,
-        )
-
     def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
         """
         Copy each HDF table that relates to SQL table to database
@@ -206,46 +170,6 @@ class BigHDFTableCopy(HDFTableCopy):
     pd.read_hdf(..., iterator=True) because we found the performance was much better.
     """
 
-    def __init__(
-        self,
-        hdf_tables,
-        hdf_meta,
-        defer_sql_objs=False,
-        conn=None,
-        table_obj=None,
-        sql_table=None,
-        csv_chunksize=10 ** 6,
-    ):
-        """
-        Parameters
-        ----------
-        hdf_tables: list of strings
-            HDF keys with data corresponding to destination SQL table
-            (assumption being that HDF tables:SQL tables is many:one)
-        hdf_meta: HDFMetadata object
-            Information from the HDF file for use in building copy objects
-        defer_sql_objs: bool
-            multiprocessing has issue with passing SQLALchemy objects, so if
-            True, defer attributing these to the object until after pickled by Pool
-        conn: SQLAlchemy connection
-            Managed outside of the object
-        table_obj: SQLAlchemy model object
-            Destination SQL Table
-        sql_table: string
-            SQL table name
-        csv_chunksize: int
-            Max rows to keep in memory when generating CSV for COPY
-        """
-        super().__init__(
-            hdf_tables,
-            hdf_meta,
-            defer_sql_objs,
-            conn,
-            table_obj,
-            sql_table,
-            csv_chunksize,
-        )
-
     def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
         """
         Copy each HDF table that relates to SQL table to database
@@ -275,7 +199,7 @@ def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
             start = 0
 
             for i in range(n_chunks):
-                logger.info("*** HDF chunk {i + 1} of {} ***".format(n_chunks))
+                logger.info("*** HDF chunk {i} of {n} ***".format(i=i + 1, n=n_chunks))
                 logger.info("Reading HDF table")
                 stop = min(start + self.hdf_chunksize, nrows)
                 df = pd.read_hdf(self.file_name, key=hdf_table, start=start, stop=stop)
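
The removed logging call above never interpolated its values: str.format treats "{i + 1}" as a keyword field named "i + 1", and since no such keyword is supplied the call raises KeyError instead of printing the chunk number. A small stand-alone check of the corrected call (values are arbitrary):

    i, n_chunks = 0, 5
    print("*** HDF chunk {i} of {n} ***".format(i=i + 1, n=n_chunks))
    # prints: *** HDF chunk 1 of 5 ***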