@@ -21,6 +21,8 @@ use datafusion_data_access::FileMeta;
21
21
use futures:: stream:: BoxStream ;
22
22
use futures:: { StreamExt , TryStreamExt } ;
23
23
use glob:: Pattern ;
24
+ use std:: borrow:: Cow ;
25
+ use std:: path:: { is_separator, MAIN_SEPARATOR } ;
24
26
use url:: Url ;
25
27
26
28
/// A parsed URL identifying files for a listing table, see [`ListingTableUrl::parse`]
@@ -99,17 +101,25 @@ impl ListingTableUrl {
99
101
100
102
/// Returns the path as expected by [`ObjectStore`]
101
103
///
102
- /// In particular for file scheme URLs, this has a leading `/`
103
- /// and describes an absolute path on the local filesystem
104
+ /// In particular for file scheme URLs, this is an absolute path
105
+ /// on the local filesystem in the OS-specific path representation
104
106
///
105
- /// For other URLs, this also contains the host component
106
- /// and lacks a leading `/`
107
+ /// For other URLs, this is the host and path of the URL,
108
+ /// delimited by `/`, and with no leading `/`
107
109
///
108
110
/// TODO: Handle paths consistently (#2489)
109
- fn prefix ( & self ) -> & str {
111
+ fn prefix ( & self ) -> Cow < ' _ , str > {
110
112
match self . scheme ( ) {
111
- "file" => self . url . path ( ) ,
112
- _ => & self . url [ url:: Position :: BeforeHost ..url:: Position :: AfterPath ] ,
113
+ "file" => match MAIN_SEPARATOR {
114
+ '/' => Cow :: Borrowed ( self . url . path ( ) ) ,
115
+ _ => {
116
+ let path = self . url . to_file_path ( ) . unwrap ( ) ;
117
+ Cow :: Owned ( path. to_string_lossy ( ) . to_string ( ) )
118
+ }
119
+ } ,
120
+ _ => Cow :: Borrowed (
121
+ & self . url [ url:: Position :: BeforeHost ..url:: Position :: AfterPath ] ,
122
+ ) ,
113
123
}
114
124
}
115
125
@@ -119,10 +129,12 @@ impl ListingTableUrl {
119
129
& ' a self ,
120
130
path : & ' b str ,
121
131
) -> Option < impl Iterator < Item = & ' b str > + ' a > {
132
+ let prefix = self . prefix ( ) ;
122
133
// Ignore empty path segments
123
134
let diff = itertools:: diff_with (
124
- path. split ( '/' ) . filter ( |s| !s. is_empty ( ) ) ,
125
- self . prefix ( ) . split ( '/' ) . filter ( |s| !s. is_empty ( ) ) ,
135
+ // TODO: Handle paths consistently (#2489)
136
+ path. split ( is_separator) . filter ( |s| !s. is_empty ( ) ) ,
137
+ prefix. split ( is_separator) . filter ( |s| !s. is_empty ( ) ) ,
126
138
|a, b| a == b,
127
139
) ;
128
140
@@ -139,24 +151,27 @@ impl ListingTableUrl {
139
151
store : & ' a dyn ObjectStore ,
140
152
file_extension : & ' a str ,
141
153
) -> BoxStream < ' a , Result < FileMeta > > {
142
- futures:: stream:: once ( store. list_file ( self . prefix ( ) ) )
143
- . try_flatten ( )
144
- . map_err ( DataFusionError :: IoError )
145
- . try_filter ( move |meta| {
146
- let path = meta. path ( ) ;
147
-
148
- let extension_match = path. ends_with ( file_extension) ;
149
- let glob_match = match & self . glob {
150
- Some ( glob) => match path. strip_prefix ( self . url . path ( ) ) {
151
- Some ( stripped) => glob. matches ( stripped) ,
152
- None => false ,
153
- } ,
154
- None => true ,
155
- } ;
154
+ futures:: stream:: once ( async move {
155
+ let prefix = self . prefix ( ) ;
156
+ store. list_file ( prefix. as_ref ( ) ) . await
157
+ } )
158
+ . try_flatten ( )
159
+ . map_err ( DataFusionError :: IoError )
160
+ . try_filter ( move |meta| {
161
+ let path = meta. path ( ) ;
162
+
163
+ let extension_match = path. ends_with ( file_extension) ;
164
+ let glob_match = match & self . glob {
165
+ Some ( glob) => match path. strip_prefix ( self . url . path ( ) ) {
166
+ Some ( stripped) => glob. matches ( stripped) ,
167
+ None => false ,
168
+ } ,
169
+ None => true ,
170
+ } ;
156
171
157
- futures:: future:: ready ( extension_match && glob_match)
158
- } )
159
- . boxed ( )
172
+ futures:: future:: ready ( extension_match && glob_match)
173
+ } )
174
+ . boxed ( )
160
175
}
161
176
}
162
177
@@ -194,10 +209,32 @@ fn split_glob_expression(path: &str) -> Option<(&str, &str)> {
194
209
195
210
#[ cfg( test) ]
196
211
mod tests {
197
- use crate :: datasource:: listing:: path:: split_glob_expression;
212
+ use super :: * ;
213
+ use std:: path:: Path ;
214
+
215
+ #[ test]
216
+ fn test_prefix_path ( ) {
217
+ let parent = Path :: new ( "../" ) . canonicalize ( ) . unwrap ( ) ;
218
+ let url = ListingTableUrl :: parse ( parent. to_string_lossy ( ) ) . unwrap ( ) ;
219
+
220
+ let path = Path :: new ( "." ) . canonicalize ( ) . unwrap ( ) ;
221
+ let path = path. to_string_lossy ( ) ;
222
+
223
+ assert_eq ! ( url. strip_prefix( path. as_ref( ) ) . unwrap( ) . count( ) , 1 ) ;
224
+ }
225
+
226
+ #[ test]
227
+ fn test_prefix_s3 ( ) {
228
+ let url = ListingTableUrl :: parse ( "s3://bucket/foo/bar" ) . unwrap ( ) ;
229
+ assert_eq ! ( url. prefix( ) , "bucket/foo/bar" ) ;
230
+
231
+ let path = "bucket/foo/bar/partition/foo.parquet" ;
232
+ let prefix: Vec < _ > = url. strip_prefix ( path) . unwrap ( ) . collect ( ) ;
233
+ assert_eq ! ( prefix, vec![ "partition" , "foo.parquet" ] ) ;
234
+ }
198
235
199
- #[ tokio :: test]
200
- async fn test_split_glob ( ) {
236
+ #[ test]
237
+ fn test_split_glob ( ) {
201
238
fn test ( input : & str , expected : Option < ( & str , & str ) > ) {
202
239
assert_eq ! (
203
240
split_glob_expression( input) ,
0 commit comments