Skip to content

Commit dc3a2cd

Browse files
authored
Batch: Rework Interface Into Queue, Task, File Objects (#3898)
* Rework batch interface to accept batch_job_submit(struct batch_task *t). This will allow us to pass properly formed file objects and make use of more features of taskvine from makeflow and the factory. * Makeflow now uses batch_job_submit(queue,task) as promised long ago. * Blue Waters was decomissioned in 2023. * Remove largely unused batch_fs operations. * Clean up batch_queue structures. * Revert batch_fs operations back to basic unix operations. * Remove unused functions, format for consistency. * Separate batch_job_info into a separate file. * Use batch_queue consistently to describe a queue. * batch_task -> batch_job * Update copyright boilerplate * Update batch_queue_cluster.c to new interface. * Update batch_queue_amazon.c to new interface. * Add back in batch_queue_amazon and batch_queue_cluster. * Replace throughout: batch_job -> batch_queue batch_task -> batch_job * Remove mesos from work_queue_factory. * Remove mesos support. * Remove MPI from makeflow. * Remove unused makeflow-mpi support. * Fix up batch_queue_k8s by transforming file lists to strings in batch_queue_k8s_submit. * Fix incorrect ordering of inner file / outer file. * batch_fs_unlink is replaced by unlink_recursive, not just unlink (caught by TR_makeflow_restart.sh) * Produce a factory log in test. * Do not attach worker log as output unless debugging is requested. * Move retired modules to old * Clean up makefile to incorporate new headers. * Complete autodocs for batch_job/file/queue/wrapper * Remove amazon_batch, lambda, and mesos options from makeflow. * Remove Amazon Batch, Lambda, Mesos, and MPI from man pages. * Remove Amazon Batch, Lambda, Mesos, MPI from manual. * explicitely -> explicitly * Remove unused chirp symbol * Make autodocs example complete with input file. * Remove unused items from makeflow man page. * Reorder and doc batch system types. * General renaming of sge to uge in code and manuals. sge is retained as an alias for uge in batch queue selection. * sge -> uge in vine_submit_workers * sge -> uge * format and lint batch_job module * Apply clang formatting rules. * Add "experimental" message for batch modules in which we have lower confidence. * queue -> remote_queue * Fix up condor afs error message * experimental flag should be a feature not an option
1 parent c92bbd3 commit dc3a2cd

File tree

100 files changed

+3575
-5733
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

100 files changed

+3575
-5733
lines changed

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,13 +49,13 @@ install: $(INSTALL_PACKAGES)
4949
test: $(CCTOOLS_PACKAGES)
5050
./run_all_tests.sh
5151

52-
PACKAGES_TO_LINT = taskvine dttools poncho resource_monitor
52+
PACKAGES_TO_LINT = taskvine resource_monitor batch_job dttools poncho
5353
LINT_PACKAGES = $(PACKAGES_TO_LINT:%=lint-%)
5454
$(LINT_PACKAGES): config.mk
5555
@$(MAKE) -C $(@:lint-%=%) lint
5656
lint: $(LINT_PACKAGES)
5757

58-
PACKAGES_TO_FORMAT = taskvine resource_monitor dttools
58+
PACKAGES_TO_FORMAT = taskvine resource_monitor batch_job dttools
5959
FORMAT_PACKAGES = $(PACKAGES_TO_FORMAT:%=format-%)
6060
$(FORMAT_PACKAGES): config.mk
6161
@$(MAKE) -C $(@:format-%=%) format

batch_job/src/Makefile

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -13,28 +13,21 @@ VINE_LIB=${CCTOOLS_HOME}/taskvine/src/manager/libtaskvine.a
1313
EXTERNAL_LIBRARIES=$(CHIRP_LIB) $(DT_LIB)
1414

1515
SOURCES = \
16+
batch_queue.c \
17+
batch_job.c \
18+
batch_job_info.c \
1619
batch_file.c \
17-
batch_task.c \
1820
batch_wrapper.c \
19-
batch_job.c \
20-
batch_job_amazon.c \
21-
batch_job_amazon_batch.c \
22-
batch_job_dryrun.c \
23-
$(CHIRP_BATCH) \
24-
batch_job_cluster.c \
25-
batch_job_blue_waters.c \
26-
batch_job_condor.c \
27-
batch_job_local.c \
28-
batch_job_work_queue.c\
29-
batch_job_lambda.c \
30-
batch_job_work_queue.c \
31-
batch_job_vine.c \
32-
batch_job_mesos.c \
33-
batch_job_k8s.c \
34-
batch_job_mpi.c \
35-
mesos_task.c
36-
37-
PUBLIC_HEADERS = batch_job.h
21+
batch_queue_local.c \
22+
batch_queue_dryrun.c \
23+
batch_queue_condor.c \
24+
batch_queue_vine.c \
25+
batch_queue_work_queue.c \
26+
batch_queue_cluster.c \
27+
batch_queue_k8s.c \
28+
batch_queue_amazon.c
29+
30+
PUBLIC_HEADERS = batch_queue.h batch_job.h batch_job_info.h batch_file.h batch_wrapper.h
3831

3932
OBJECTS = $(SOURCES:%.c=%.o)
4033

@@ -61,5 +54,17 @@ install:
6154
cp $(PUBLIC_HEADERS) $(CCTOOLS_INSTALL_DIR)/include/cctools
6255

6356
clean:
64-
rm -rf $(OBJECTS) $(LIBRARIES) $(PROGRAMS) batch_job_amazon_script.c *.o
57+
rm -rf $(OBJECTS) $(LIBRARIES) $(PROGRAMS) *.o
58+
59+
lint:
60+
if ( ! clang-format -Werror --dry-run --style='file:../../.clang-format' $(SOURCES));\
61+
then\
62+
echo "========================================================";\
63+
echo "NOTICE: Run `make format` to format your latest changes.";\
64+
echo "========================================================";\
65+
exit 1;\
66+
fi
67+
68+
format:
69+
clang-format -i $(SOURCES)
6570

batch_job/src/batch_file.c

Lines changed: 75 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
11
/*
2-
Copyright (C) 2022 The University of Notre Dame
2+
Copyright (C) 2024 The University of Notre Dame
33
This software is distributed under the GNU General Public License.
44
See the file COPYING for details.
55
*/
66

7+
#include "batch_queue.h"
8+
#include "batch_job.h"
79
#include "batch_file.h"
10+
811
#include "sha1.h"
912
#include "stringtools.h"
1013
#include "xxmalloc.h"
1114
#include "path.h"
1215
#include "hash_table.h"
16+
1317
#include <time.h>
1418
#include <sys/time.h>
1519
#include <sys/types.h>
@@ -26,26 +30,24 @@ double total_checksum_time = 0.0;
2630
* IF no inner_name is given, or the specified batch_queue does not support
2731
* remote renaming the outer_name will be used.
2832
**/
29-
struct batch_file *batch_file_create(struct batch_queue *queue, const char * outer_name, const char * inner_name)
33+
struct batch_file *batch_file_create(const char *outer_name, const char *inner_name)
3034
{
31-
struct batch_file *f = calloc(1,sizeof(*f));
32-
f->outer_name = xxstrdup(outer_name);
33-
34-
if(batch_queue_supports_feature(queue, "remote_rename") && inner_name){
35+
struct batch_file *f = calloc(1, sizeof(*f));
36+
f->outer_name = xxstrdup(outer_name);
37+
if (inner_name) {
3538
f->inner_name = xxstrdup(inner_name);
3639
} else {
3740
f->inner_name = xxstrdup(outer_name);
3841
}
39-
40-
return f;
42+
return f;
4143
}
4244

4345
/**
4446
* Delete batch_file, including freeing outer_name and inner_name/
4547
**/
4648
void batch_file_delete(struct batch_file *f)
4749
{
48-
if(!f)
50+
if (!f)
4951
return;
5052

5153
free(f->outer_name);
@@ -54,135 +56,123 @@ void batch_file_delete(struct batch_file *f)
5456
free(f);
5557
}
5658

57-
/**
58-
* Given a file, return the string that identifies it appropriately
59-
* for the given batch system, combining the local and remote name
60-
* and making substitutions according to the node.
61-
**/
62-
char * batch_file_to_string(struct batch_queue *queue, struct batch_file *f )
59+
char *batch_file_to_string(struct batch_file *f)
6360
{
64-
if(batch_queue_supports_feature(queue,"remote_rename")) {
65-
return string_format("%s=%s", f->outer_name, f->inner_name);
66-
} else {
67-
return string_format("%s", f->outer_name);
68-
}
61+
if (!strcmp(f->inner_name, f->outer_name)) {
62+
return strdup(f->outer_name);
63+
} else {
64+
return string_format("%s=%s", f->outer_name, f->inner_name);
65+
}
6966
}
7067

71-
/**
72-
* Given a list of files, add the files to the given string.
73-
* Returns the original string, realloced if necessary
74-
**/
75-
char * batch_files_to_string(struct batch_queue *queue, struct list *files )
68+
char *batch_file_list_to_string(struct list *file_list)
7669
{
77-
struct batch_file *file;
78-
79-
char * file_str = strdup("");
70+
struct batch_file *file;
8071

81-
char * separator = "";
72+
char *file_str = strdup("");
73+
char *separator = "";
8274

83-
if(!files) return file_str;
75+
if (!file_list)
76+
return file_str;
8477

85-
list_first_item(files);
86-
while((file=list_next_item(files))) {
78+
LIST_ITERATE(file_list, file)
79+
{
8780
/* Only add separator if past first item. */
88-
file_str = string_combine(file_str,separator);
81+
file_str = string_combine(file_str, separator);
8982

90-
char *f = batch_file_to_string(queue, file);
91-
file_str = string_combine(file_str,f);
92-
93-
/* This could be set using batch_queue feature or option
83+
char *f = batch_file_to_string(file);
84+
file_str = string_combine(file_str, f);
85+
86+
/* This could be set using batch_queue feature or option
9487
* to allow for batch system specific separators. */
9588
separator = ",";
9689

97-
free(f);
98-
}
90+
free(f);
91+
}
9992

100-
return file_str;
93+
return file_str;
10194
}
10295

103-
int batch_file_outer_compare(const void *file1, const void *file2) {
104-
struct batch_file **f1 = (void *)file1;
105-
struct batch_file **f2 = (void *)file2;
106-
107-
return strcmp((*f1)->outer_name, (*f2)->outer_name);
96+
int batch_file_outer_compare(struct batch_file *file1, struct batch_file *file2)
97+
{
98+
return strcmp(file1->outer_name, file2->outer_name);
10899
}
109100

110101
/* Return the content based ID for a file.
111102
* generates the checksum of a file's contents if does not exist */
112-
char * batch_file_generate_id(struct batch_file *f) {
113-
if(check_sums == NULL){
114-
check_sums = hash_table_create(0,0);
115-
}
103+
char *batch_file_generate_id(struct batch_file *f)
104+
{
105+
if (check_sums == NULL) {
106+
check_sums = hash_table_create(0, 0);
107+
}
116108
char *check_sum_value = hash_table_lookup(check_sums, f->outer_name);
117-
if(check_sum_value == NULL){
109+
if (check_sum_value == NULL) {
118110
unsigned char hash[SHA1_DIGEST_LENGTH];
119111
struct timeval start_time;
120-
struct timeval end_time;
112+
struct timeval end_time;
121113

122-
gettimeofday(&start_time,NULL);
114+
gettimeofday(&start_time, NULL);
123115
int success = sha1_file(f->outer_name, hash);
124-
gettimeofday(&end_time,NULL);
125-
double run_time = ((end_time.tv_sec*1000000 + end_time.tv_usec) - (start_time.tv_sec*1000000 + start_time.tv_usec)) / 1000000.0;
126-
total_checksum_time += run_time;
127-
debug(D_MAKEFLOW_HOOK," The total checksum time is %lf",total_checksum_time);
128-
if(success == 0){
116+
gettimeofday(&end_time, NULL);
117+
double run_time = ((end_time.tv_sec * 1000000 + end_time.tv_usec) - (start_time.tv_sec * 1000000 + start_time.tv_usec)) / 1000000.0;
118+
total_checksum_time += run_time;
119+
debug(D_MAKEFLOW_HOOK, " The total checksum time is %lf", total_checksum_time);
120+
if (success == 0) {
129121
debug(D_MAKEFLOW, "Unable to checksum this file: %s", f->outer_name);
130122
return NULL;
131123
}
132124
f->hash = xxstrdup(sha1_string(hash));
133125
hash_table_insert(check_sums, f->outer_name, xxstrdup(sha1_string(hash)));
134-
debug(D_MAKEFLOW,"Checksum hash of %s is: %s",f->outer_name,f->hash);
126+
debug(D_MAKEFLOW, "Checksum hash of %s is: %s", f->outer_name, f->hash);
135127
return xxstrdup(f->hash);
136128
}
137-
debug(D_MAKEFLOW,"Checksum already exists in hash table. Cached CHECKSUM hash of %s is: %s", f->outer_name, check_sum_value);
129+
debug(D_MAKEFLOW, "Checksum already exists in hash table. Cached CHECKSUM hash of %s is: %s", f->outer_name, check_sum_value);
138130
return xxstrdup(check_sum_value);
139131
}
140132

141-
142133
/* Return the content based ID for a directory.
143134
* generates the checksum for the directories contents if does not exist
144135
* *NEED TO ACCOUNT FOR SYMLINKS LATER* */
145-
char * batch_file_generate_id_dir(char *file_name){
146-
if(check_sums == NULL){
147-
check_sums = hash_table_create(0,0);
136+
char *batch_file_generate_id_dir(char *file_name)
137+
{
138+
if (check_sums == NULL) {
139+
check_sums = hash_table_create(0, 0);
148140
}
149141
char *check_sum_value = hash_table_lookup(check_sums, file_name);
150-
if(check_sum_value == NULL){
142+
if (check_sum_value == NULL) {
151143
char *hash_sum = "";
152144
struct dirent **dp;
153145
int num;
154146
// Scans directory and sorts in reverse order
155147
num = scandir(file_name, &dp, NULL, alphasort);
156-
if(num < 0){
157-
debug(D_MAKEFLOW,"Unable to scan %s", file_name);
148+
if (num < 0) {
149+
debug(D_MAKEFLOW, "Unable to scan %s", file_name);
158150
return NULL;
159-
}
160-
else{
151+
} else {
161152
int i;
162-
for(i = num - 1; i >= 0; i--) {
163-
if(strcmp(dp[i]->d_name,".") != 0 && strcmp(dp[i]->d_name,"..") != 0){
164-
char *file_path = string_format("%s/%s",file_name,dp[i]->d_name);
165-
if(path_is_dir(file_path) == 1){
166-
hash_sum = string_format("%s%s",hash_sum,batch_file_generate_id_dir(file_path));
167-
}
168-
else{
153+
for (i = num - 1; i >= 0; i--) {
154+
if (strcmp(dp[i]->d_name, ".") != 0 && strcmp(dp[i]->d_name, "..") != 0) {
155+
char *file_path = string_format("%s/%s", file_name, dp[i]->d_name);
156+
if (path_is_dir(file_path) == 1) {
157+
hash_sum = string_format("%s%s", hash_sum, batch_file_generate_id_dir(file_path));
158+
} else {
169159
unsigned char hash[SHA1_DIGEST_LENGTH];
170160
struct timeval start_time;
171-
struct timeval end_time;
161+
struct timeval end_time;
172162

173-
gettimeofday(&start_time,NULL);
163+
gettimeofday(&start_time, NULL);
174164
int success = sha1_file(file_path, hash);
175-
gettimeofday(&end_time,NULL);
176-
double run_time = ((end_time.tv_sec*1000000 + end_time.tv_usec) - (start_time.tv_sec*1000000 + start_time.tv_usec)) / 1000000.0;
177-
total_checksum_time += run_time;
178-
debug(D_MAKEFLOW_HOOK," The total checksum time is %lf",total_checksum_time);
179-
if(success == 0){
165+
gettimeofday(&end_time, NULL);
166+
double run_time = ((end_time.tv_sec * 1000000 + end_time.tv_usec) - (start_time.tv_sec * 1000000 + start_time.tv_usec)) / 1000000.0;
167+
total_checksum_time += run_time;
168+
debug(D_MAKEFLOW_HOOK, " The total checksum time is %lf", total_checksum_time);
169+
if (success == 0) {
180170
debug(D_MAKEFLOW, "Unable to checksum this file: %s", file_path);
181171
free(file_path);
182172
free(dp[i]);
183173
continue;
184174
}
185-
hash_sum = string_format("%s%s:%s",hash_sum,file_name,sha1_string(hash));
175+
hash_sum = string_format("%s%s:%s", hash_sum, file_name, sha1_string(hash));
186176
}
187177
free(file_path);
188178
}
@@ -193,10 +183,10 @@ char * batch_file_generate_id_dir(char *file_name){
193183
sha1_buffer(hash_sum, strlen(hash_sum), hash);
194184
free(hash_sum);
195185
hash_table_insert(check_sums, file_name, xxstrdup(sha1_string(hash)));
196-
debug(D_MAKEFLOW,"Checksum hash of %s is: %s",file_name,sha1_string(hash));
186+
debug(D_MAKEFLOW, "Checksum hash of %s is: %s", file_name, sha1_string(hash));
197187
return xxstrdup(sha1_string(hash));
198188
}
199189
}
200-
debug(D_MAKEFLOW,"Checksum already exists in hash table. Cached CHECKSUM hash of %s is: %s", file_name, check_sum_value);
190+
debug(D_MAKEFLOW, "Checksum already exists in hash table. Cached CHECKSUM hash of %s is: %s", file_name, check_sum_value);
201191
return check_sum_value;
202192
}

0 commit comments

Comments
 (0)