blob: aeae9e32e6f5dd460e14779bdc8f92e1f9916005 [file] [log] [blame]
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""
Copies and renames a set of corpus files into a given directory.
"""
from absl import app
from absl import flags
from sys import stderr
import glob
import os
import shutil
FLAGS = flags.FLAGS
flags.DEFINE_list("corpus_list", [],
"Each element in the list stands for a corpus file")
flags.DEFINE_string("corpus_list_file", None,
"An optional file that lists corpus paths by lines")
flags.DEFINE_string("output_dir", None, "The path of the output directory")
flags.mark_flag_as_required("output_dir")
def expand_corpus_to_file_list(corpus, file_list):
if not os.path.exists(corpus):
raise FileNotFoundError("file " + corpus + " doesn't exist")
if os.path.isdir(corpus):
# The first element in glob("dir/**") is "dir/", which needs to be excluded
file_list.extend(glob.glob(os.path.join(corpus, "**"), recursive=True)[1:])
else:
file_list.append(corpus)
def main(argv):
if not os.path.exists(FLAGS.output_dir):
os.makedirs(FLAGS.output_dir)
expanded_file_list = []
for corpus in FLAGS.corpus_list:
expand_corpus_to_file_list(corpus, expanded_file_list)
if FLAGS.corpus_list_file:
with open(FLAGS.corpus_list_file) as corpus_list_file:
for corpus_line in corpus_list_file:
expand_corpus_to_file_list(
corpus_line.rstrip("\n"), expanded_file_list)
if expanded_file_list:
for corpus in expanded_file_list:
dest = os.path.join(FLAGS.output_dir, corpus.replace("/", "-"))
# Whatever the separator we choose, there is an chance that
# the dest name conflicts with another file
if os.path.exists(dest):
print("ERROR: file " + dest + " existed.", file=stderr)
return -1
shutil.copy(corpus, dest)
else:
open(os.path.join(FLAGS.output_dir, "empty_test"), "a").close()
return 0
if __name__ == '__main__':
app.run(main)