split.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
  1. #!/usr/bin/env python3
  2. import argparse
  3. import sys
  4. import stringzilla
  5. from stringzilla import File, Str
  6. def parse_arguments():
  7. parser = argparse.ArgumentParser(
  8. description="Output pieces of FILE to PREFIXaa, PREFIXab, ...; default size is 1000 lines, and default PREFIX is 'x'."
  9. )
  10. parser.add_argument(
  11. "file", nargs="?", default="-", help='File to process, "-" for standard input'
  12. )
  13. parser.add_argument(
  14. "prefix", nargs="?", default="X", help='Output file prefix, default is "x"'
  15. )
  16. parser.add_argument(
  17. "-l",
  18. "--lines",
  19. type=int,
  20. default=1000,
  21. help="Number of lines per output file, default is 1000",
  22. )
  23. parser.add_argument(
  24. "-t",
  25. "--separator",
  26. default="\n",
  27. help="Use SEP instead of newline as the record separator; '\\0' (zero) specifies the NUL character",
  28. )
  29. parser.add_argument(
  30. "-n",
  31. "--number",
  32. type=int,
  33. default=None,
  34. help="Generate N output files based on size of input",
  35. )
  36. parser.add_argument("--version", action="version", version=stringzilla.__version__)
  37. return parser.parse_args()
  38. def split_file(file_path, lines_per_file, output_prefix, separator, number_of_files):
  39. try:
  40. if separator == "\\0":
  41. separator = "\0"
  42. if file_path == "-":
  43. file_contents = Str(sys.stdin.read())
  44. else:
  45. file_mapped = File(file_path)
  46. file_contents = Str(file_mapped)
  47. if number_of_files is not None:
  48. total_length = len(file_contents)
  49. chunk_size = total_length // number_of_files
  50. for file_part in range(number_of_files):
  51. start = file_part * chunk_size
  52. end = (
  53. start + chunk_size
  54. if file_part < number_of_files - 1
  55. else total_length
  56. )
  57. current_slice = file_contents[start:end]
  58. output_path = f"{output_prefix}{file_part}"
  59. current_slice.write_to(output_path)
  60. return
  61. current_position = 0
  62. file_part = 0
  63. newline_position = -1
  64. while current_position < len(file_contents):
  65. for _ in range(lines_per_file):
  66. newline_position = file_contents.find(separator, newline_position + 1)
  67. if newline_position == -1:
  68. break
  69. if newline_position == -1 and current_position < len(file_contents):
  70. newline_position = len(file_contents)
  71. section_length = (
  72. newline_position - current_position if newline_position != -1 else 0
  73. )
  74. if section_length > 0:
  75. current_slice = file_contents[current_position : newline_position + 1]
  76. output_path = f"{output_prefix}{file_part}"
  77. current_slice.write_to(output_path)
  78. file_part += 1
  79. current_position = newline_position + 1
  80. except FileNotFoundError:
  81. print(f"No such file: {file_path}")
  82. except Exception as e:
  83. print(f"An error occurred: {e}")
  84. print("Usage example: split.py [-l LINES] [file] [prefix]")
  85. def main():
  86. args = parse_arguments()
  87. split_file(args.file, args.lines, args.prefix, args.separator, args.number)
  88. if __name__ == "__main__":
  89. main()