mirror of
https://github.com/originalmk/natural-merge-sort.git
synced 2024-10-03 12:06:10 +00:00
473 lines
14 KiB
Python
473 lines
14 KiB
Python
import os
|
|
import itertools
|
|
import math
|
|
from random import randint
|
|
|
|
BUFFER_SIZE = 32
|
|
SET_BYTES_SIZE = 15
|
|
RECORD_BYTES_SIZE = SET_BYTES_SIZE + 1
|
|
BYTES_BUFFER_SIZE = BUFFER_SIZE * RECORD_BYTES_SIZE
|
|
|
|
|
|
class ReadBuffer:
|
|
def __init__(self, file_path):
|
|
self.read_pos = 0
|
|
self.size = BUFFER_SIZE
|
|
self.loaded_size = 0
|
|
self.buffer = []
|
|
self.file_path = file_path
|
|
self.file_pos = 0
|
|
self.file_size = os.path.getsize(file_path)
|
|
self.disk_reads_count = 0
|
|
self.load_next()
|
|
|
|
# None if there is no next record
|
|
def read_next(self):
|
|
if not self.has_more():
|
|
return None
|
|
|
|
res_record = self.buffer[self.read_pos]
|
|
self.read_pos += 1
|
|
if self.read_pos == self.size:
|
|
self.load_next()
|
|
self.read_pos = 0
|
|
|
|
return res_record
|
|
|
|
|
|
def has_more(self):
|
|
return (self.file_pos < self.file_size
|
|
or self.read_pos < self.loaded_size)
|
|
|
|
def peek(self):
|
|
if self.read_pos == self.loaded_size:
|
|
return None
|
|
result = self.buffer[self.read_pos]
|
|
return result
|
|
|
|
def load_next(self):
|
|
self.buffer = []
|
|
# buffering=0 disables buffering, it is desired, because buffering
|
|
# is implemented here, in code
|
|
file = open(self.file_path, "rb", buffering=0)
|
|
file.seek(self.file_pos)
|
|
bytes_to_read = min(
|
|
BYTES_BUFFER_SIZE,
|
|
self.file_size - self.file_pos
|
|
)
|
|
temp_buffer = file.read(bytes_to_read)
|
|
|
|
if len(temp_buffer) % RECORD_BYTES_SIZE != 0:
|
|
raise Exception("Read bytes are not multiply of record size")
|
|
|
|
self.file_pos += bytes_to_read
|
|
self.loaded_size = bytes_to_read / RECORD_BYTES_SIZE
|
|
|
|
temp_ints = list(temp_buffer)
|
|
|
|
for i in range(len(temp_buffer) // RECORD_BYTES_SIZE):
|
|
record_ints = temp_ints[
|
|
RECORD_BYTES_SIZE * i:RECORD_BYTES_SIZE * (i + 1)
|
|
]
|
|
self.buffer.append(Record.load_from_ints(record_ints))
|
|
|
|
file.close()
|
|
self.disk_reads_count += 1
|
|
|
|
def __iter__(self):
|
|
return self
|
|
|
|
def __next__(self):
|
|
next_record = self.read_next()
|
|
if next_record is None:
|
|
raise StopIteration
|
|
return next_record
|
|
|
|
|
|
class WriteBuffer:
|
|
def __init__(self, file_path, append_mode=False):
|
|
self.write_pos = 0
|
|
self.size = BUFFER_SIZE
|
|
self.buffer = [None] * BUFFER_SIZE
|
|
self.file_path = file_path
|
|
if not append_mode and os.path.isfile(file_path):
|
|
os.remove(file_path)
|
|
self.runs_written = 0
|
|
self.last_written = None
|
|
self.disk_writes_count = 0
|
|
|
|
def write_next(self, record):
|
|
if record < self.last_written:
|
|
self.runs_written += 1
|
|
if self.write_pos == self.size:
|
|
self.flush()
|
|
self.buffer[self.write_pos] = record
|
|
self.write_pos += 1
|
|
self.last_written = record
|
|
|
|
def save_next(self):
|
|
ints_to_write = []
|
|
for record in self.buffer[0:self.write_pos]:
|
|
ints_to_write += record.save_to_ints() # type: ignore
|
|
|
|
file = open(self.file_path, "ab", buffering=0)
|
|
file.write(bytearray(ints_to_write))
|
|
file.close()
|
|
self.disk_writes_count += 1
|
|
|
|
def flush(self):
|
|
if self.write_pos > 0:
|
|
self.save_next()
|
|
self.write_pos = 0
|
|
|
|
|
|
class Record:
|
|
def __init__(self, items):
|
|
self.items = items
|
|
|
|
@staticmethod
|
|
def load_from_ints(record_ints):
|
|
set_length = record_ints[0]
|
|
set_items = record_ints[1:set_length + 1]
|
|
return Record(set_items)
|
|
|
|
def save_to_ints(self):
|
|
result = [len(self.items), *self.items]
|
|
zeropad = [0] * (RECORD_BYTES_SIZE - len(result))
|
|
result += zeropad
|
|
return result
|
|
|
|
def __repr__(self):
|
|
return f"Set {sorted(self.items, reverse=True)}"
|
|
|
|
def __lt__(self, other):
|
|
if other is None:
|
|
return True
|
|
|
|
self_items_copy = self.items[:]
|
|
other_items_copy = other.items[:]
|
|
|
|
for item in self_items_copy:
|
|
if item in other_items_copy:
|
|
self_items_copy.remove(item)
|
|
other_items_copy.remove(item)
|
|
|
|
if len(other_items_copy) == 0:
|
|
return False
|
|
elif len(self_items_copy) == 0:
|
|
return True
|
|
|
|
s_max = max(self_items_copy)
|
|
o_max = max(other_items_copy)
|
|
|
|
return o_max > s_max
|
|
|
|
|
|
class RunIterator:
|
|
def __init__(self, read_buffer):
|
|
self.read_buffer = read_buffer
|
|
self.current_record = None
|
|
self.end_of_run = False
|
|
|
|
def read_next(self):
|
|
if self.end_of_run:
|
|
return None
|
|
|
|
self.current_record = self.read_buffer.read_next()
|
|
|
|
if self.current_record is None:
|
|
return None
|
|
|
|
next_record = self.read_buffer.peek()
|
|
if next_record is not None and next_record < self.current_record:
|
|
self.end_of_run = True
|
|
|
|
return self.current_record
|
|
|
|
def __iter__(self):
|
|
return self
|
|
|
|
def __next__(self):
|
|
res_record = self.read_next()
|
|
if res_record is None:
|
|
raise StopIteration
|
|
return res_record
|
|
|
|
|
|
def print_tape(file_name):
|
|
print(f"[ .. ] Tape {file_name}\n")
|
|
|
|
buffer = ReadBuffer(file_name)
|
|
series_count = 0
|
|
records_count = 0
|
|
|
|
while buffer.has_more():
|
|
ri = RunIterator(buffer)
|
|
for record in ri:
|
|
print(record)
|
|
records_count += 1
|
|
series_count += 1
|
|
print("~ series end ~")
|
|
|
|
print(f"\n[ ^- ] Series count: {series_count}")
|
|
print(f"[ ^- ] Records count: {records_count}")
|
|
|
|
|
|
def print_runs(file_name, n):
|
|
print(f"Printing first {n} runs from {file_name}")
|
|
buff = ReadBuffer(file_name)
|
|
for i in range(n):
|
|
print(f"\nRun {i}:")
|
|
ri = RunIterator(buff)
|
|
for record in ri:
|
|
print(record)
|
|
|
|
|
|
def runs_count(file_name):
|
|
rc = 0
|
|
buff = ReadBuffer(file_name)
|
|
while buff.has_more():
|
|
ri = RunIterator(buff)
|
|
for _ in ri:
|
|
pass
|
|
rc += 1
|
|
return rc
|
|
|
|
|
|
# ==============================================================================
|
|
|
|
def prepare_tapes():
|
|
t1_dest = WriteBuffer("fs/t1")
|
|
for record in ReadBuffer("fs/start_tape"):
|
|
t1_dest.write_next(record)
|
|
t1_dest.flush()
|
|
|
|
if os.path.isfile("fs/t2"):
|
|
os.remove("fs/t2")
|
|
if os.path.isfile("fs/t3"):
|
|
os.remove("fs/t3")
|
|
|
|
|
|
class MetaInfo:
|
|
def __init__(self, reads_count, writes_count, runs_count):
|
|
self.reads_count = reads_count
|
|
self.writes_count = writes_count
|
|
self.runs_count = runs_count
|
|
|
|
|
|
def distribute(source_tape_path, first_dest_path, second_dest_path):
|
|
t1_buffer = ReadBuffer(source_tape_path)
|
|
t2_buffer = WriteBuffer(first_dest_path)
|
|
t3_buffer = WriteBuffer(second_dest_path)
|
|
|
|
last_record = t1_buffer.read_next()
|
|
t2_buffer.write_next(last_record)
|
|
dest_buffer = t2_buffer
|
|
|
|
i = 0
|
|
for record in t1_buffer:
|
|
# Not sorted pair of records
|
|
if record < last_record:
|
|
# Toggle
|
|
if dest_buffer == t2_buffer:
|
|
dest_buffer = t3_buffer
|
|
else:
|
|
dest_buffer = t2_buffer
|
|
i += 1
|
|
|
|
if dest_buffer == t2_buffer:
|
|
t2_buffer.write_next(record)
|
|
else:
|
|
t3_buffer.write_next(record)
|
|
|
|
last_record = record
|
|
|
|
t2_buffer.flush()
|
|
t3_buffer.flush()
|
|
|
|
return MetaInfo(t1_buffer.disk_reads_count,
|
|
t2_buffer.disk_writes_count + t3_buffer.disk_writes_count,
|
|
t2_buffer.runs_written + t3_buffer.runs_written)
|
|
|
|
|
|
# Read runs alternately from t2 and t3 (reading is done 1 record at a time, because we can't read whole run to memory)
|
|
# So one run may end and if it happens we need to write remaining records from non-empty run
|
|
# If any of t2 or t3 ends then just write all remaining runs from non-empty tape to t1
|
|
# For each two runs merge their records creating new run
|
|
# Write that run to t1
|
|
|
|
def merge_runs(rit1, rit2, write_buffer: WriteBuffer):
|
|
rit1_curr = rit1.read_next()
|
|
rit2_curr = rit2.read_next()
|
|
while rit1_curr is not None and rit2_curr is not None:
|
|
if rit1_curr < rit2_curr:
|
|
write_buffer.write_next(rit1_curr)
|
|
rit1_curr = rit1.read_next()
|
|
else:
|
|
write_buffer.write_next(rit2_curr)
|
|
rit2_curr = rit2.read_next()
|
|
if rit1_curr is not None:
|
|
write_buffer.write_next(rit1_curr)
|
|
for r in rit1:
|
|
write_buffer.write_next(r)
|
|
if rit2_curr is not None:
|
|
write_buffer.write_next(rit2_curr)
|
|
for r in rit2:
|
|
write_buffer.write_next(r)
|
|
|
|
|
|
def merge(first_source_path, second_source_path, dest_tape_path):
|
|
t1_buffer = WriteBuffer(dest_tape_path)
|
|
t2_buffer = ReadBuffer(first_source_path)
|
|
t3_buffer = ReadBuffer(second_source_path)
|
|
|
|
while t2_buffer.has_more() and t3_buffer.has_more():
|
|
merge_runs(RunIterator(t2_buffer), RunIterator(t3_buffer), t1_buffer)
|
|
|
|
for r in t2_buffer:
|
|
t1_buffer.write_next(r)
|
|
|
|
for r in t3_buffer:
|
|
t1_buffer.write_next(r)
|
|
|
|
t1_buffer.flush()
|
|
return MetaInfo(t2_buffer.disk_reads_count + t3_buffer.disk_reads_count,
|
|
t1_buffer.disk_writes_count,
|
|
t1_buffer.runs_written)
|
|
|
|
|
|
class SortInfo:
|
|
def __init__(self, reads_count, writes_count, phases_count):
|
|
self.reads_count = reads_count
|
|
self.writes_count = writes_count
|
|
self.phases_count = phases_count
|
|
|
|
|
|
def tape_sort(tape_path, print_after_phase=False):
|
|
runs_written = 0
|
|
phases_count = 0
|
|
reads_count = 0
|
|
writes_count = 0
|
|
while runs_written != 1:
|
|
dist_info = distribute(tape_path, "fs/t2", "fs/t3")
|
|
merge_info = merge("fs/t2", "fs/t3", tape_path)
|
|
runs_written = merge_info.runs_count
|
|
|
|
reads_count += dist_info.reads_count
|
|
reads_count += merge_info.reads_count
|
|
writes_count += dist_info.writes_count
|
|
writes_count += merge_info.writes_count
|
|
|
|
if print_after_phase:
|
|
print(f"[ -v ] Phase {phases_count + 1}")
|
|
print_tape(tape_path)
|
|
print(f"[ .. ] {runs_written} series remaining")
|
|
phases_count += 1
|
|
|
|
return SortInfo(reads_count, writes_count, phases_count)
|
|
|
|
help_page = """
|
|
help
|
|
displays the help page
|
|
clear <tape_path>
|
|
clears (removes) the tape
|
|
genrandom <path_to_tape> <records_count> [options]
|
|
adds record_count random records at the end of the tape
|
|
if 'o' is provided as an option then tape is overwritten with these
|
|
newly generated random records
|
|
add <path_to_tape> <numbers, ...>
|
|
adds new record (set created from 1 to 15 numbers provided in range 0-255)
|
|
at the end of the tape
|
|
load <path_to_tape> <path_to_test_file>
|
|
adds the records described in test file to the tape
|
|
sort <path_to_tape> [options]
|
|
sorts the provided tape displaying its contents before and after sorting
|
|
when option 'v' is provided then tape will be displayed after each
|
|
phase
|
|
display <path_to_tape>
|
|
displays the tape's records and information how many series and records it
|
|
contains. Also when displaying records it is shown when each of series
|
|
ends
|
|
exit
|
|
gracefully exits
|
|
"""
|
|
|
|
print("\nSorting using natural merge method (2+1)")
|
|
print("Author: Maciej Krzyżanowski\n")
|
|
should_run = True
|
|
while should_run:
|
|
cmd_line = input("> ")
|
|
match cmd_line.split():
|
|
case ["clear", tape_path]:
|
|
print(f"[ .. ] Clearing a tape {tape_path}")
|
|
if os.path.isfile(tape_path):
|
|
os.remove(tape_path)
|
|
print(f"[ :) ] Cleared the tape")
|
|
else:
|
|
print(f"[ :( ] There does not exist a tape at provided path")
|
|
case ["genrandom", tape_path, number_of_records, *options]:
|
|
if "o" in options:
|
|
write_buffer = WriteBuffer(tape_path)
|
|
else:
|
|
write_buffer = WriteBuffer(
|
|
tape_path, append_mode=True)
|
|
for i in range(int(number_of_records)):
|
|
set_length = randint(1, 15)
|
|
new_set = []
|
|
while len(new_set) != set_length:
|
|
new_suggestion = randint(0, 255)
|
|
if new_suggestion not in new_set:
|
|
new_set.append(new_suggestion)
|
|
new_record = Record(new_set)
|
|
write_buffer.write_next(new_record)
|
|
write_buffer.flush()
|
|
print(f"[ :) ] Added {number_of_records} new records to " +
|
|
f"tape {tape_path}")
|
|
case ["display", tape_path]:
|
|
print(f"[ :) ] Displaying tape {tape_path}")
|
|
print_tape(tape_path)
|
|
case ["add", tape_path, *set_elements]:
|
|
if len(set_elements) == 0:
|
|
print("[ :( ] No number was given")
|
|
continue
|
|
set_elements = [int(x) for x in set_elements]
|
|
new_record = Record(set_elements)
|
|
write_buffer = WriteBuffer(tape_path, append_mode=True)
|
|
write_buffer.write_next(new_record)
|
|
write_buffer.flush()
|
|
print(f"[ :) ] Dopisano podany rekord na taśmę")
|
|
case ["sort", tape_path, *options]:
|
|
print(f"[ .. ] Sorting tape {tape_path}")
|
|
print(f"[ -v ] Displaying tape before sorting:")
|
|
print_tape(tape_path)
|
|
if "v" in options:
|
|
sort_info = tape_sort(tape_path, print_after_phase=True)
|
|
else:
|
|
sort_info = tape_sort(tape_path)
|
|
print(f"[ -v ] Displaying tape after sorting:")
|
|
print_tape(tape_path)
|
|
print(f"[ :) ] Tape {tape_path} sorted!")
|
|
print(f"[ -v ] Sorting metadata:")
|
|
print(f"[ .. ] Phase count: {sort_info.phases_count}")
|
|
print(f"[ .. ] Reads count: {sort_info.reads_count}")
|
|
print(f"[ .. ] Writes count: {sort_info.writes_count}")
|
|
case ["load", tape_path, test_file_path]:
|
|
wb = WriteBuffer(tape_path, append_mode=True)
|
|
count = 0
|
|
with open(test_file_path) as test_file:
|
|
for line in test_file:
|
|
set_numbers = [int(s) for s in line.rstrip().split()]
|
|
new_record = Record(set_numbers)
|
|
wb.write_next(new_record)
|
|
count += 1
|
|
wb.flush()
|
|
print(f"[ :) ] Added {count} records to tape")
|
|
case ["help"]:
|
|
print(help_page)
|
|
case ["exit"]:
|
|
print("[ :) ] Goodbye!")
|
|
should_run = False
|
|
case _:
|
|
print("[ :( ] I don't know such a command")
|
|
|