import os import itertools import math from random import randint BUFFER_SIZE = 32 SET_BYTES_SIZE = 15 RECORD_BYTES_SIZE = SET_BYTES_SIZE + 1 BYTES_BUFFER_SIZE = BUFFER_SIZE * RECORD_BYTES_SIZE class ReadBuffer: def __init__(self, file_path): self.read_pos = 0 self.size = BUFFER_SIZE self.loaded_size = 0 self.buffer = [] self.file_path = file_path self.file_pos = 0 self.file_size = os.path.getsize(file_path) self.disk_reads_count = 0 self.load_next() # None if there is no next record def read_next(self): if not self.has_more(): return None res_record = self.buffer[self.read_pos] self.read_pos += 1 if self.read_pos == self.size: self.load_next() self.read_pos = 0 return res_record def has_more(self): return (self.file_pos < self.file_size or self.read_pos < self.loaded_size) def peek(self): if self.read_pos == self.loaded_size: return None result = self.buffer[self.read_pos] return result def load_next(self): self.buffer = [] # buffering=0 disables buffering, it is desired, because buffering # is implemented here, in code file = open(self.file_path, "rb", buffering=0) file.seek(self.file_pos) bytes_to_read = min( BYTES_BUFFER_SIZE, self.file_size - self.file_pos ) temp_buffer = file.read(bytes_to_read) if len(temp_buffer) % RECORD_BYTES_SIZE != 0: raise Exception("Read bytes are not multiply of record size") self.file_pos += bytes_to_read self.loaded_size = bytes_to_read / RECORD_BYTES_SIZE temp_ints = list(temp_buffer) for i in range(len(temp_buffer) // RECORD_BYTES_SIZE): record_ints = temp_ints[ RECORD_BYTES_SIZE * i:RECORD_BYTES_SIZE * (i + 1) ] self.buffer.append(Record.load_from_ints(record_ints)) file.close() self.disk_reads_count += 1 def __iter__(self): return self def __next__(self): next_record = self.read_next() if next_record is None: raise StopIteration return next_record class WriteBuffer: def __init__(self, file_path, append_mode=False): self.write_pos = 0 self.size = BUFFER_SIZE self.buffer = [None] * BUFFER_SIZE self.file_path = file_path if not append_mode and os.path.isfile(file_path): os.remove(file_path) self.runs_written = 0 self.last_written = None self.disk_writes_count = 0 def write_next(self, record): if record < self.last_written: self.runs_written += 1 if self.write_pos == self.size: self.flush() self.buffer[self.write_pos] = record self.write_pos += 1 self.last_written = record def save_next(self): ints_to_write = [] for record in self.buffer[0:self.write_pos]: ints_to_write += record.save_to_ints() # type: ignore file = open(self.file_path, "ab", buffering=0) file.write(bytearray(ints_to_write)) file.close() self.disk_writes_count += 1 def flush(self): if self.write_pos > 0: self.save_next() self.write_pos = 0 class Record: def __init__(self, items): self.items = items @staticmethod def load_from_ints(record_ints): set_length = record_ints[0] set_items = record_ints[1:set_length + 1] return Record(set_items) def save_to_ints(self): result = [len(self.items), *self.items] zeropad = [0] * (RECORD_BYTES_SIZE - len(result)) result += zeropad return result def __repr__(self): return f"Set {sorted(self.items, reverse=True)}" def __lt__(self, other): if other is None: return True self_items_copy = self.items[:] other_items_copy = other.items[:] for item in self_items_copy: if item in other_items_copy: self_items_copy.remove(item) other_items_copy.remove(item) if len(other_items_copy) == 0: return False elif len(self_items_copy) == 0: return True s_max = max(self_items_copy) o_max = max(other_items_copy) return o_max > s_max class RunIterator: def __init__(self, read_buffer): self.read_buffer = read_buffer self.current_record = None self.end_of_run = False def read_next(self): if self.end_of_run: return None self.current_record = self.read_buffer.read_next() if self.current_record is None: return None next_record = self.read_buffer.peek() if next_record is not None and next_record < self.current_record: self.end_of_run = True return self.current_record def __iter__(self): return self def __next__(self): res_record = self.read_next() if res_record is None: raise StopIteration return res_record def print_tape(file_name): print(f"[ .. ] Tape {file_name}\n") buffer = ReadBuffer(file_name) series_count = 0 records_count = 0 while buffer.has_more(): ri = RunIterator(buffer) for record in ri: print(record) records_count += 1 series_count += 1 print("~ series end ~") print(f"\n[ ^- ] Series count: {series_count}") print(f"[ ^- ] Records count: {records_count}") def print_runs(file_name, n): print(f"Printing first {n} runs from {file_name}") buff = ReadBuffer(file_name) for i in range(n): print(f"\nRun {i}:") ri = RunIterator(buff) for record in ri: print(record) def runs_count(file_name): rc = 0 buff = ReadBuffer(file_name) while buff.has_more(): ri = RunIterator(buff) for _ in ri: pass rc += 1 return rc # ============================================================================== def prepare_tapes(): t1_dest = WriteBuffer("fs/t1") for record in ReadBuffer("fs/start_tape"): t1_dest.write_next(record) t1_dest.flush() if os.path.isfile("fs/t2"): os.remove("fs/t2") if os.path.isfile("fs/t3"): os.remove("fs/t3") class MetaInfo: def __init__(self, reads_count, writes_count, runs_count): self.reads_count = reads_count self.writes_count = writes_count self.runs_count = runs_count def distribute(source_tape_path, first_dest_path, second_dest_path): t1_buffer = ReadBuffer(source_tape_path) t2_buffer = WriteBuffer(first_dest_path) t3_buffer = WriteBuffer(second_dest_path) last_record = t1_buffer.read_next() t2_buffer.write_next(last_record) dest_buffer = t2_buffer i = 0 for record in t1_buffer: # Not sorted pair of records if record < last_record: # Toggle if dest_buffer == t2_buffer: dest_buffer = t3_buffer else: dest_buffer = t2_buffer i += 1 if dest_buffer == t2_buffer: t2_buffer.write_next(record) else: t3_buffer.write_next(record) last_record = record t2_buffer.flush() t3_buffer.flush() return MetaInfo(t1_buffer.disk_reads_count, t2_buffer.disk_writes_count + t3_buffer.disk_writes_count, t2_buffer.runs_written + t3_buffer.runs_written) # Read runs alternately from t2 and t3 (reading is done 1 record at a time, because we can't read whole run to memory) # So one run may end and if it happens we need to write remaining records from non-empty run # If any of t2 or t3 ends then just write all remaining runs from non-empty tape to t1 # For each two runs merge their records creating new run # Write that run to t1 def merge_runs(rit1, rit2, write_buffer: WriteBuffer): rit1_curr = rit1.read_next() rit2_curr = rit2.read_next() while rit1_curr is not None and rit2_curr is not None: if rit1_curr < rit2_curr: write_buffer.write_next(rit1_curr) rit1_curr = rit1.read_next() else: write_buffer.write_next(rit2_curr) rit2_curr = rit2.read_next() if rit1_curr is not None: write_buffer.write_next(rit1_curr) for r in rit1: write_buffer.write_next(r) if rit2_curr is not None: write_buffer.write_next(rit2_curr) for r in rit2: write_buffer.write_next(r) def merge(first_source_path, second_source_path, dest_tape_path): t1_buffer = WriteBuffer(dest_tape_path) t2_buffer = ReadBuffer(first_source_path) t3_buffer = ReadBuffer(second_source_path) while t2_buffer.has_more() and t3_buffer.has_more(): merge_runs(RunIterator(t2_buffer), RunIterator(t3_buffer), t1_buffer) for r in t2_buffer: t1_buffer.write_next(r) for r in t3_buffer: t1_buffer.write_next(r) t1_buffer.flush() return MetaInfo(t2_buffer.disk_reads_count + t3_buffer.disk_reads_count, t1_buffer.disk_writes_count, t1_buffer.runs_written) class SortInfo: def __init__(self, reads_count, writes_count, phases_count): self.reads_count = reads_count self.writes_count = writes_count self.phases_count = phases_count def tape_sort(tape_path, print_after_phase=False): runs_written = 0 phases_count = 0 reads_count = 0 writes_count = 0 while runs_written != 1: dist_info = distribute(tape_path, "fs/t2", "fs/t3") merge_info = merge("fs/t2", "fs/t3", tape_path) runs_written = merge_info.runs_count reads_count += dist_info.reads_count reads_count += merge_info.reads_count writes_count += dist_info.writes_count writes_count += merge_info.writes_count if print_after_phase: print(f"[ -v ] Phase {phases_count + 1}") print_tape(tape_path) print(f"[ .. ] {runs_written} series remaining") phases_count += 1 return SortInfo(reads_count, writes_count, phases_count) help_page = """ help displays the help page clear clears (removes) the tape genrandom [options] adds record_count random records at the end of the tape if 'o' is provided as an option then tape is overwritten with these newly generated random records add adds new record (set created from 1 to 15 numbers provided in range 0-255) at the end of the tape load adds the records described in test file to the tape sort [options] sorts the provided tape displaying its contents before and after sorting when option 'v' is provided then tape will be displayed after each phase display displays the tape's records and information how many series and records it contains. Also when displaying records it is shown when each of series ends exit gracefully exits """ print("\nSorting using natural merge method (2+1)") print("Author: Maciej Krzyżanowski\n") should_run = True while should_run: cmd_line = input("> ") match cmd_line.split(): case ["clear", tape_path]: print(f"[ .. ] Clearing a tape {tape_path}") if os.path.isfile(tape_path): os.remove(tape_path) print(f"[ :) ] Cleared the tape") else: print(f"[ :( ] There does not exist a tape at provided path") case ["genrandom", tape_path, number_of_records, *options]: if "o" in options: write_buffer = WriteBuffer(tape_path) else: write_buffer = WriteBuffer( tape_path, append_mode=True) for i in range(int(number_of_records)): set_length = randint(1, 15) new_set = [] while len(new_set) != set_length: new_suggestion = randint(0, 255) if new_suggestion not in new_set: new_set.append(new_suggestion) new_record = Record(new_set) write_buffer.write_next(new_record) write_buffer.flush() print(f"[ :) ] Added {number_of_records} new records to " + f"tape {tape_path}") case ["display", tape_path]: print(f"[ :) ] Displaying tape {tape_path}") print_tape(tape_path) case ["add", tape_path, *set_elements]: if len(set_elements) == 0: print("[ :( ] No number was given") continue set_elements = [int(x) for x in set_elements] new_record = Record(set_elements) write_buffer = WriteBuffer(tape_path, append_mode=True) write_buffer.write_next(new_record) write_buffer.flush() print(f"[ :) ] Dopisano podany rekord na taśmę") case ["sort", tape_path, *options]: print(f"[ .. ] Sorting tape {tape_path}") print(f"[ -v ] Displaying tape before sorting:") print_tape(tape_path) if "v" in options: sort_info = tape_sort(tape_path, print_after_phase=True) else: sort_info = tape_sort(tape_path) print(f"[ -v ] Displaying tape after sorting:") print_tape(tape_path) print(f"[ :) ] Tape {tape_path} sorted!") print(f"[ -v ] Sorting metadata:") print(f"[ .. ] Phase count: {sort_info.phases_count}") print(f"[ .. ] Reads count: {sort_info.reads_count}") print(f"[ .. ] Writes count: {sort_info.writes_count}") case ["load", tape_path, test_file_path]: wb = WriteBuffer(tape_path, append_mode=True) count = 0 with open(test_file_path) as test_file: for line in test_file: set_numbers = [int(s) for s in line.rstrip().split()] new_record = Record(set_numbers) wb.write_next(new_record) count += 1 wb.flush() print(f"[ :) ] Added {count} records to tape") case ["help"]: print(help_page) case ["exit"]: print("[ :) ] Goodbye!") should_run = False case _: print("[ :( ] I don't know such a command")