commit 8b532a39bae66225a63d003a56a1559b5baf8935 Author: Maciej Krzyżanowski Date: Wed Mar 6 20:24:11 2024 +0100 Initial commit, project as it was submitted diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8adedf8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +fs/t1 +fs/t2 +fs/t3 diff --git a/dump.sh b/dump.sh new file mode 100755 index 0000000..a66c534 --- /dev/null +++ b/dump.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +echo "T1:" +od fs/t1 -t x1 +echo "T2:" +od fs/t2 -t x1 +echo "T3:" +od fs/t3 -t x1 diff --git a/fs/testfile b/fs/testfile new file mode 100644 index 0000000..f692b57 --- /dev/null +++ b/fs/testfile @@ -0,0 +1,8 @@ +10 9 8 7 +9 8 7 6 +12 11 10 9 +8 7 6 5 +15 14 13 12 +14 13 12 11 +18 17 16 15 +13 12 11 10 diff --git a/naturalmerge.py b/naturalmerge.py new file mode 100644 index 0000000..514756a --- /dev/null +++ b/naturalmerge.py @@ -0,0 +1,472 @@ +import os +import itertools +import math +from random import randint + +BUFFER_SIZE = 32 +SET_BYTES_SIZE = 15 +RECORD_BYTES_SIZE = SET_BYTES_SIZE + 1 +BYTES_BUFFER_SIZE = BUFFER_SIZE * RECORD_BYTES_SIZE + + +class ReadBuffer: + def __init__(self, file_path): + self.read_pos = 0 + self.size = BUFFER_SIZE + self.loaded_size = 0 + self.buffer = [] + self.file_path = file_path + self.file_pos = 0 + self.file_size = os.path.getsize(file_path) + self.disk_reads_count = 0 + self.load_next() + + # None if there is no next record + def read_next(self): + if not self.has_more(): + return None + + res_record = self.buffer[self.read_pos] + self.read_pos += 1 + if self.read_pos == self.size: + self.load_next() + self.read_pos = 0 + + return res_record + + + def has_more(self): + return (self.file_pos < self.file_size + or self.read_pos < self.loaded_size) + + def peek(self): + if self.read_pos == self.loaded_size: + return None + result = self.buffer[self.read_pos] + return result + + def load_next(self): + self.buffer = [] + # buffering=0 disables buffering, it is desired, because buffering + # is implemented here, in code + file = open(self.file_path, "rb", buffering=0) + file.seek(self.file_pos) + bytes_to_read = min( + BYTES_BUFFER_SIZE, + self.file_size - self.file_pos + ) + temp_buffer = file.read(bytes_to_read) + + if len(temp_buffer) % RECORD_BYTES_SIZE != 0: + raise Exception("Read bytes are not multiply of record size") + + self.file_pos += bytes_to_read + self.loaded_size = bytes_to_read / RECORD_BYTES_SIZE + + temp_ints = list(temp_buffer) + + for i in range(len(temp_buffer) // RECORD_BYTES_SIZE): + record_ints = temp_ints[ + RECORD_BYTES_SIZE * i:RECORD_BYTES_SIZE * (i + 1) + ] + self.buffer.append(Record.load_from_ints(record_ints)) + + file.close() + self.disk_reads_count += 1 + + def __iter__(self): + return self + + def __next__(self): + next_record = self.read_next() + if next_record is None: + raise StopIteration + return next_record + + +class WriteBuffer: + def __init__(self, file_path, append_mode=False): + self.write_pos = 0 + self.size = BUFFER_SIZE + self.buffer = [None] * BUFFER_SIZE + self.file_path = file_path + if not append_mode and os.path.isfile(file_path): + os.remove(file_path) + self.runs_written = 0 + self.last_written = None + self.disk_writes_count = 0 + + def write_next(self, record): + if record < self.last_written: + self.runs_written += 1 + if self.write_pos == self.size: + self.flush() + self.buffer[self.write_pos] = record + self.write_pos += 1 + self.last_written = record + + def save_next(self): + ints_to_write = [] + for record in self.buffer[0:self.write_pos]: + ints_to_write += record.save_to_ints() # type: ignore + + file = open(self.file_path, "ab", buffering=0) + file.write(bytearray(ints_to_write)) + file.close() + self.disk_writes_count += 1 + + def flush(self): + if self.write_pos > 0: + self.save_next() + self.write_pos = 0 + + +class Record: + def __init__(self, items): + self.items = items + + @staticmethod + def load_from_ints(record_ints): + set_length = record_ints[0] + set_items = record_ints[1:set_length + 1] + return Record(set_items) + + def save_to_ints(self): + result = [len(self.items), *self.items] + zeropad = [0] * (RECORD_BYTES_SIZE - len(result)) + result += zeropad + return result + + def __repr__(self): + return f"Zbiór {sorted(self.items, reverse=True)}" + + def __lt__(self, other): + if other is None: + return True + + self_items_copy = self.items[:] + other_items_copy = other.items[:] + + for item in self_items_copy: + if item in other_items_copy: + self_items_copy.remove(item) + other_items_copy.remove(item) + + if len(other_items_copy) == 0: + return False + elif len(self_items_copy) == 0: + return True + + s_max = max(self_items_copy) + o_max = max(other_items_copy) + + return o_max > s_max + + +class RunIterator: + def __init__(self, read_buffer): + self.read_buffer = read_buffer + self.current_record = None + self.end_of_run = False + + def read_next(self): + if self.end_of_run: + return None + + self.current_record = self.read_buffer.read_next() + + if self.current_record is None: + return None + + next_record = self.read_buffer.peek() + if next_record is not None and next_record < self.current_record: + self.end_of_run = True + + return self.current_record + + def __iter__(self): + return self + + def __next__(self): + res_record = self.read_next() + if res_record is None: + raise StopIteration + return res_record + + +def print_tape(file_name): + print(f"[ .. ] Taśma {file_name}\n") + + buffer = ReadBuffer(file_name) + series_count = 0 + records_count = 0 + + while buffer.has_more(): + ri = RunIterator(buffer) + for record in ri: + print(record) + records_count += 1 + series_count += 1 + print("~ koniec biegu ~") + + print(f"\n[ ^- ] Liczba biegów: {series_count}") + print(f"[ ^- ] Liczba rekordów: {records_count}") + + +def print_runs(file_name, n): + print(f"Printing first {n} runs from {file_name}") + buff = ReadBuffer(file_name) + for i in range(n): + print(f"\nRun {i}:") + ri = RunIterator(buff) + for record in ri: + print(record) + + +def runs_count(file_name): + rc = 0 + buff = ReadBuffer(file_name) + while buff.has_more(): + ri = RunIterator(buff) + for _ in ri: + pass + rc += 1 + return rc + + +# ============================================================================== + +def prepare_tapes(): + t1_dest = WriteBuffer("fs/t1") + for record in ReadBuffer("fs/start_tape"): + t1_dest.write_next(record) + t1_dest.flush() + + if os.path.isfile("fs/t2"): + os.remove("fs/t2") + if os.path.isfile("fs/t3"): + os.remove("fs/t3") + + +class MetaInfo: + def __init__(self, reads_count, writes_count, runs_count): + self.reads_count = reads_count + self.writes_count = writes_count + self.runs_count = runs_count + + +def distribute(source_tape_path, first_dest_path, second_dest_path): + t1_buffer = ReadBuffer(source_tape_path) + t2_buffer = WriteBuffer(first_dest_path) + t3_buffer = WriteBuffer(second_dest_path) + + last_record = t1_buffer.read_next() + t2_buffer.write_next(last_record) + dest_buffer = t2_buffer + + i = 0 + for record in t1_buffer: + # Not sorted pair of records + if record < last_record: + # Toggle + if dest_buffer == t2_buffer: + dest_buffer = t3_buffer + else: + dest_buffer = t2_buffer + i += 1 + + if dest_buffer == t2_buffer: + t2_buffer.write_next(record) + else: + t3_buffer.write_next(record) + + last_record = record + + t2_buffer.flush() + t3_buffer.flush() + + return MetaInfo(t1_buffer.disk_reads_count, + t2_buffer.disk_writes_count + t3_buffer.disk_writes_count, + t2_buffer.runs_written + t3_buffer.runs_written) + + +# Read runs alternately from t2 and t3 (reading is done 1 record at a time, because we can't read whole run to memory) +# So one run may end and if it happens we need to write remaining records from non-empty run +# If any of t2 or t3 ends then just write all remaining runs from non-empty tape to t1 +# For each two runs merge their records creating new run +# Write that run to t1 + +def merge_runs(rit1, rit2, write_buffer: WriteBuffer): + rit1_curr = rit1.read_next() + rit2_curr = rit2.read_next() + while rit1_curr is not None and rit2_curr is not None: + if rit1_curr < rit2_curr: + write_buffer.write_next(rit1_curr) + rit1_curr = rit1.read_next() + else: + write_buffer.write_next(rit2_curr) + rit2_curr = rit2.read_next() + if rit1_curr is not None: + write_buffer.write_next(rit1_curr) + for r in rit1: + write_buffer.write_next(r) + if rit2_curr is not None: + write_buffer.write_next(rit2_curr) + for r in rit2: + write_buffer.write_next(r) + + +def merge(first_source_path, second_source_path, dest_tape_path): + t1_buffer = WriteBuffer(dest_tape_path) + t2_buffer = ReadBuffer(first_source_path) + t3_buffer = ReadBuffer(second_source_path) + + while t2_buffer.has_more() and t3_buffer.has_more(): + merge_runs(RunIterator(t2_buffer), RunIterator(t3_buffer), t1_buffer) + + for r in t2_buffer: + t1_buffer.write_next(r) + + for r in t3_buffer: + t1_buffer.write_next(r) + + t1_buffer.flush() + return MetaInfo(t2_buffer.disk_reads_count + t3_buffer.disk_reads_count, + t1_buffer.disk_writes_count, + t1_buffer.runs_written) + + +class SortInfo: + def __init__(self, reads_count, writes_count, phases_count): + self.reads_count = reads_count + self.writes_count = writes_count + self.phases_count = phases_count + + +def tape_sort(tape_path, print_after_phase=False): + runs_written = 0 + phases_count = 0 + reads_count = 0 + writes_count = 0 + while runs_written != 1: + dist_info = distribute(tape_path, "fs/t2", "fs/t3") + merge_info = merge("fs/t2", "fs/t3", tape_path) + runs_written = merge_info.runs_count + + reads_count += dist_info.reads_count + reads_count += merge_info.reads_count + writes_count += dist_info.writes_count + writes_count += merge_info.writes_count + + if print_after_phase: + print(f"[ -v ] Faza {phases_count + 1}") + print_tape(tape_path) + print(f"[ .. ] {runs_written} biegów pozostało") + phases_count += 1 + + return SortInfo(reads_count, writes_count, phases_count) + +help_page = """ +pomoc + wyświetla tę stronę +wyczyść <ścieżka_do_taśmy> + usuwa taśmę +genlos <ścieżka_do_taśmy> [opcje] + dopisuje losowo wygenerowane rekordy na taśmę + jeżeli jako opcja podane zostanie 'o', wówczas taśma zostanie nadpisana + nowowygenerowanymi rekordami +dopisz <ścieżka_do_taśmy> + dopisuje nowy rekord na koniec taśmy, jako + należy podać co najmniej 1 liczbę, a + maksymalnie 15, każdą z zakresu 0-255 +wczytaj <ścieżka_do_taśmy> <ścieżka_do_pliku> + wczytuje rekordy z podanego pliku testowego na taśmę +sortuj <ścieżka_do_taśmy> [opcje] + sortuje podaną taśmę wypisując jej zawartość na + początku i na końcu operacji. + Gdy zostanie podana opcja 'v', wówczas taśma będzie + wyświetlana po każdej z faz +wyświetl <ścieżka_do_taśmy> + wyświetla zawartość taśmy i jej metadane m.in.: + liczbę biegów (serii) + liczbę rekordów +""" + +print("\nProjekt SBD - sortowanie metodą scalania naturalnego (2+1)") +print("Autor: Maciej Krzyżanowski [188872]\n") +should_run = True +while should_run: + cmd_line = input("> ") + match cmd_line.split(): + case ["wyczyść", tape_path]: + print(f"[ .. ] Czyszczenie taśmy {tape_path}") + if os.path.isfile(tape_path): + os.remove(tape_path) + print(f"[ :) ] Wyczyszczono taśmę") + else: + print(f"[ :( ] Taśma o podanej ścieżce nie istnieje") + case ["genlos", tape_path, number_of_records, *options]: + if "o" in options: + write_buffer = WriteBuffer(tape_path) + else: + write_buffer = WriteBuffer( + tape_path, append_mode=True) + for i in range(int(number_of_records)): + set_length = randint(1, 15) + new_set = [] + while len(new_set) != set_length: + new_suggestion = randint(0, 255) + if new_suggestion not in new_set: + new_set.append(new_suggestion) + new_record = Record(new_set) + write_buffer.write_next(new_record) + write_buffer.flush() + print(f"[ :) ] Dopisano {number_of_records} nowych rekordów do " + + f"taśmy {tape_path}") + case ["wyświetl", tape_path]: + print(f"[ :) ] Wyświetlam taśmę {tape_path}") + print_tape(tape_path) + case ["dopisz", tape_path, *set_elements]: + if len(set_elements) == 0: + print("[ :( ] Nie podano ani jednego rekordu") + continue + set_elements = [int(x) for x in set_elements] + new_record = Record(set_elements) + write_buffer = WriteBuffer(tape_path, append_mode=True) + write_buffer.write_next(new_record) + write_buffer.flush() + print(f"[ :) ] Dopisano podany rekord na taśmę") + case ["sortuj", tape_path, *options]: + print(f"[ .. ] Sortowanie taśmy {tape_path}") + print(f"[ -v ] Wyświetlam taśmę przed posortowaniem:") + print_tape(tape_path) + if "v" in options: + sort_info = tape_sort(tape_path, print_after_phase=True) + else: + sort_info = tape_sort(tape_path) + print(f"[ -v ] Wyświetlam taśmę po posortowaniu:") + print_tape(tape_path) + print(f"[ :) ] Taśma {tape_path} posortowana!") + print(f"[ -v ] Metadane sortowania") + print(f"[ .. ] Liczba faz {sort_info.phases_count}") + print(f"[ .. ] Liczba odczytów {sort_info.reads_count}") + print(f"[ .. ] Liczba zapisów {sort_info.writes_count}") + case ["wczytaj", tape_path, test_file_path]: + wb = WriteBuffer(tape_path, append_mode=True) + count = 0 + with open(test_file_path) as test_file: + for line in test_file: + set_numbers = [int(s) for s in line.rstrip().split()] + new_record = Record(set_numbers) + wb.write_next(new_record) + count += 1 + wb.flush() + print(f"[ :) ] Dopisano {count} rekordów na taśmę") + case ["pomoc"]: + print(help_page) + case ["wyjście"]: + print("[ :) ] Do widzenia!") + should_run = False + case _: + print("[ :( ] Nie znam takiej komendy") +