Optimized the functions.

51d4a6fd · Haj Rezvan · 88bb2e3d · 51d4a6fd · 51d4a6fd
Commit 51d4a6fd authored May 23, 2022 by Haj Rezvan
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 40 additions and 33 deletions

index_maker.py index_maker.py +27 -23

split_document.py split_document.py +13 -10

No files found.
--- a/index_maker.py
+++ b/index_maker.py
 import json
 import os
-import threading
 def logs(cnt, i):
@@ -14,15 +13,15 @@ def index():
    index_file = open(f"./index/ii.json", "w", encoding="utf-8")
    invert_index = dict()
    os.chdir("./document tokens")
-    for tk in os.listdir():
+    files = os.listdir()
-        print(tk)
+    counter = 0
+    pre_percent = 0
+    print("0% is Index made!")
+    for tk in files:
        token_file = open(f"./{tk}", "r", encoding="utf-8")
-        backup = token_file.readline()
        try:
            tkn = json.load(token_file)
-        except (json.decoder.JSONDecodeError, Exception) as e:
+            # backup = token_file.readline()
-            thread = threading.Thread(target=logs, args=(backup, tk,))
-            thread.start()
            tkn = dict(tkn)
            tk = tk.replace(".json", "")
            tk = int(tk)
@@ -36,8 +35,13 @@ def index():
                    indexes.sort()
            token_file.close()
-    # str_out = str(invert_index)
+            new_percent = int((counter / len(files)) * 100)
-    # str_out = str_out.replace("\'", "\"")
+            if new_percent != pre_percent:
-    # str_out = dict(str_out)
+                print(f"{new_percent}% is Index made!")
+                pre_percent = new_percent
+            counter = counter + 1
+        except Exception as e:
+            print(f"Exception in file {tk}\n{e.args}\n")
    json.dump(invert_index, index_file)
    print("Invert index made!")
--- a/split_document.py
+++ b/split_document.py
@@ -38,17 +38,20 @@ def __retrieval(data, i, tag):
        return obj_cnt
    except Exception as ignore:
        print(f"We have a Exception!! {ignore.with_traceback}")
+        pass
 def __processor(file):
    data = json.load(file)
+    length = len(data)
-    for i in tqdm(range(14000), desc="Splatted: "):
+    doc_ids = list(data.keys())
-        title = __retrieval(data, i, "title")
+    for i in tqdm(range(length), desc="Splatted: "):
-        content = __retrieval(data, i, "content")
+        doc_id = doc_ids[i]
-        url = __retrieval(data, i, "url")
+        title = __retrieval(data, doc_id, "title")
-        tags = __retrieval(data, i, "tags")
+        content = __retrieval(data, doc_id, "content")
-        date = __retrieval(data, i, "date")
+        url = __retrieval(data, doc_id, "url")
-        category = __retrieval(data, i, "category")
+        tags = __retrieval(data, doc_id, "tags")
+        date = __retrieval(data, doc_id, "date")
-        __writer(i, title, content, url, tags, date, category)
+        category = __retrieval(data, doc_id, "category")
+        __writer(doc_id, title, content, url, tags, date, category)