Commit 51d4a6fd authored by Haj Rezvan

Optimized the functions.

parent 88bb2e3d
import json
import os
import threading
def logs(cnt, i):
......@@ -14,30 +13,35 @@ def index():
index_file = open(f"./index/ii.json", "w", encoding="utf-8")
invert_index = dict()
os.chdir("./document tokens")
for tk in os.listdir():
print(tk)
files = os.listdir()
counter = 0
pre_percent = 0
print("0% is Index made!")
for tk in files:
token_file = open(f"./{tk}", "r", encoding="utf-8")
backup = token_file.readline()
try:
tkn = json.load(token_file)
except (json.decoder.JSONDecodeError, Exception) as e:
thread = threading.Thread(target=logs, args=(backup, tk,))
thread.start()
tkn = dict(tkn)
tk = tk.replace(".json", "")
tk = int(tk)
for key in tkn.keys():
if key not in invert_index.keys():
invert_index[str(key)] = [tk]
else:
indexes = invert_index[key]
indexes.append(tk)
indexes.sort()
token_file.close()
# str_out = str(invert_index)
# str_out = str_out.replace("\'", "\"")
# str_out = dict(str_out)
# backup = token_file.readline()
tkn = dict(tkn)
tk = tk.replace(".json", "")
tk = int(tk)
for key in tkn.keys():
if key not in invert_index.keys():
invert_index[str(key)] = [tk]
else:
indexes = invert_index[key]
indexes.append(tk)
indexes.sort()
token_file.close()
new_percent = int((counter / len(files)) * 100)
if new_percent != pre_percent:
print(f"{new_percent}% is Index made!")
pre_percent = new_percent
counter = counter + 1
except Exception as e:
print(f"Exception in file {tk}\n{e.args}\n")
json.dump(invert_index, index_file)
print("Invert index made!")
......@@ -38,17 +38,20 @@ def __retrieval(data, i, tag):
return obj_cnt
except Exception as ignore:
print(f"We have a Exception!! {ignore.with_traceback}")
pass
# Load a JSON document from an open file object and hand each record's fields
# (title, content, url, tags, date, category) to __writer.
#
# NOTE(review): indentation is missing in this extract, and the body below
# contains two consecutive loops that do the same work. This looks like both
# sides of a diff hunk shown without +/- markers (the range(14000) loop being
# the old version and the doc_ids loop its replacement) -- confirm against the
# repository before treating both loops as live code.
def __processor(file):
# Parse the whole file as JSON; `data` is later used with len() and .keys().
data = json.load(file)
# Loop 1: fixed 14000 iterations, passing the integer counter i itself as the
# record identifier to __retrieval and __writer.
# NOTE(review): presumably fails or skips records when the input does not hold
# exactly the ids 0..13999 -- verify against __retrieval.
for i in tqdm(range(14000), desc="Splatted: "):
title = __retrieval(data, i, "title")
content = __retrieval(data, i, "content")
url = __retrieval(data, i, "url")
tags = __retrieval(data, i, "tags")
date = __retrieval(data, i, "date")
category = __retrieval(data, i, "category")
__writer(i, title, content, url, tags, date, category)
# Loop 2: iteration count comes from the loaded data, and the identifier
# passed on is the actual key taken from data.keys() rather than the counter.
length = len(data)
doc_ids = list(data.keys())
for i in tqdm(range(length), desc="Splatted: "):
doc_id = doc_ids[i]
title = __retrieval(data, doc_id, "title")
content = __retrieval(data, doc_id, "content")
url = __retrieval(data, doc_id, "url")
tags = __retrieval(data, doc_id, "tags")
date = __retrieval(data, doc_id, "date")
category = __retrieval(data, doc_id, "category")
__writer(doc_id, title, content, url, tags, date, category)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment