Optimized the functions.

88bb2e3d · Haj Rezvan · d2249df1 · 88bb2e3d
Commit 88bb2e3d authored May 23, 2022 by Haj Rezvan
Show whitespace changes
Inline Side-by-side

Showing with 23 additions and 32 deletions

split_document.py split_document.py +23 -32

No files found.
--- a/split_document.py
+++ b/split_document.py
@@ -13,51 +13,42 @@ def file_open():
    __processor(file)


-def __writer(doc_id, title, content, url):
+def __writer(doc_id, title, content, url, tags, date, category):
    file = open(f"docs/{doc_id}.json", "w")
    dictionary = {
        "doc_id": doc_id,
        "title": title,
        "content": content,
-        "url": url
+        "tags": tags,
+        "date": date,
+        "url": url,
+        "category": category
    }

    json.dump(dictionary, file)


-def __object_clear(lst):
-    lst.clear()
-
-
-def __processor(file):
-    data = json.load(file)
-
-    for i in tqdm(range(14000), desc="Splatted: "):
+def __retrieval(data, i, tag):
+    try:
        obj_cnt = ''
        lst = list()
-
-        try:
-            for characters in data[f"{i}"]["title"]:
+        for characters in data[f"{i}"][tag]:
            lst.append(characters)
            obj_cnt = ''.join(lst)
+        return obj_cnt
+    except Exception as ignore:
+        print(f"We have a Exception!! {ignore.with_traceback}")

-            title = obj_cnt
-            __object_clear(lst)
-
-            for characters in data[f"{i}"]["content"]:
-                lst.append(characters)
-                obj_cnt = ''.join(lst)
-
-            content = obj_cnt
-            __object_clear(lst)
-
-            for characters in data[f"{i}"]["url"]:
-                lst.append(characters)
-                obj_cnt = ''.join(lst)

-            url = obj_cnt
-            __object_clear(lst)
+def __processor(file):
+    data = json.load(file)

-            __writer(i, title, content, url)
-        except Exception as ignore:
-            pass
+    for i in tqdm(range(14000), desc="Splatted: "):
+        title = __retrieval(data, i, "title")
+        content = __retrieval(data, i, "content")
+        url = __retrieval(data, i, "url")
+        tags = __retrieval(data, i, "tags")
+        date = __retrieval(data, i, "date")
+        category = __retrieval(data, i, "category")
+
+        __writer(i, title, content, url, tags, date, category)