9831029 / Farse news IR project · Commits

Commit e3f40329 authored Jun 25, 2022 by Haj Rezvan
Optimized the functions.
Parent: bc700933
Showing 4 changed files with 143 additions and 45 deletions (+143 -45):

    QP.py                  +116  -31
    main.py                  +4   -4
    templates/index.html     +2   -2
    tokenizer.py            +21   -8
QP.py
import json
import os
import threading
...
@@ -71,7 +72,6 @@ def operations(inp):
    return flag


# Get dictionary of docIDs and return the title of most relevant.
def get_info(inp):
    result = list()
    if type(inp) == dict:
...
@@ -95,53 +95,138 @@ def get_info(inp):
    return out


-def write_logs(string):
+# Get dictionary of docIDs and return the title of most relevant.
+def __write_logs(string):
    file = open(f"./logs/log.txt", "a", encoding="utf-8")
    file.write(string + "\n")
    file.close()

def __intersection(i: int, return_list: list, selected: list):
    if i == 0:
        for doc in range(0, len(selected)):
            return_list.append(selected[doc])
    else:
        counter = 0
        while counter < len(return_list):
            if not selected.__contains__(return_list[counter]):
                return_list.remove(return_list[counter])
                counter = counter - 1
            counter = counter + 1
    return return_list

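As an illustration only (not part of this commit), a minimal sketch of how __intersection is meant to be used: one call per query term, with i counting terms, so the first call seeds the running result and later calls keep only shared doc IDs. The doc-ID lists below are invented.

    # Run inside QP.py, after the helper above is defined.
    running = []
    running = __intersection(0, running, ["3", "7", "9"])    # first term seeds the result
    running = __intersection(1, running, ["7", "9", "12"])   # later terms keep only shared doc IDs
    print(running)  # ['7', '9']
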
def __double_quotation(i: int, query: list, dictionary: dict, finish: bool, doc_id: list):
    length = len(query[i])
    query[i] = query[i][1:length]
    # length of phrase query.
    if dictionary.keys().__contains__(query[i]):
        while True:
            # Find end of "
            selected = list()
            length = len(query[i + 1])
            if query[i][length - 1] == '"':
                query[i + 1] = query[i + 1][0:length - 1]
                finish = True
            # Find docID
            for aP in range(0, len(dictionary[query[i]])):
                # document ID
                doc = list(dictionary[query[i]].keys())[aP]
                # Number of document in dictionary.
                if dictionary[query[i + 1]].keys().__contains__(doc):
                    # Array of this word in the query.
                    presentPointer = dictionary[query[i]][doc]
                    # Array of next word in query.
                    nextPointer = list()
                    for bP in range(0, len(dictionary[query[i + 1]][doc])):
                        # Iterate from end to begin.
                        nextPointer.append(dictionary[query[i + 1]][doc].__getitem__(bP) - 1)
                    # Position of documents.
                    for p in dictionary[query[i]][doc]:
                        if nextPointer.__contains__(dictionary[query[i]][doc].__getitem__(p)):
                            selected.append(doc)
                            break
            # intersect of documents.
            doc_id = __intersection(i, doc_id, selected)
            i = i + 1
            if finish:
                return doc_id
    else:
        return list()

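For orientation (illustration only, with invented terms, doc IDs and positions), __double_quotation assumes the loaded index maps each term to {docID: [positions]} and is intended to keep a document when the next phrase word occurs one position after the current one:

    # Hypothetical shape of ./index/ii.json content:
    dictionary = {
        "economy": {"12": [4, 31], "40": [2]},
        "news":    {"12": [5],     "77": [9]},
    }
    # For the phrase "economy news", only doc "12" should survive,
    # because position 5 of "news" directly follows position 4 of "economy".
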
def __not(i: int, query: list, dictionary: dict, doc_id: list):
    global file_numbers
    selected = list()
    length = len(query[i])
    query[i] = query[i][1:length]
    # length of phrase query.
    if dictionary.keys().__contains__(query[i]):
        for term in range(0, file_numbers):
            if not dictionary.keys().__contains__(query[term]):
                selected.append(term)
    else:
        # Not in dictionary.
        for pointer in range(0, file_numbers):
            selected.append(pointer)
    doc_id = __intersection(i, doc_id, selected)
    return doc_id

def __file_number():
    global file_numbers
    os.chdir("./docs")
    file_numbers = len(os.listdir())

def __simple_check(i: int, query: list, dictionary: dict, doc_id: list):
    global file_numbers
    selected = list()
    if dictionary.keys().__contains__(query[i]):
        for term in range(file_numbers):
            if dictionary[query[i]].keys().__contains__(term):
                selected.append(term)
        doc_id = __intersection(i, doc_id, selected)
    else:
        doc_id = list()
    return doc_id


file_numbers = 0

def __checker(query: list, dictionary: dict):
    finish = False
    i = 0
    # For getting index of words in dictionary
    content = list()
    while i < len(query):
        if query[i][0] == '"':
            content = __double_quotation(i, query, dictionary, finish, content)
        elif query[i][0] == '!':
            content = __not(i, query, dictionary, content)
        else:
            content = __simple_check(i, query, dictionary, content)
        i = i + 1
    return content

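A hedged sketch of how __checker dispatches a whitespace-split query (the query below is invented; dictionary stands for the already-loaded positional index):

    query = ['"breaking', 'news"', '!sports', 'economy']
    # query[0][0] == '"'  ->  __double_quotation handles the quoted phrase
    # query[2][0] == '!'  ->  __not handles the negated term
    # query[3]            ->  __simple_check handles a plain term
    doc_ids = __checker(query, dictionary)
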
def enter(it):
-   t1 = threading.Thread(target=write_logs, args=(it,))
+   t1 = threading.Thread(target=__write_logs, args=(it,))
    t1.start()
    t2 = threading.Thread(target=__file_number, args=())
    t2.start()
    spl = list(it.split(" "))
    file = open("./index/ii.json", "r", encoding="utf-8")
    index = json.load(file)
    dictionary = dict(index)
    rs = []
    for word in spl:
        if word in dictionary.keys():
            rs.append(word)
    rs = __checker(it, dictionary)
    ld = dict()
    for i in range(len(rs)):
        ld[rs[i]] = index.get(rs[i])
        print(ld[rs[i]])
    ld_copy = ld.copy()
    opt = list()
    if len(rs) > 1:
        flag = operations(spl)
        while len(flag) > 0:
            if "&" in flag:
                _and = spl.index("AND")
                nxt_word = spl[_and + 1]
                prv_word = spl[_and - 1]
                opt.extend(intersect(ld[nxt_word], ld[prv_word]))
                spl.pop(_and)
                ld.pop(nxt_word)
                ld.pop(prv_word)
                ld["opt"] = opt
                flag = operations(spl)
            elif "!" in flag:
                _not = spl.index("NOT")
                nxt_word = spl[_not + 1]
                prv_word = spl[_not - 1]
                opt = subtract(ld[prv_word], ld[nxt_word])
                print(opt)
                spl.pop(_not)
                flag = operations(spl)
    out_data = get_info(ld)
    t1.join()
    return out_data

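As a usage sketch only (assuming the ./index/ii.json, ./logs/ and ./docs/ paths that enter() touches exist), a caller such as the web front end would invoke it roughly like this:

    results = enter("economy AND news")   # hypothetical query string
    print(results)  # titles of the most relevant documents, as returned by get_info()
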
main.py
...
@@ -25,15 +25,15 @@ if __name__ == '__main__':
    thread = threading.Thread(target=stop_word())
    thread.run()
-   split_document.file_open()  # Main Splitter for all of news.
+   # split_document.file_open()  # Main Splitter for all of news.
-   # split_document.file_open_test()  # Splitter for test dataset.
+   split_document.file_open_test()  # Splitter for test dataset.
    files = __file_finder()
    os.chdir("..")
-   tokenizer.get_file(files)  # Main tokenizer.
+   # tokenizer.get_file(files)  # Main tokenizer.
-   # tokenizer.get_file_test(files)  # Tokenizer in test dataset.
+   tokenizer.get_file_test(files)  # Tokenizer in test dataset.
    index_maker.index()
templates/index.html
...
@@ -68,9 +68,9 @@
    {% for row in data %}
        <div class="box">
-           <span style="font-size: 15px">{{ row[3] }}</span>
+           <span style="font-size: 15px">{{ row[5] }}</span>
            <br/>
-           <a href={{ row[3] }} style="font-family: 'B Titr'">{{ row[1] }}</a>
+           <a href={{ row[5] }} style="font-family: 'B Titr'">{{ row[1] }}</a>
            <p style="font-family: 'B Nazanin'">{{ row[2] }}</p>
...
tokenizer.py
...
@@ -8,7 +8,7 @@ global stop_words
def debugger(string):
    string = string.replace("\'", "\"")
-   string = string.replace("\"\"\"", "\"\"")
+   # string = string.replace("\"\"\"", "\"\"")
    string = string.replace("\\", " ")
    string = string.replace("\u200c", " ")
    return string
...
@@ -24,7 +24,8 @@ def normalize(tokens):
    return tokens


-def token_maker(filename):
+def token_maker(filename: str):
    counter = int(filename.replace(".json", ""))
    filepath = os.path.join("./docs", filename)
    file = open(filepath, "r")
    obj = json.load(file)
...
@@ -39,17 +40,29 @@ def token_maker(filename):
    normal_txt = normalizer.normalize(obj_cnt)
    tokens = pars_tokenizer.tokenize_words(normal_txt)
    output = dict()
-   index = 0
+   position = 0
    tokens = normalize(tokens)
+   # word = tokens[position]
    for word in tokens:
        word = my_stemmer.convert_to_stem(word)
-       if word not in output.keys():
-           output[str(word)] = [index]
+       if word.__contains__('&'):
+           x = word.index('&')
+           word = word[0:x]
+       if not output.keys().__contains__(word):
+           # Create Postings list
+           output.update({str(word): {str(counter): [position]}})
        else:
-           indexes = output[word]
-           indexes.append(index)
-           index = index + 1
+           if output[word].keys().__contains__(position):
+               output[word][str(counter)].append(position)
+           else:
+               output[word].update({str(counter): [position]})
+       position = position + 1
    filename = str(filename)
    tk_fl = open(f"document tokens/{filename}", "w", encoding="utf-8")
...
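For reference (values invented, not taken from the dataset), the per-document postings dictionary that token_maker now builds and writes to document tokens/<filename> has the {term: {docID: [positions]}} shape that QP.py's phrase handling reads back:

    output = {
        "economy": {"17": [0, 42]},   # stemmed term -> document counter -> positions in that document
        "news":    {"17": [1]},
    }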