Farse news IR project / Commits

Commit d2249df1 authored May 23, 2022 by Haj Rezvan
Test for phase 2 of IR project.
parent 668cb7e5
Showing 7 changed files with 432 additions and 46 deletions (+432 -46):

debugger.py           +0    -21
index_maker.py        +14   -1
main.py               +16   -5
static/search.png     +0    -0
stop words            +350  -0
templates/index.html  +23   -11
tokenizer.py          +29   -8
debugger.py (deleted, 100644 → 0, was @ 668cb7e5)
-import os
-
-os.chdir("./document tokens")
-
-
-def function():
-    for file in os.listdir():
-        file_name = os.path.join("./document tokens", file)
-        os.chdir("..")
-        rfl = open(file_name, "r+", encoding="utf-8")
-        string = rfl.read()
-        string = string.replace("\"\"\"", "\"\"")
-        string = string.replace('\xad', '')
-        string = string.replace('\u00ad', '')
-        string = string.replace('\N{SOFT HYPHEN}', '')
-        wfl = open(file_name, "w+", encoding="utf-8")
-        wfl.write(string)
-        print(f"File {file} debugged.")
-
-
-function()
index_maker.py (@ d2249df1)
 import json
 import os
+import threading
+
+
+def logs(cnt, i):
+    file = open(f".{i} JSONDecodeError.log", 'w', encoding="utf-8")
+    file.write(cnt)
+    file.close()


 def index():
@@ -8,8 +15,14 @@ def index():
     invert_index = dict()
     os.chdir("./document tokens")
     for tk in os.listdir():
+        print(tk)
         token_file = open(f"./{tk}", "r", encoding="utf-8")
-        tkn = json.load(token_file)
+        backup = token_file.readline()
+        try:
+            tkn = json.load(token_file)
+        except (json.decoder.JSONDecodeError, Exception) as e:
+            thread = threading.Thread(target=logs, args=(backup, tk,))
+            thread.start()
         tkn = dict(tkn)
         tk = tk.replace(".json", "")
         tk = int(tk)
...
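
The new error path keeps the first line of a malformed token file (backup) and hands it to logs() on a worker thread, while the collapsed lines presumably fold each document's token positions into invert_index. A minimal sketch of that inverted-index merge, assuming per-document JSON files of the form {token: [positions]} named "<doc_id>.json"; the function and variable names for the elided logic are hypothetical:

import json
import os


def build_inverted_index(token_dir="./document tokens"):
    # invert_index maps: term -> {doc_id: [positions]}
    invert_index = {}
    for tk in os.listdir(token_dir):
        with open(os.path.join(token_dir, tk), encoding="utf-8") as token_file:
            tkn = json.load(token_file)  # one document's token -> positions map
        doc_id = int(tk.replace(".json", ""))  # file names are '<doc_id>.json'
        for term, positions in tkn.items():
            invert_index.setdefault(term, {})[doc_id] = positions
    return invert_index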
main.py (@ d2249df1)
 import os
+import threading
 import index_maker
 import split_document
 import tokenizer
+from tests import SE as t


 def __file_finder():
@@ -11,13 +11,24 @@ def __file_finder():
     return os.listdir()


+def stop_word():
+    file = open("stop words", "r", encoding="utf8")
+    lst = file.readlines()
+    for i in range(len(lst)):
+        word = lst[i].replace("\n", "")
+        lst[i] = word
+    tokenizer.stop_words = lst
+
+
 if __name__ == '__main__':
     print("بسم الله الرحمن الرحیم")
-    split_document.file_open()
+    thread = threading.Thread(target=stop_word())
+    t.split()
+    thread.run()
+    # split_document.file_open()
+    split_document.file_open_test()
     files = __file_finder()
     os.chdir("..")
-    tokenizer.get_file(files)
+    # tokenizer.get_file(files)
+    t.token(files)
+    tokenizer.get_file_test(files)
     os.chdir("..")
     index_maker.index()
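
One quirk in the new startup block: threading.Thread(target=stop_word()) calls stop_word() immediately and passes its return value (None) as the target, and thread.run() executes in the calling thread rather than spawning a new one. The net effect, loading the stop words synchronously before tokenization starts, is what the pipeline needs anyway, but the conventional pattern would look like this minimal sketch (stop_word is stubbed here so the snippet is self-contained):

import threading


def stop_word():
    # stand-in for main.py's loader, which fills tokenizer.stop_words
    print("stop words loaded")


thread = threading.Thread(target=stop_word)  # pass the callable, not its result
thread.start()  # start() runs it on a new thread; run() would execute inline
thread.join()   # wait for loading to finish before tokenization begins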
static/search.png (new file, 0 → 100644, 472 Bytes)
stop words (new file, 0 → 100644, +350 lines):
و
در
به
از
كه
مي
اين
است
را
با
هاي
براي
آن
يك
شود
شده
خود
ها
كرد
شد
اي
تا
كند
بر
بود
گفت
نيز
وي
هم
كنند
دارد
ما
كرده
يا
اما
بايد
دو
اند
هر
خواهد
او
مورد
آنها
باشد
ديگر
مردم
نمي
بين
پيش
پس
اگر
همه
صورت
يكي
هستند
بي
من
دهد
هزار
نيست
استفاده
داد
داشته
راه
داشت
چه
همچنين
كردند
داده
بوده
دارند
همين
ميليون
سوي
شوند
بيشتر
بسيار
روي
گرفته
هايي
تواند
اول
نام
هيچ
چند
جديد
بيش
شدن
كردن
كنيم
نشان
حتي
اينكه
ولی
توسط
چنين
برخي
نه
ديروز
دوم
درباره
بعد
مختلف
گيرد
شما
گفته
آنان
بار
طور
گرفت
دهند
گذاري
بسياري
طي
بودند
ميليارد
بدون
تمام
كل
تر
براساس
شدند
ترين
امروز
باشند
ندارد
چون
قابل
گويد
ديگري
همان
خواهند
قبل
آمده
اكنون
تحت
طريق
گيري
جاي
هنوز
چرا
البته
كنيد
سازي
سوم
كنم
بلكه
زير
توانند
ضمن
فقط
بودن
حق
آيد
وقتي
اش
يابد
نخستين
مقابل
خدمات
امسال
تاكنون
مانند
تازه
آورد
فكر
آنچه
نخست
نشده
شايد
چهار
جريان
پنج
ساخته
زيرا
نزديك
برداري
كسي
ريزي
رفت
گردد
مثل
آمد
ام
بهترين
دانست
كمتر
دادن
تمامي
جلوگيري
بيشتري
ايم
ناشي
چيزي
آنكه
بالا
بنابراين
ايشان
بعضي
دادند
داشتند
برخوردار
نخواهد
هنگام
نبايد
غير
نبود
ديده
وگو
داريم
چگونه
بندي
خواست
فوق
ده
نوعي
هستيم
ديگران
همچنان
سراسر
ندارند
گروهي
سعي
روزهاي
آنجا
يكديگر
كردم
بيست
بروز
سپس
رفته
آورده
نمايد
باشيم
گويند
زياد
خويش
همواره
گذاشته
شش
نداشته
شناسي
خواهيم
آباد
داشتن
نظير
همچون
باره
نكرده
شان
سابق
هفت
دانند
جايي
بی
جز
زیرِ
رویِ
سریِ
تویِ
جلویِ
پیشِ
عقبِ
بالایِ
خارجِ
وسطِ
بیرونِ
سویِ
کنارِ
پاعینِ
نزدِ
نزدیکِ
دنبالِ
حدودِ
برابرِ
طبقِ
مانندِ
ضدِّ
هنگامِ
برایِ
مثلِ
بارة
اثرِ
تولِ
علّتِ
سمتِ
عنوانِ
قصدِ
روب
جدا
کی
که
چیست
هست
کجا
کجاست
کَی
چطور
کدام
آیا
مگر
چندین
یک
چیزی
دیگر
کسی
بعری
هیچ
چیز
)
(
،
.
؟
!
@
#
%
$
^
&
*
-
+
=
-
جا
کس
هرگز
یا
تنها
بلکه
خیاه
بله
بلی
آره
آری
مرسی
البتّه
لطفاً
ّه
انکه
وقتیکه
همین
پیش
مدّتی
هنگامی
مان
تان
"
\ No newline at end of file
templates/index.html (@ d2249df1)
...
@@ -4,7 +4,8 @@
     <meta charset="UTF-8"/>
     <meta http-equiv="X-UA-Compatible" content="IE=edge"/>
     <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
-    <title>Document</title>
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
+    <title>موتور جست و جوی فارس</title>
     <style type="text/css">
       .box {
         width: 50%;
...
@@ -33,6 +34,7 @@
   <body>
     <div>
       <img src=" {{ url_for('static', filename='amirkabir.png') }}" style="width: 7%" alt="" />
+      <form method="POST" action="{{ url_for('index') }}">
       <input
         style="
           border-radius: 1rem;
...
@@ -40,16 +42,23 @@
           height: 2rem;
           outline: none;
           border: 1px solid #8080804a;
-          margin-left: 2rem;
+          margin-left: 150px;
+          margin-top: 20px;
           position: absolute;
           top: 1rem;
           box-shadow: 0 2px 5px 1px rgba(64, 60, 67, 0.16);
           padding: 0 1rem;
         "
         type="text"
-        name=""
-        id=""
+        name="query"
+        id="query"
       />
+      <input
+        type="image"
+        style="
+          position: absolute;
+          margin-left: 910px;
+          margin-top: -108px;
+        "
+        name="submit"
+        src="{{ url_for('static', filename='search.png') }}"
+      />
+      </form>
       <img
         src="{{ url_for('static', filename='farsnewslogo.png') }}"
         style="width: 17%; position: absolute; right: 2rem; top: 1rem"
...
@@ -59,12 +68,15 @@
       {% for row in data %}
       <div class="box">
-        <span>{{ row[3] }}</span>
+        <span style="font-size: 15px">{{ row[3] }}</span>
         <br/>
-        <a href={{ row[3] }}>{{ row[1] }}</a>
-        <p>
+        <a href={{ row[3] }} style="font-family: 'B Titr'">{{ row[1] }}</a>
+        <p style="font-family: 'B Nazanin'">
           {{ row[2] }}
         </p>
         <br/>
       </div>
       {% endfor %}
...
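
The form now POSTs a "query" field to url_for('index'), and the results loop expects data rows where row[1] is the title, row[2] the snippet, and row[3] the link. The Flask view itself is not part of this commit, so the following is only a minimal sketch of what such a handler could look like; search() is a hypothetical stand-in for the project's retrieval code:

from flask import Flask, render_template, request

app = Flask(__name__)


def search(query):
    # hypothetical stand-in: return rows shaped like (id, title, snippet, url)
    return []


@app.route("/", methods=["GET", "POST"])
def index():
    data = []
    if request.method == "POST":
        data = search(request.form.get("query", ""))
    return render_template("index.html", data=data)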
tokenizer.py (@ d2249df1)
 import json
 import os
-from parsivar import Normalizer, Tokenizer
+from parsivar import Normalizer, Tokenizer, FindStems
+
+global stop_words


 def debugger(string):
     string = string.replace("\'", "\"")
     string = string.replace("\"\"\"", "\"\"")
     string = string.replace("\\", " ")
+    string = string.replace("\u200c", " ")
     return string


+def normalize(tokens):
+    if type(tokens) != list:
+        tokens = list(tokens)
+    for word in tokens:
+        if word in stop_words:
+            while word in tokens:
+                tokens.remove(word)
+    return tokens
+
+
 def token_maker(filename):
     filepath = os.path.join("./docs", filename)
     file = open(filepath, "r")
@@ -22,12 +35,15 @@ def token_maker(filename):
     obj_cnt = ''.join(lst)
     normalizer = Normalizer()
     pars_tokenizer = Tokenizer()
+    my_stemmer = FindStems()
     normal_txt = normalizer.normalize(obj_cnt)
     tokens = pars_tokenizer.tokenize_words(normal_txt)
     output = dict()
     index = 0
+    tokens = normalize(tokens)
     for word in tokens:
+        word = my_stemmer.convert_to_stem(word)
         if word not in output.keys():
             output[str(word)] = [index]
         else:
@@ -42,17 +58,22 @@ def token_maker(filename):
     tk_fl.write(str_out)


-def get_file_test(files):
+def __starter(upper_bound, files):
     counter = 0
+    pre_percent = 0
+    print("0% is tokenized!")
     for f in files:
+        new_percent = int((counter / upper_bound) * 100)
         token_maker(str(f))
-        print(f"{int((counter / 23) * 100)}% is tokenized!")
+        if new_percent != pre_percent:
+            print(f"{new_percent}% is tokenized!")
+            pre_percent = new_percent
         counter = counter + 1


+def get_file_test(files):
+    __starter(23, files)
+
+
 def get_file(files):
-    counter = 0
-    for f in files:
-        token_maker(str(f))
-        print(f"{int((counter / 12201) * 100)}% is tokenized!")
-        counter = counter + 1
+    __starter(12201, files)
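
The stemming step added here slots parsivar's FindStems between tokenization and indexing, alongside the Normalizer and Tokenizer already in use. A minimal end-to-end sketch of that pipeline on a single string (the sample sentence is illustrative; for verbs, convert_to_stem may return a joined past&present stem form):

from parsivar import Normalizer, Tokenizer, FindStems

normalizer = Normalizer()
word_tokenizer = Tokenizer()
stemmer = FindStems()

text = "کتاب‌ها را می‌خوانند"
normal_txt = normalizer.normalize(text)               # unify Persian characters and spacing
tokens = word_tokenizer.tokenize_words(normal_txt)    # split into word tokens
stems = [stemmer.convert_to_stem(t) for t in tokens]  # map each token to its stem
print(stems)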