Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in
Toggle navigation
F
Farse news IR project
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
9831029
Farse news IR project
Commits
51d4a6fd
Commit
51d4a6fd
authored
2 years ago
by
Haj Rezvan
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Optimized the functions.
parent
88bb2e3d
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
40 additions
and
33 deletions
+40
-33
index_maker.py
index_maker.py
+27
-23
split_document.py
split_document.py
+13
-10
No files found.
index_maker.py
View file @
51d4a6fd
import
json
import
json
import
os
import
os
import
threading
def
logs
(
cnt
,
i
):
def
logs
(
cnt
,
i
):
...
@@ -14,30 +13,35 @@ def index():
...
@@ -14,30 +13,35 @@ def index():
index_file
=
open
(
f
"./index/ii.json"
,
"w"
,
encoding
=
"utf-8"
)
index_file
=
open
(
f
"./index/ii.json"
,
"w"
,
encoding
=
"utf-8"
)
invert_index
=
dict
()
invert_index
=
dict
()
os
.
chdir
(
"./document tokens"
)
os
.
chdir
(
"./document tokens"
)
for
tk
in
os
.
listdir
():
files
=
os
.
listdir
()
print
(
tk
)
counter
=
0
pre_percent
=
0
print
(
"0
%
is Index made!"
)
for
tk
in
files
:
token_file
=
open
(
f
"./{tk}"
,
"r"
,
encoding
=
"utf-8"
)
token_file
=
open
(
f
"./{tk}"
,
"r"
,
encoding
=
"utf-8"
)
backup
=
token_file
.
readline
()
try
:
try
:
tkn
=
json
.
load
(
token_file
)
tkn
=
json
.
load
(
token_file
)
except
(
json
.
decoder
.
JSONDecodeError
,
Exception
)
as
e
:
# backup = token_file.readline()
thread
=
threading
.
Thread
(
target
=
logs
,
args
=
(
backup
,
tk
,))
tkn
=
dict
(
tkn
)
thread
.
start
()
tk
=
tk
.
replace
(
".json"
,
""
)
tkn
=
dict
(
tkn
)
tk
=
int
(
tk
)
tk
=
tk
.
replace
(
".json"
,
""
)
tk
=
int
(
tk
)
for
key
in
tkn
.
keys
():
if
key
not
in
invert_index
.
keys
():
for
key
in
tkn
.
keys
():
invert_index
[
str
(
key
)]
=
[
tk
]
if
key
not
in
invert_index
.
keys
():
else
:
invert_index
[
str
(
key
)]
=
[
tk
]
indexes
=
invert_index
[
key
]
else
:
indexes
.
append
(
tk
)
indexes
=
invert_index
[
key
]
indexes
.
sort
()
indexes
.
append
(
tk
)
token_file
.
close
()
indexes
.
sort
()
token_file
.
close
()
new_percent
=
int
((
counter
/
len
(
files
))
*
100
)
if
new_percent
!=
pre_percent
:
# str_out = str(invert_index)
print
(
f
"{new_percent}
%
is Index made!"
)
# str_out = str_out.replace("\'", "\"")
pre_percent
=
new_percent
# str_out = dict(str_out)
counter
=
counter
+
1
except
Exception
as
e
:
print
(
f
"Exception in file {tk}
\n
{e.args}
\n
"
)
json
.
dump
(
invert_index
,
index_file
)
json
.
dump
(
invert_index
,
index_file
)
print
(
"Invert index made!"
)
print
(
"Invert index made!"
)
This diff is collapsed.
Click to expand it.
split_document.py
View file @
51d4a6fd
...
@@ -38,17 +38,20 @@ def __retrieval(data, i, tag):
...
@@ -38,17 +38,20 @@ def __retrieval(data, i, tag):
return
obj_cnt
return
obj_cnt
except
Exception
as
ignore
:
except
Exception
as
ignore
:
print
(
f
"We have a Exception!! {ignore.with_traceback}"
)
print
(
f
"We have a Exception!! {ignore.with_traceback}"
)
pass
def
__processor
(
file
):
def
__processor
(
file
):
data
=
json
.
load
(
file
)
data
=
json
.
load
(
file
)
length
=
len
(
data
)
for
i
in
tqdm
(
range
(
14000
),
desc
=
"Splatted: "
):
doc_ids
=
list
(
data
.
keys
())
title
=
__retrieval
(
data
,
i
,
"title"
)
for
i
in
tqdm
(
range
(
length
),
desc
=
"Splatted: "
):
content
=
__retrieval
(
data
,
i
,
"content"
)
doc_id
=
doc_ids
[
i
]
url
=
__retrieval
(
data
,
i
,
"url"
)
title
=
__retrieval
(
data
,
doc_id
,
"title"
)
tags
=
__retrieval
(
data
,
i
,
"tags"
)
content
=
__retrieval
(
data
,
doc_id
,
"content"
)
date
=
__retrieval
(
data
,
i
,
"date"
)
url
=
__retrieval
(
data
,
doc_id
,
"url"
)
category
=
__retrieval
(
data
,
i
,
"category"
)
tags
=
__retrieval
(
data
,
doc_id
,
"tags"
)
date
=
__retrieval
(
data
,
doc_id
,
"date"
)
__writer
(
i
,
title
,
content
,
url
,
tags
,
date
,
category
)
category
=
__retrieval
(
data
,
doc_id
,
"category"
)
__writer
(
doc_id
,
title
,
content
,
url
,
tags
,
date
,
category
)
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment