forked from visualpython/visualpython
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathuserCommand.py
More file actions
54 lines (41 loc) · 1.37 KB
/
userCommand.py
File metadata and controls
54 lines (41 loc) · 1.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import os

import fitz
import nltk
import numpy as np
import pandas as pd
# Runs at import time: fetch NLTK's 'punkt' resource. Presumably this is the
# sentence-tokenizer data that nltk.sent_tokenize (used by
# vp_pdf_get_sentence) relies on -- confirm against the installed NLTK version.
nltk.download('punkt')
def vp_pdf_get_sentence(fname_lst):
    '''
    Get sentences from PDF files by PyMuPDF.

    Every path in ``fname_lst`` whose extension is ``.pdf`` (compared
    case-insensitively) is opened with ``fitz``. The text blocks of each page
    are joined and split into sentences with ``nltk.sent_tokenize``.

    Parameters
    ----------
    fname_lst : iterable of str
        Paths of the files to read. Paths without a ``.pdf`` extension are
        skipped silently.

    Returns
    -------
    pandas.DataFrame
        One row per extracted sentence, indexed 0..n-1, with columns
        ``fname`` (base name of the source file) and ``sentence``. When no
        sentence was extracted the frame is empty but keeps both columns.

    Notes
    -----
    Best effort: a file that fails to open or parse is reported with
    ``print`` and skipped; the error is not propagated to the caller.
    '''
    frames = []
    for fname in fname_lst:
        # splitext instead of split('.') so that a dot-less name such as
        # "pdf" is not mistaken for a PDF; lower() so ".PDF" is accepted.
        if os.path.splitext(fname)[1].lower() != '.pdf':
            continue
        try:
            sentence_lst = []
            # The context manager closes the document even if a page raises.
            with fitz.open(fname) as doc:
                for page in doc:
                    block_lst = page.get_text('blocks')
                    # Per PyMuPDF's "blocks" tuple layout, item 6 is the
                    # block type (0 = text) and item 4 is the block's text.
                    text_lst = [block[4] for block in block_lst if block[6] == 0]
                    # Join with a real newline; the previous '\\n' literal
                    # injected a backslash + 'n' into the extracted text.
                    text = '\n'.join(text_lst)
                    sentence_lst.extend(nltk.sent_tokenize(text))
        except Exception as e:
            # Deliberate per-file boundary: report and move on to the next file.
            print(e)
            continue
        frames.append(pd.DataFrame({
            # basename also strips Windows-style directories on Windows.
            'fname': os.path.basename(fname),
            'sentence': sentence_lst,
        }))
    if not frames:
        # pd.concat rejects an empty list; keep a stable schema for callers.
        return pd.DataFrame(columns=['fname', 'sentence'])
    # Single concat (linear) with a fresh 0..n-1 index.
    return pd.concat(frames, ignore_index=True)
def vp_drop_outlier(df, col, weight=1.5):
    '''
    Drop the rows of ``df`` whose value in ``col`` is an IQR outlier.

    A value is an outlier when it lies below ``Q1 - weight * IQR`` or above
    ``Q3 + weight * IQR``, where Q1/Q3 are the 25th/75th percentiles of the
    column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : label
        Name of the numeric column to test.
    weight : float, default 1.5
        Multiplier applied to the IQR to set the fence distance.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without the outlier rows; the original index labels
        of the remaining rows are preserved. Rows whose value is NaN are kept.
    '''
    sr = df[col]
    # Series.quantile skips NaN, so missing values no longer turn both
    # fences into NaN (which silently disabled the filter).
    q25 = sr.quantile(0.25)
    q75 = sr.quantile(0.75)
    iqr_w = (q75 - q25) * weight
    val_l = q25 - iqr_w
    val_h = q75 + iqr_w
    is_outlier = (sr < val_l) | (sr > val_h)
    # Filter by position with a boolean mask rather than df.drop(labels):
    # dropping by label would also remove non-outlier rows that happen to
    # share an index label with an outlier.
    return df.loc[~is_outlier].copy()