any2txt|div(20000)|tagger({"lang": "polish"})|
fextor2(
{"features": "base",
"lang": "ud",
"filters": {
"base":[{"type": "pos_stoplist", "args": {"stoplist": ["NOUN"],"excluding": false}}]
}
})|
dir|feature2({
"filter": {
"base": {
"min_df": 2,
"max_df": 1,
"keep_n": 1000
}
}
})|
topic3({"no_topics": 20, "no_passes": 30, "method": "artm_bigartm"})
Plik JSON z macierzą opisującą częstotliwość występowania poszczególnych słów (wynik komendy feature2
w powyższym zapytaniu):
topic({
"alpha": float (default: 0.1)
"beta": float (default: 0.01)
"distance_map: bool (default: True)
"method": str (opt: [artm_bigartm, lda_mallet], default: artm_bigartm)
"no_topics": int (default: 20)
"no_passes": int (default: 100)
"model": str (default: "")
"topic_reduction": str (default: "tsne")
"doc_reduction": str (default: "tsne")
"topic_clustering": str (default: "ac")
"doc_clustering": str (default: "ac")
"topic_affinity": str (default: "correlation")
"doc_affinity": str (default: "correlation")
"topic_no_clusters": int (default: 5)
"doc_no_clusters": int (default 5)
})
import json
import re

from lpmn_client import Task, download_file, upload_file
# LPMN pipeline: convert input to text, split long documents, tag Polish text,
# extract base features (keeping only NOUNs), build the document-term matrix,
# and run BigARTM topic modelling with 20 topics / 30 passes.
#
# Command arguments are kept as Python dicts and serialized with compact
# separators, which yields a whitespace-free LPMN string directly.  This avoids
# the previous `re.sub(r"\s*", "", lpmn)` post-processing step, which stripped
# *all* whitespace and would have corrupted any quoted value containing a space.

fextor2_args = {
    "features": "base",
    "lang": "ud",
    "filters": {
        "base": [{
            "type": "pos_stoplist",
            # excluding=False means the stoplist acts as a *keep* list:
            # only NOUN tokens survive filtering.
            "args": {"stoplist": ["NOUN"], "excluding": False},
        }],
    },
}

feature2_args = {
    "filter": {
        # min_df/max_df bound document frequency; keep_n caps vocabulary size.
        "base": {"min_df": 2, "max_df": 1, "keep_n": 1000},
    },
}

topic3_args = {"no_topics": 20, "no_passes": 30, "method": "artm_bigartm"}


def _compact_json(obj):
    """Serialize *obj* as minified JSON (no whitespace) for embedding in LPMN."""
    return json.dumps(obj, separators=(",", ":"))


pipeline = [
    "any2txt",
    "div(20000)",
    f'tagger({_compact_json({"lang": "polish"})})',
    f"fextor2({_compact_json(fextor2_args)})",
    "dir",
    f"feature2({_compact_json(feature2_args)})",
    f"topic3({_compact_json(topic3_args)})",
]

lpmn = "|".join(pipeline)

task = Task(lpmn=lpmn)
task.email = "example@mail.com"  # change e-mail
file_id = upload_file("./test.zip")  # zip file with some documents (for example docx files)
output_file_id = task.run(file_id)
download_file(output_file_id, "./out")