In Kitconc, the ‘Corpus’ class is the main object for creating and manipulating corpora. Schematically, a Corpus object is created as follows:
corpus = Corpus(workspace, corpus_name, language)
To instantiate a Corpus object, you must pass values for the following arguments (workspace, corpus_name and language):
The constructor returns a reference to the corpus. The corpus is only actually processed when the ‘add_texts()’ method is executed.
corpus.add_texts(source_folder,**kwargs)
Arguments:
The Corpus object provides all the methods for processing the texts (wordlist(), keywords(), kwic(), etc.).
See the examples in the following sections for a better understanding.
# Example: download the example data and create the 'ads' corpus.
from kitconc.core import Examples

# Fetch the example files (presumably into 'kitconc-examples/', the folder
# the paths below point to -- TODO confirm).
examples = Examples()
examples.download()

from kitconc.kit_corpus import Corpus

workspace = 'kitconc-examples/workspace'
source_folder = 'kitconc-examples/ads'

# Reference to the corpus.
corpus = Corpus(workspace, 'ads', 'english')

# Add the texts from the source folder.
corpus.add_texts(source_folder, show_progress=True)
# Example: make a word list for the 'ads' corpus and save it to Excel.
from kitconc.kit_corpus import Corpus

# Reference to the corpus.
corpus = Corpus('kitconc-examples/workspace', 'ads', 'english')

# Make the word list.
wordlist = corpus.wordlist(show_progress=True)

# Print the top 10.
top_rows = wordlist.df.head(10)
print(top_rows)

# Save an Excel file. The file name is appended directly to
# corpus.output_path; no path separator is inserted here.
excel_path = corpus.output_path + 'wordlist.xlsx'
wordlist.save_excel(excel_path)
# Example: extract keywords from the 'ads' corpus and save them to Excel.
from kitconc.kit_corpus import Corpus

# Reference to the corpus.
corpus = Corpus('kitconc-examples/workspace', 'ads', 'english')

# keywords() is given the corpus word list, so that is built first.
wordlist = corpus.wordlist(show_progress=True)
keywords = corpus.keywords(wordlist, show_progress=True)

# Print the first 10 rows (assuming `df` is a pandas DataFrame).
top_rows = keywords.df.head(10)
print(top_rows)

# Save an Excel file next to the other corpus output.
excel_path = corpus.output_path + 'keywords.xlsx'
keywords.save_excel(excel_path)
# Example: KWIC lines for the node word 'experience'.
from kitconc.kit_corpus import Corpus

# Reference to the corpus.
corpus = Corpus('kitconc-examples/workspace', 'ads', 'english')

# Search for the node word.
kwic = corpus.kwic('experience', show_progress=True)

# Sort by R1, R2 and R3 (presumably the first three positions to the right
# of the node word -- TODO confirm).
kwic.sort('R1', 'R2', 'R3')

# Print the first 10 rows (assuming `df` is a pandas DataFrame).
top_rows = kwic.df.head(10)
print(top_rows)

# Save an Excel file, passing the same three positions as `highlight`.
excel_path = corpus.output_path + 'kwic.xlsx'
kwic.save_excel(excel_path, highlight='R1 R2 R3')
# Example: concordance lines for the node word 'experience'.
from kitconc.kit_corpus import Corpus

# Reference to the corpus.
corpus = Corpus('kitconc-examples/workspace', 'ads', 'english')

# Search for the node word.
concordances = corpus.concordance('experience', show_progress=True)

# Print the first 10 rows (assuming `df` is a pandas DataFrame).
top_rows = concordances.df.head(10)
print(top_rows)

# Save an Excel file with the `highlight` option set to 'R1 R2 R3'.
excel_path = corpus.output_path + 'concordances.xlsx'
concordances.save_excel(excel_path, highlight='R1 R2 R3')
# Example: collocates of the node word 'experience'.
from kitconc.kit_corpus import Corpus

# Reference to the corpus.
corpus = Corpus('kitconc-examples/workspace', 'ads', 'english')

# Span of 2 on each side of the node. `coll_pos` is a space-separated list
# of tags (they look like Penn Treebank POS tags -- TODO confirm).
collocates = corpus.collocates(
    'experience',
    left_span=2,
    right_span=2,
    coll_pos='IN NN JJ VBN VBD',
    show_progress=True,
)

# Print the first 10 rows (assuming `df` is a pandas DataFrame).
top_rows = collocates.df.head(10)
print(top_rows)

# Save an Excel file next to the other corpus output.
excel_path = corpus.output_path + 'collocates.xlsx'
collocates.save_excel(excel_path)
# Example: clusters of size 3 for the word 'experience'.
from kitconc.kit_corpus import Corpus

# Reference to the corpus.
corpus = Corpus('kitconc-examples/workspace', 'ads', 'english')

# Request clusters with size=3 for the search word.
clusters = corpus.clusters('experience', size=3, show_progress=True)

# Print the first 10 rows (assuming `df` is a pandas DataFrame).
top_rows = clusters.df.head(10)
print(top_rows)

# Save an Excel file next to the other corpus output.
excel_path = corpus.output_path + 'clusters.xlsx'
clusters.save_excel(excel_path)
# Example: n-grams of size 3 restricted by the pattern 'NN IN NN'.
from kitconc.kit_corpus import Corpus

# Reference to the corpus.
corpus = Corpus('kitconc-examples/workspace', 'ads', 'english')

# `pos` holds one tag per n-gram slot here (three tags for size=3); the
# tags look like Penn Treebank POS tags -- TODO confirm.
ngrams = corpus.ngrams(
    size=3,
    pos='NN IN NN',
    show_progress=True,
)

# Print the first 10 rows (assuming `df` is a pandas DataFrame).
top_rows = ngrams.df.head(10)
print(top_rows)

# Save an Excel file next to the other corpus output.
excel_path = corpus.output_path + 'ngrams.xlsx'
ngrams.save_excel(excel_path)
# Example: dispersion of the word 'salary' in the 'ads' corpus.
from kitconc.kit_corpus import Corpus

# Reference to the corpus.
corpus = Corpus('kitconc-examples/workspace', 'ads', 'english')

# Compute the dispersion for the search word.
dispersion = corpus.dispersion('salary')

# Print the first 10 rows (assuming `df` is a pandas DataFrame).
top_rows = dispersion.df.head(10)
print(top_rows)

# Save an Excel file next to the other corpus output.
excel_path = corpus.output_path + 'dispersion.xlsx'
dispersion.save_excel(excel_path)
# Example: dispersion of the corpus keywords.
from kitconc.kit_corpus import Corpus

# Reference to the corpus.
corpus = Corpus('kitconc-examples/workspace', 'ads', 'english')

# Each step is given the result of the previous one:
# word list -> keywords -> keywords dispersion.
wordlist = corpus.wordlist(show_progress=True)
keywords = corpus.keywords(wordlist, show_progress=True)
keywords_dispersion = corpus.keywords_dispersion(
    keywords,
    show_progress=True,
)

# Print the first 10 rows (assuming `df` is a pandas DataFrame).
top_rows = keywords_dispersion.df.head(10)
print(top_rows)

# Save an Excel file next to the other corpus output.
excel_path = corpus.output_path + 'keywords_dispersion.xlsx'
keywords_dispersion.save_excel(excel_path)
# Example: collocations built from the KWIC lines of 'skills'.
from kitconc.kit_corpus import Corpus

# Reference to the corpus.
corpus = Corpus('kitconc-examples/workspace', 'ads', 'english')

# collocations() is given a KWIC result, so the KWIC search runs first.
kwic = corpus.kwic('skills', show_progress=True)
collocations = corpus.collocations(kwic, show_progress=True)

# Print the first 10 rows (assuming `df` is a pandas DataFrame).
top_rows = collocations.df.head(10)
print(top_rows)

# Save an Excel file next to the other corpus output.
excel_path = corpus.output_path + 'collocations.xlsx'
collocations.save_excel(excel_path)

# Plot a collocate distribution for the collocate 'strong'.
collocations.plot_colldist('strong')
# Example: collocates of 'skills' and a plot of them.
from kitconc.kit_corpus import Corpus

# Reference to the corpus.
corpus = Corpus('kitconc-examples/workspace', 'ads', 'english')

# Span of 3 on each side of the node. `coll_pos` is a space-separated list
# of tags (they look like Penn Treebank POS tags -- TODO confirm).
collocates = corpus.collocates(
    'skills',
    left_span=3,
    right_span=3,
    coll_pos='NN JJ',
    show_progress=True,
)

# Print the first 10 rows (assuming `df` is a pandas DataFrame).
top_rows = collocates.df.head(10)
print(top_rows)

# Save an Excel file next to the other corpus output.
excel_path = corpus.output_path + 'collocates.xlsx'
collocates.save_excel(excel_path)

# Plot the collocates, passing the search word as the `node` argument.
collocates.plot_collgraph(node='skills')