* meta.jsonl - JSON lines file containing metadata related to the books
* full_texts.zip - zipped folder containing 7668 extracted full texts of the books in separate .txt files.
* morphological_data.zip - zipped folder containing 7667 JSON lines files. Each file is split into n rows, where n = number of pages extracted from the book, and each row contains morphological information for the specific page of the book.

LINKING THE DATA:

Each entry of meta.jsonl contains a field "id", which corresponds to one file in /full_texts and one file in /morphological_data.
For example:

id = "b2928787x" (retrieved from a row in meta.jsonl)
corresponding file in "/full_texts" = "b2928787x.txt"
corresponding file in "/morphological_data" = "b2928787x.jsonl"


########################## META ##########################

file name: "meta.jsonl"
contains: 7668 rows x 23 columns

* description: short description of the field
* MARC ID: corresponding MARC field's ID
* N/A identifier: notation used for denoting empty fields
* field type: the data type used for storing the content of the field
* example: an example of the field's content

fields:

	author.name:

		description: name of the author
		MARC ID: 100_a
		N/A identifier: "N/A"
		field type: string
		example: "Marja Kallasmaa"

	author.year_of_birth:

		description: author's year of birth
		MARC ID: 100_d
		N/A identifier: "N/A"
		field type: string
		example: "1912"


	author.year_of_death:

		description: author's year of death
		MARC ID: 100_d
		N/A identifier: "N/A"
		field type: string
		example: "1997"


	digar_id:

		description: document ID in DIGAR (https://www.digar.ee/)
		MARC ID: -
		N/A identifier: "N/A"
		field type: string
		example: "171981"


	digar_url:

		description: URL corresponding to the document in DIGAR (https://www.digar.ee/)
		MARC ID: -
		N/A identifier: "N/A"
		field type: string
		example: "https://www.digar.ee/arhiiv/nlib-digar:171981"


	id:
		description: unique identifier of the book
		MARC ID: -
		N/A identifier: "N/A"
		field type: string
		example: "b2928787x"

	file_name:

		description: name of the file containing the full text of the document
		MARC ID: -
		N/A identifier: "N/A"
		field type: string
		example: "b2928787x.txt"


	keywords.ems_subject_field:

		description: EMS subject fields corresponding to the document
		MARC ID: -
		N/A identifier: []
		field type: list
		example: ['TÖÖTINGIMUSED. TÖÖHÕIVE. AMETID', 'MAJANDUS. MAJANDUSTEADUS. RAHANDUS. KAUBANDUS']


	keywords.genre:

		description: genre and/or form keywords
		MARC ID: 655
		N/A identifier: []
		field type: list
		example: ['teatmikud', 'ametlikud väljaanded']


	keywords.time:

		description: time keywords
		MARC ID: 653
		N/A identifier: []
		field type: list
		example: ['19. saj']

	keywords.topic:

		description: topic keywords
		MARC ID: 650
		N/A identifier: []
		field type: list
		example: ['struktuurifondid', 'tööhõivepoliitika', 'tööjõud', 'sotsiaalne areng']


	keywords.location:

		description: location keywords
		MARC ID: 651
		N/A identifier: []
		field type: list
		example: ['Eesti', 'Võrumaa']


	keywords.organisation:

		description: organisation keywords
		MARC ID: 610
		N/A identifier: []
		field type: list
		example: ['Euroopa Liit', 'Euroopa Sotsiaalfond']


	keywords.person:

		description: person keywords
		MARC ID: 600
		N/A identifier: []
		field type: list
		example: ['Heiki Kelp']


	keywords.temp_organisation_event:

		description: temporary collective (organisation) or event keywords
		MARC ID: 611
		N/A identifier: []
		field type: list
		example: ['Euroopa Liiduga liitumise mõju Eesti majanduspoliitikale, teadus- ja koolituskonverents']


	langs.present:

		description: language(s) in the document as ISO 639-2 codes (whitespace separated)
		MARC ID: 041_h
		N/A identifier:
		field type: string
		example: "est"


	langs.source:

		description: original language(s) of the book as ISO 639-2 codes (whitespace separated)
		MARC ID: 041_a
		N/A identifier: "N/A"
		field type: string
		example: "eng"


	langs.summary:

		description: language(s) of the summary/summaries as ISO 639-2 codes (whitespace separated)
		MARC ID: 041_b
		N/A identifier: "N/A"
		field type: string
		example: "eng"


	n_pages:

		description: number of pages in the book
		MARC ID: 300_a
		N/A identifier: "N/A"
		field type: string
		example: "8"


	publication_place:

		description:  place of publication
		MARC ID: 260_a
		N/A identifier: "N/A"
		field type: string
		example: "Luxembourg"


	publisher:

		description: publisher of the book
		MARC ID: 260_b
		N/A identifier: "N/A"
		field type: string
		example: "Euroopa Liidu Väljaannete Talitus"

	title:

		description: title of the book
		MARC ID: 245
		N/A identifier: "N/A"
		field type: string
		example: "Euroopa Sotsiaalfond : investeerides inimestesse : mis see on ja millega tegeleb?"


	year_published:

		description: year the specific edition of the book was published
		MARC ID: 260_c
		N/A identifier: "N/A"
		field type: string
		example: "2012"


############## MORPHOLOGICAL INFORMATION #################

file_name: "morphological_data.zip"
contains: 7667 JSON lines files (each file contains n rows x 11 columns, where n = the number of extracted pages)

fields:

	id:

		description: unique identifier of the source book, which can be used for linking the data with both meta and full text
		N/A identifier: "N/A"
		field type: string
		example: "b25045507"


	text:

		description: raw text extracted from the page
		N/A identifier: ""
		field type: string


	text.endings:

		description: word endings extracted with EstNLTK (only for texts in Estonian)
		N/A identifier: ""
		field type: string


	text.forms:

		description: word forms extracted with EstNLTK (only for texts in Estonian)
		N/A identifier: ""
		field type: string
		example: ""


	text.lang:

		description: language of the text on the page as an ISO 639-1 code (NB! only three languages are supported: ["et", "en", "ru"], so the results are not 100% accurate).
		N/A identifier: ""
		field type: string


	text.lemmas:

		description: lemmas extracted with EstNLTK or spaCy
		N/A identifier: ""
		field type: string


	text.parsing_status:

		description: indicator showing whether the text extraction was successful or not.
			possible_values:
				ok: text extraction was successful
				empty/missing: no text was extracted
				gibberish: the extracted text consists of meaningless symbol sequences
		N/A identifier: "missing"
		field type: string
		example: "ok"

	text.postags:

		description: list of part-of-speech tags extracted with EstNLTK (only for texts in Estonian)
		N/A identifier: ""
		field type: string


	text.tokens:

		description: list of word tokens extracted with EstNLTK or spaCy
		N/A identifier: ""
		field type: string


	pages.current:

		description: number of the current page
		N/A identifier: "N/A"
		field type: string
		example: "8"


	pages.total:

		description: total number of extracted pages
		N/A identifier: "N/A"
		field type: string
		example: "53"


####################### FULL TEXTS #######################

file_name: "full_texts.zip"
contains: 7668 .txt files